## Aprendizagem Automática - Projeto

In [7]:
# Imports
import pandas as pd

# Models and selection methods
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
# Linear regression metrics
from sklearn.metrics import explained_variance_score, mean_squared_error, max_error, mean_absolute_error
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.preprocessing import LabelBinarizer

In [8]:
# Funcoes auxiliares

def printRegStatistics(truth, preds):
    print("The RVE is: ", explained_variance_score(truth, preds))
    print("The rmse is: ", mean_squared_error(truth, preds, squared=False))
    corr, pval = pearsonr(truth, preds)
    print("The Correlation Score is is: %6.4f (p-value=%e)\n"%(corr,pval))
    print("The Maximum Error is is: ", max_error(truth, preds))
    print("The Mean Absolute Error is: ", mean_absolute_error(truth, preds))
    

#Devolve uma lista com quantos elementos há seguidos entre valores de NaN. p.e: [1,nan,2,3,nan,nan,2,nan,2,3,4] -> [1,2,1,3]
def count_not_nan(serie):
    conta=0
    lista_seguida=[]
    for i in range(len(serie)):
        if serie[i]==1:
            conta+=1
        else:
            if conta!=0:
                lista_seguida.append(conta)
                conta=0
            else:
                conta=0
                
        if i==len(serie)-1 and conta!=0:
            lista_seguida.append(conta)
    return lista_seguida

#Função que lida com os nan nas df. Se o país não tiver nenhum dado -> drop. Se o país não tiver pelo menos lag+1 dados seguidos
#-> drop. O resto fica

def drop_nan(df,lag):
    pais_nan=[]
    for pais in df["Country Name"].index:
        nan_val=list(pd.DataFrame(df.iloc[pais][4:]).count(axis=1)) #ver se os valores são nan
        if sum(nan_val)==0: #se for tudo nan
            pais_nan.append(pais)
        elif max(count_not_nan(nan_val))<=lag: #Se não houver mais do que lag valores seguidos, drop
            pais_nan.append(pais)

    df=df.drop(pais_nan, axis=0)
    df = df.reset_index(drop=True)
    return df

#Função que escolha o dataframe até um certo ano apenas, que remove os países que contenham nan nesse intervalo de tempo

def indexing(dataf, ano_i):
    X_list=dataf.drop(columns=dataf.iloc[:,[1,2,3]]+dataf.loc[:,[str(i) for i in range(1960,ano_i)]])
    X_list=X_list.T
    X_list=X_list.rename(columns=X_list.iloc[0]).drop(X_list.index[0])
    return X_list

#Cria um DataFrame com os dados de um país e respetivos lags

def lags(df, lag, país):
    new_df=pd.DataFrame()
    new_df["dados"]=df[país]
    colunas = [str(i) + "lag" for i in range(1,lag+1)]

    for i,k in enumerate(colunas):
        new_df[k] = new_df["dados"].shift(+i+1)
    new_df=new_df.dropna()
    return new_df

#Função que cria o dataset necessário para as timeseries. As time series são todas unidas numa só coluna e os países respetivos
# a essa série temporal recebem um 1. Também concatena os lags no início da df.

def time_series(df, lag):
    binario = LabelBinarizer().fit(df.columns)
    df_final= pd.DataFrame()
    for pais in df.columns:
        pais_bin=pd.DataFrame(data=list(binario.transform([pais]))*len(df[pais]),
                             columns=binario.classes_,
                             index=df.index)
        #Agora juntamos os lags e os dados respetivos
        df_mid=pd.DataFrame()
        df_mid=pd.concat((df_mid,lags(df, lag, pais),pais_bin),axis=1).dropna()

        #Finalmente, concatenamos tudo numa só DataFrame
        df_final = pd.concat((df_final, df_mid), sort=True)
        df_fina=df_final.dropna()
    return df_final

# Função para criar um dataframe X para um dado dataset, pais e ano, este dataframe depois eh utilizado para fazer predict
def createX(dataset : pd.DataFrame, country : str, year : int, lag) -> pd.DataFrame:
    previousYearCountryData = dataset.loc[(dataset[country] == 1) & (dataset.index.get_level_values(0) == str(year - 1))]
    newX = previousYearCountryData.copy()
    newX.index = [str(year)]
    for i in range(lag,2,-1):
        newX.at[str(year), str(i)+"lag"] = newX.at[str(year),str(i-1)+"lag"]
    newX.at[str(year), "1lag"] = newX.at[str(year),"dados"]
    newX.drop(["dados"], axis=1, inplace=True)
    return newX

# Função para prever dados novos para um dado dataset ano e pais a partir do modelo treinado dado
def predictAndCalculateDelta(model, dataset : pd.DataFrame, country : str, year : int, lag):
    previousYearCountryData = dataset.loc[(dataset[country] == 1) & (dataset.index.get_level_values(0) == str(year - 1))]
    newX = createX(dataset, country, year, lag)
    x = model.predict(newX)
    x_previous = previousYearCountryData.at[str(year-1),"dados"]
    delta = x - x_previous
    return x[0], delta[0]

# Vai buscar o valor para um dado ano e pais e calcula o delta
def determineRealResults(dataset : pd.DataFrame, country : str, year : int):
    real_data = dataset.at[str(year), country]
    previous_year_data = dataset.at[str(year-1), country]
    delta = real_data - previous_year_data
    return real_data, delta

# Erro relativo e absoluto
def calculateErrors(real, pred):
    abs_error = abs(pred - real)
    relative_error = abs_error / abs(real)
    return abs_error, (relative_error*100)

# Criar dataframe com os resultados finais
def DFCreate(PrevisionData, year):
    # Criar dataframe
    df = pd.DataFrame(PrevisionData, columns=['Country', str(year) + ' Real Value', str(year) + ' Prevision','Relative Error %',str(year) + ' Real Delta',str(year) + ' Delta Prevision','Relative Error %'])
    # Converter valores em notacao legivel
    df = df.loc[:, df.columns == 'Country'].join(df.loc[:, df.columns != 'Country'].applymap(lambda x: '{:,.3f}'.format(x)))
    return df

Importação dos datasets

In [9]:
# Dados 1960-2016
fertility = pd.read_csv("fertility_rate.csv")
population= pd.read_csv("country_population.csv")
expectancy = pd.read_csv("life_expectancy.csv")

fertility=drop_nan(fertility,4)
population=drop_nan(population,4)
expectancy=drop_nan(expectancy,4)

# Dados 1960-2020
fertility_2020 = pd.read_csv("fertility_2020.csv")
population_2020 = pd.read_csv("population_2020.csv")
expectancy_2020 = pd.read_csv("life_2020.csv")

#Criar os data frames
fert=indexing(fertility, 1960)
pop=indexing(population, 1960)
exp=indexing(expectancy, 1960)

fert_2020 = indexing(fertility_2020, 1960)
pop_2020 = indexing(population_2020, 1960)
exp_2020 = indexing(expectancy_2020, 1960)

Vamos continuar a transformar o nosso dataset, de forma a que fiquem todas as séries temporais numa só coluna, a referência respetiva de que país se refere a série temporal e os respetivos lags para cada valor. 

In [10]:
#Versão final dos datasets
pop_fin=time_series(pop,4)
fert_fin=time_series(fert,4)
exp_fin=time_series(exp,4)

# Divisao em X_train, X_test e y_train, y_test
y_pop = pop_fin["dados"]
X_pop = pop_fin[list(pop_fin.columns)[:-1]]
X_pop_train, X_pop_test, y_pop_train, y_pop_test = train_test_split(X_pop, y_pop, test_size = 0.3, random_state=314)

y_fert = fert_fin["dados"]
X_fert = fert_fin[list(fert_fin.columns)[:-1]]
X_fert_train, X_fert_test, y_fert_train, y_fert_test = train_test_split(X_fert, y_fert, test_size = 0.3, random_state=314)

y_exp = exp_fin["dados"]
X_exp = exp_fin[list(exp_fin.columns)[:-1]]
X_exp_train, X_exp_test, y_exp_train, y_exp_test = train_test_split(X_exp, y_exp, test_size = 0.3, random_state=314)

Com isto feito, podemos passar ao treino dos modelos

## Escolha dos parametros e treino de dos modelos

### Populacao

Treino de 2 modelos, LinearRegression e DecisionTreeRegressor

In [11]:
# LinearRegression
params_linear = [
    {"alpha" : [x*0.1 for x in range(1,10)]}]

grid_search_lasso_pop = GridSearchCV(
    Lasso(), params_linear, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)

grid_search_lasso_pop.fit(X_pop_train, y_pop_train)
print(grid_search_lasso_pop.best_params_)
print("Melhores Parâmetros:", grid_search_lasso_pop.best_params_,"\n")
preds= grid_search_lasso_pop.predict(X_pop_test)
printRegStatistics(y_pop_test, preds)

{'alpha': 0.9}
Melhores Parâmetros: {'alpha': 0.9} 

The RVE is:  0.9999941495255327
The rmse is:  1630219.0785450833
The Correlation Score is is: 1.0000 (p-value=0.000000e+00)

The Maximum Error is is:  12067018.727499008
The Mean Absolute Error is:  537591.1326022077


In [12]:
# DecisionTreeRegressor
params =[{"max_depth" : [4,8,12,15,20]}]

grid_search_dtr_pop = GridSearchCV(
    DecisionTreeRegressor(), params, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)


grid_search_dtr_pop.fit(X_pop_train, y_pop_train)
print(grid_search_dtr_pop.best_params_)
print("Melhores Parâmetros:", grid_search_dtr_pop.best_params_,"\n")
preds= grid_search_dtr_pop.predict(X_pop_test)
printRegStatistics(y_pop_test, preds)

{'max_depth': 12}
Melhores Parâmetros: {'max_depth': 12} 

The RVE is:  0.9998922203894796
The rmse is:  6998365.024948636
The Correlation Score is is: 0.9999 (p-value=0.000000e+00)

The Maximum Error is is:  152128957.0
The Mean Absolute Error is:  1406044.8957204218


In [13]:
# Melhor modelo eh o DecisionTreeRegressor
pop_model = grid_search_dtr_pop

### Fertility

Treino de 2 modelos, LinearRegression e DecisionTreeRegressor

In [14]:
# LinearRegression
params_linear = [
    {"alpha" : [x*0.1 for x in range(1,10)]}]

grid_search_lasso_fert = GridSearchCV(
    Lasso(), params_linear, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)

grid_search_lasso_fert.fit(X_fert_train, y_fert_train)
print(grid_search_lasso_fert.best_params_)
print("Melhores Parâmetros:", grid_search_lasso_fert.best_params_,"\n")
preds= grid_search_lasso_fert.predict(X_fert_test)
printRegStatistics(y_fert_test, preds)

{'alpha': 0.1}
Melhores Parâmetros: {'alpha': 0.1} 

The RVE is:  0.9977762259078578
The rmse is:  0.09277534454384653
The Correlation Score is is: 0.9993 (p-value=0.000000e+00)

The Maximum Error is is:  1.7453203082377713
The Mean Absolute Error is:  0.06721876638522725


In [15]:
# DecisionTreeRegressor
params =[{"max_depth" : [4,8,12,15,20]}]

grid_search_dtr_fert= GridSearchCV(
    DecisionTreeRegressor(), params, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)


grid_search_dtr_fert.fit(X_fert_train, y_fert_train)
print(grid_search_dtr_fert.best_params_)
print("Melhores Parâmetros:", grid_search_dtr_fert.best_params_,"\n")
preds= grid_search_dtr_fert.predict(X_fert_test)
printRegStatistics(y_fert_test, preds)

{'max_depth': 12}
Melhores Parâmetros: {'max_depth': 12} 

The RVE is:  0.9989573889047227
The rmse is:  0.06351256261023491
The Correlation Score is is: 0.9995 (p-value=0.000000e+00)

The Maximum Error is is:  1.7751781798461548
The Mean Absolute Error is:  0.0339707249485184


In [16]:
# Melhor modelo eh o DecisionTreeRegressor
fertility_model = grid_search_dtr_fert

### Life Expectancy

Treino de 2 modelos, LinearRegression e DecisionTreeRegressor

In [17]:
# LinearRegression
params_linear = [
    {"alpha" : [x*0.01 for x in range(1,20)]}]

grid_search_lasso_exp = GridSearchCV(
    Lasso(), params_linear, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)

grid_search_lasso_exp.fit(X_exp_train, y_exp_train)
print(grid_search_lasso_exp.best_params_)
print("Melhores Parâmetros:", grid_search_lasso_exp.best_params_,"\n")
preds= grid_search_lasso_exp.predict(X_exp_test)
printRegStatistics(y_exp_test, preds)

{'alpha': 0.01}
Melhores Parâmetros: {'alpha': 0.01} 

The RVE is:  0.9994918033284109
The rmse is:  0.23945833190789442
The Correlation Score is is: 0.9997 (p-value=0.000000e+00)

The Maximum Error is is:  3.2080776223917944
The Mean Absolute Error is:  0.11702207101954783


In [18]:
# DecisionTreeRegressor
params =[{"max_depth" : [4,8,12,15,20,25,30]}]

grid_search_dtr_exp = GridSearchCV(
    DecisionTreeRegressor(), params, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)


grid_search_dtr_exp.fit(X_exp_train, y_exp_train)
print(grid_search_dtr_exp.best_params_)
print("Melhores Parâmetros:", grid_search_dtr_exp.best_params_,"\n")
preds= grid_search_dtr_exp.predict(X_exp_test)
printRegStatistics(y_exp_test, preds)

{'max_depth': 30}
Melhores Parâmetros: {'max_depth': 30} 

The RVE is:  0.9991075770504605
The rmse is:  0.3172728006700416
The Correlation Score is is: 0.9996 (p-value=0.000000e+00)

The Maximum Error is is:  4.792000000000002
The Mean Absolute Error is:  0.1625526745023574


In [19]:
# Melhor modelo eh o DecisionTreeRegressor
expectancy_model = grid_search_dtr_exp

## Previsao de Valores de 2017

Escolher a sample de 10 países aleatórios

In [20]:
sample = list(fertility["Country Name"].sample(n=10, random_state=483))
print(sample)

['Lesotho', 'North America', 'New Zealand', 'Singapore', 'Europe & Central Asia', 'Qatar', 'Uzbekistan', 'Estonia', 'Angola', 'Japan']


### Populacao

In [21]:
PrevisionDataPop = []

for country in sample:
    x_prevision, delta_prevision = predictAndCalculateDelta(pop_model, pop_fin, country, 2017, lag=4)
    x_real, delta_real = determineRealResults(pop_2020, country, 2017)
    _, x_rel_error = calculateErrors(x_real, x_prevision)
    _, delta_rel_error = calculateErrors(delta_real, delta_prevision)
    data_entry =  (country, x_real, x_prevision, x_rel_error, delta_real, delta_prevision, delta_rel_error)
    PrevisionDataPop.append(data_entry)

# Criar dataframe
PopDF = DFCreate(PrevisionDataPop, 2017)
PopDF

Unnamed: 0,Country,2017 Real Value,2017 Prevision,Relative Error %,2017 Real Delta,2017 Delta Prevision,Relative Error %.1
0,Lesotho,2170617.0,2288041.866,5.41,26745.0,84220.866,214.903
1,North America,361731237.0,368542943.0,1.883,2485441.0,9085450.0,265.547
2,New Zealand,4813600.0,4750805.13,1.305,99500.0,57605.13,42.105
3,Singapore,5612253.0,5756850.87,2.576,4970.0,149567.87,2909.414
4,Europe & Central Asia,915855416.0,911847278.0,0.438,3480711.0,0.0,100.0
5,Qatar,2711755.0,2607024.527,3.862,116589.0,37220.527,68.075
6,Uzbekistan,32388600.0,32291283.2,0.3,540700.0,443383.2,17.998
7,Estonia,1317384.0,1334976.531,1.335,1594.0,19186.531,1103.672
8,Angola,30208628.0,29404901.333,2.661,1053882.0,591438.333,43.88
9,Japan,126972000.0,127889058.824,0.722,-104000.0,894547.824,960.142


### Fertility Rate

In [22]:
PrevisionDataFert = []

for country in sample:
    x_prevision, delta_prevision = predictAndCalculateDelta(fertility_model, fert_fin, country, 2017, lag=4)
    x_real, delta_real = determineRealResults(fert_2020, country, 2017)
    _, x_rel_error = calculateErrors(x_real, x_prevision)
    _, delta_rel_error = calculateErrors(delta_real, delta_prevision)
    data_entry =  (country, x_real, x_prevision, x_rel_error, delta_real, delta_prevision, delta_rel_error)
    PrevisionDataFert.append(data_entry)

# Criar dataframe
FerDF = DFCreate(PrevisionDataFert, 2017)
FerDF

Unnamed: 0,Country,2017 Real Value,2017 Prevision,Relative Error %,2017 Real Delta,2017 Delta Prevision,Relative Error %.1
0,Lesotho,3.187,3.075,3.528,-0.052,-0.019,62.598
1,North America,1.743,1.764,1.204,-0.055,-0.016,70.75
2,New Zealand,1.81,1.837,1.479,-0.06,-0.033,44.619
3,Singapore,1.16,1.27,9.483,-0.04,0.07,275.0
4,Europe & Central Asia,1.735,1.734,0.068,-0.047,-0.016,64.814
5,Qatar,1.839,1.885,2.492,0.012,-0.022,284.818
6,Uzbekistan,2.419,2.434,0.616,-0.036,-0.021,41.395
7,Estonia,1.59,1.595,0.304,-0.01,0.015,248.264
8,Angola,5.6,5.614,0.25,-0.086,-0.08,6.977
9,Japan,1.43,1.446,1.109,-0.01,0.006,158.567


### Life Expectancy

In [23]:
PrevisionDataExp = []

for country in sample:
    x_prevision, delta_prevision = predictAndCalculateDelta(expectancy_model, exp_fin, country, 2017, lag=4)
    x_real, delta_real = determineRealResults(exp_2020, country, 2017)
    _, x_rel_error = calculateErrors(x_real, x_prevision)
    _, delta_rel_error = calculateErrors(delta_real, delta_prevision)
    data_entry =  (country, x_real, x_prevision, x_rel_error, delta_real, delta_prevision, delta_rel_error)
    PrevisionDataExp.append(data_entry)

# Criar dataframe
ExpDF = DFCreate(PrevisionDataExp, 2017)
ExpDF

Unnamed: 0,Country,2017 Real Value,2017 Prevision,Relative Error %,2017 Real Delta,2017 Delta Prevision,Relative Error %.1
0,Lesotho,53.064,54.709,3.1,0.8,0.535,33.125
1,North America,78.879,79.055,0.223,0.002,0.0,100.0
2,New Zealand,81.659,82.037,0.463,0.046,0.424,820.106
3,Singapore,83.095,82.978,0.141,0.249,0.183,26.471
4,Europe & Central Asia,77.779,77.242,0.689,0.213,-0.04,118.582
5,Qatar,80.717,78.48,2.771,0.284,0.296,4.397
6,Uzbekistan,71.01,71.594,0.823,0.255,0.28,9.912
7,Estonia,78.093,77.878,0.275,0.451,0.141,68.649
8,Angola,61.68,62.007,0.53,0.588,0.46,21.769
9,Japan,84.1,83.985,0.137,0.115,0.0,100.0
