## Aprendizagem Automática - Projeto

In [75]:
# Imports
import pandas as pd
import math

# Models and selection methods
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
# Linear regression metrics
from sklearn.metrics import explained_variance_score, mean_squared_error, max_error, mean_absolute_error
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelBinarizer

In [76]:
# Funcoes auxiliares

def printRegStatistics(truth, preds):
    print("The RVE is: ", explained_variance_score(truth, preds))
    print("The rmse is: ", math.sqrt(mean_squared_error(truth, preds, squared=False)))
    corr, pval = pearsonr(truth, preds)
    print("The Correlation Score is is: %6.4f (p-value=%e)\n"%(corr,pval))
    print("The Maximum Error is is: ", max_error(truth, preds))
    print("The Mean Absolute Error is: ", mean_absolute_error(truth, preds))
    

#Devolve uma lista com quantos elementos há seguidos entre valores de NaN. p.e: [1,nan,2,3,nan,nan,2,nan,2,3,4] -> [1,2,1,3]
def count_not_nan(serie):
    conta=0
    lista_seguida=[]
    for i in range(len(serie)):
        if serie[i]==1:
            conta+=1
        else:
            if conta!=0:
                lista_seguida.append(conta)
                conta=0
            else:
                conta=0
                
        if i==len(serie)-1 and conta!=0:
            lista_seguida.append(conta)
    return lista_seguida

#Função que lida com os nan nas df. Se o país não tiver nenhum dado -> drop. Se o país não tiver pelo menos lag+1 dados seguidos
#-> drop. O resto fica

def drop_nan(df,lag):
    pais_nan=[]
    for pais in df["Country Name"].index:
        nan_val=list(pd.DataFrame(df.iloc[pais][4:]).count(axis=1)) #ver se os valores são nan
        if sum(nan_val)==0: #se for tudo nan
            pais_nan.append(pais)
        elif max(count_not_nan(nan_val))<=lag: #Se não houver mais do que lag valores seguidos, drop
            pais_nan.append(pais)

    df=df.drop(pais_nan, axis=0)
    df = df.reset_index(drop=True)
    return df

#Função que escolha o dataframe até um certo ano apenas, que remove os países que contenham nan nesse intervalo de tempo

def indexing(dataf, ano_i):
    X_list=dataf.drop(columns=dataf.iloc[:,[1,2,3]]+dataf.loc[:,[str(i) for i in range(1960,ano_i)]])
    X_list=X_list.T
    X_list=X_list.rename(columns=X_list.iloc[0]).drop(X_list.index[0])
    return X_list

#Cria um DataFrame com os dados de um país e respetivos lags

def lags(df, lag, país):
    new_df=pd.DataFrame()
    new_df["dados"]=df[país]
    colunas = [str(i) + "lag" for i in range(1,lag+1)]

    for i,k in enumerate(colunas):
        new_df[k] = new_df["dados"].shift(+i+1)
    new_df=new_df.dropna()
    return new_df

#Função que cria o dataset necessário para as timeseries. As time series são todas unidas numa só coluna e os países respetivos
# a essa série temporal recebem um 1. Também concatena os lags no início da df.

def time_series(df, lag):
    binario = LabelBinarizer().fit(df.columns)
    df_final= pd.DataFrame()
    for pais in df.columns:
        pais_bin=pd.DataFrame(data=list(binario.transform([pais]))*len(df[pais]),
                             columns=binario.classes_,
                             index=df.index)
        #Agora juntamos os lags e os dados respetivos
        df_mid=pd.DataFrame()
        df_mid=pd.concat((df_mid,lags(df, lag, pais),pais_bin),axis=1).dropna()

        #Finalmente, concatenamos tudo numa só DataFrame
        df_final = pd.concat((df_final, df_mid), sort=True)
        df_fina=df_final.dropna()
    return df_final

# Função para criar um dataframe X para um dado dataset, pais e ano, este dataframe depois eh utilizado para fazer predict
def createX(dataset : pd.DataFrame, country : str, year : int, lag) -> pd.DataFrame:
    previousYearCountryData = dataset.loc[(dataset[country] == 1) & (dataset.index.get_level_values(0) == str(year - 1))]
    newX = previousYearCountryData.copy()
    newX.index = [str(year)]
    for i in range(lag,2,-1):
        newX.at[str(year), str(i)+"lag"] = newX.at[str(year),str(i-1)+"lag"]
    newX.at[str(year), "1lag"] = newX.at[str(year),"dados"]
    newX.drop(["dados"], axis=1, inplace=True)
    return newX

# Função para prever dados novos para um dado dataset ano e pais a partir do modelo treinado dado
def predictAndCalculateDelta(model, dataset : pd.DataFrame, country : str, year : int, lag):
    previousYearCountryData = dataset.loc[(dataset[country] == 1) & (dataset.index.get_level_values(0) == str(year - 1))]
    newX = createX(dataset, country, year, lag)
    x = model.predict(newX)
    x_previous = previousYearCountryData.at[str(year-1),"dados"]
    delta = x - x_previous
    return x[0], delta[0]

# Vai buscar o valor para um dado ano e pais e calcula o delta
def determineRealResults(dataset : pd.DataFrame, country : str, year : int):
    real_data = dataset.at[str(year), country]
    previous_year_data = dataset.at[str(year-1), country]
    delta = real_data - previous_year_data
    return real_data, delta

# Erro relativo e absoluto
def calculateErrors(real, pred):
    abs_error = abs(pred - real)
    relative_error = abs_error / abs(real)
    return abs_error, (relative_error*100)

#Cria os dataframes para o treino e teste

def set_train_test(df,lag): 
    train_set=pd.DataFrame()
    test_set=pd.DataFrame()
    paises=list(df.columns[lag:-1])
    
    for pais in paises:
        dados_pais=df[df[str(pais)]==1]
        pais_train= dados_pais[:int(0.7*(len(dados_pais.index)))] #Training set com 70% dos dados
        pais_test=dados_pais[int(0.7*(len(dados_pais.index))):] #Test de 30%
        
        train_set=pd.concat((train_set,pais_train))
        test_set=pd.concat((test_set, pais_test))
   
    return train_set, test_set

# Criar dataframe com os resultados finais
def DFCreate(PrevisionData, year):
    # Criar dataframe
    df = pd.DataFrame(PrevisionData, columns=['Country', str(year) + ' Real Value', str(year) + ' Prevision','Relative Error %',str(year) + ' Real Delta',str(year) + ' Delta Prevision','Relative Error %'])
    # Converter valores em notacao legivel
    df = df.loc[:, df.columns == 'Country'].join(df.loc[:, df.columns != 'Country'].applymap(lambda x: '{:,.3f}'.format(x)))
    return df

Importação dos datasets

In [36]:
# Dados 1960-2016
fertility = pd.read_csv("fertility_rate.csv")
population= pd.read_csv("country_population.csv")
expectancy = pd.read_csv("life_expectancy.csv")

fertility=drop_nan(fertility,4)
population=drop_nan(population,4)
expectancy=drop_nan(expectancy,4)

# Dados 1960-2020
fertility_2020 = pd.read_csv("fertility_2020.csv")
population_2020 = pd.read_csv("population_2020.csv")
expectancy_2020 = pd.read_csv("life_2020.csv")

#Criar os data frames
fert=indexing(fertility, 1960)
pop=indexing(population, 1960)
exp=indexing(expectancy, 1960)

fert_2020 = indexing(fertility_2020, 1960)
pop_2020 = indexing(population_2020, 1960)
exp_2020 = indexing(expectancy_2020, 1960)

Vamos continuar a transformar o nosso dataset, de forma a que fiquem todas as séries temporais numa só coluna, a referência respetiva de que país se refere a série temporal e os respetivos lags para cada valor. 

In [58]:
#Versão final dos datasets
pop_fin=time_series(pop,4)
fert_fin=time_series(fert,4)
exp_fin=time_series(exp,4)

#Divisão em Train e Test
pop_train, pop_test=set_train_test(pop_fin,4)
exp_train, exp_test=set_train_test(exp_fin,4)
fert_train, fert_test=set_train_test(fert_fin,4)

# Divisao em X e y train e test
y_pop_train, y_pop_test = pop_train["dados"], pop_test["dados"]
X_pop_train, X_pop_test = pop_train[list(pop_train.columns)[:-1]],  pop_test[list(pop_test.columns)[:-1]]

y_fert_train, y_fert_test = fert_train["dados"], fert_test["dados"]
X_fert_train, X_fert_test = fert_train[list(fert_train.columns)[:-1]],  fert_test[list(fert_test.columns)[:-1]]

y_exp_train, y_exp_test = exp_train["dados"], exp_test["dados"]
X_exp_train, X_exp_test = exp_train[list(exp_train.columns)[:-1]],  exp_test[list(exp_test.columns)[:-1]]

models_pop = []
models_fert = []
models_exp = []

Com isto feito, podemos passar ao treino dos modelos

## Escolha dos parametros e treino de dos modelos

### Populacao

Treino de 2 modelos, LinearRegression e DecisionTreeRegressor

In [77]:
# LinearRegression
params_linear = [
    {"alpha" : [x*0.1 for x in range(1,10)]}]

grid_search_lasso_pop = GridSearchCV(
    Lasso(), params_linear, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)

grid_search_lasso_pop.fit(X_pop_train, y_pop_train)
print(grid_search_lasso_pop.best_params_)
print("Melhores Parâmetros:", grid_search_lasso_pop.best_params_,"\n")
preds= grid_search_lasso_pop.predict(X_pop_test)
printRegStatistics(y_pop_test, preds)
models_pop.append((pearsonr(y_pop_test, preds), grid_search_lasso_pop.best_estimator_))

{'alpha': 0.9}
Melhores Parâmetros: {'alpha': 0.9} 

The RVE is:  0.9999885194851839
The rmse is:  1722.3915251009319
The Correlation Score is is: 1.0000 (p-value=0.000000e+00)

The Maximum Error is is:  18163432.380937576
The Mean Absolute Error is:  995558.8324880867


In [78]:
# DecisionTreeRegressor
params =[{"max_depth" : [4,8,12,15,20]}]

grid_search_dtr_pop = GridSearchCV(
    DecisionTreeRegressor(), params, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)


grid_search_dtr_pop.fit(X_pop_train, y_pop_train)
print(grid_search_dtr_pop.best_params_)
print("Melhores Parâmetros:", grid_search_dtr_pop.best_params_,"\n")
preds= grid_search_dtr_pop.predict(X_pop_test)
printRegStatistics(y_pop_test, preds)
models_pop.append((pearsonr(y_pop_test, preds), grid_search_dtr_pop.best_estimator_))

{'max_depth': 15}
Melhores Parâmetros: {'max_depth': 15} 

The RVE is:  0.996675457961153
The rmse is:  7091.417994350061
The Correlation Score is is: 0.9984 (p-value=0.000000e+00)

The Maximum Error is is:  1322344486.0
The Mean Absolute Error is:  5322743.341226783


In [79]:
# Melhor modelo é o Lasso
pop_model =max(models_pop, key= lambda x: x[0])[1]
pop_model

Lasso(alpha=0.9)

### Fertility

Treino de 2 modelos, LinearRegression e DecisionTreeRegressor

In [80]:
# LinearRegression
params_linear = [
    {"alpha" : [x*0.1 for x in range(1,10)]}]

grid_search_lasso_fert = GridSearchCV(
    Lasso(), params_linear, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)

grid_search_lasso_fert.fit(X_fert_train, y_fert_train)
print(grid_search_lasso_fert.best_params_)
print("Melhores Parâmetros:", grid_search_lasso_fert.best_params_,"\n")
preds= grid_search_lasso_fert.predict(X_fert_test)
printRegStatistics(y_fert_test, preds)
models_fert.append((pearsonr(y_fert_test, preds), grid_search_lasso_fert.best_estimator_))

{'alpha': 0.1}
Melhores Parâmetros: {'alpha': 0.1} 

The RVE is:  0.9990869466529136
The rmse is:  0.21614422575734218
The Correlation Score is is: 0.9996 (p-value=0.000000e+00)

The Maximum Error is is:  0.47700920792216994
The Mean Absolute Error is:  0.03342941410012236


In [81]:
# DecisionTreeRegressor
params =[{"max_depth" : [4,8,12,15,20]}]

grid_search_dtr_fert= GridSearchCV(
    DecisionTreeRegressor(), params, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)


grid_search_dtr_fert.fit(X_fert_train, y_fert_train)
print(grid_search_dtr_fert.best_params_)
print("Melhores Parâmetros:", grid_search_dtr_fert.best_params_,"\n")
preds= grid_search_dtr_fert.predict(X_fert_test)
printRegStatistics(y_fert_test, preds)
models_fert.append((pearsonr(y_fert_test, preds), grid_search_dtr_fert.best_estimator_))

{'max_depth': 12}
Melhores Parâmetros: {'max_depth': 12} 

The RVE is:  0.9990305113766994
The rmse is:  0.21841711945948986
The Correlation Score is is: 0.9995 (p-value=0.000000e+00)

The Maximum Error is is:  0.4947283418918915
The Mean Absolute Error is:  0.030542034057430275


In [82]:
# Melhor modelo é o Lasso
fertility_model=max(models_fert, key= lambda x: x[0])[1]
fertility_model

Lasso(alpha=0.1)

### Life Expectancy

Treino de 2 modelos, LinearRegression e DecisionTreeRegressor

In [83]:
# LinearRegression
params_linear = [
    {"alpha" : [x*0.1 for x in range(1,10)]}]

grid_search_lasso_exp = GridSearchCV(
    Lasso(), params_linear, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)

grid_search_lasso_exp.fit(X_exp_train, y_exp_train)
print(grid_search_lasso_exp.best_params_)
print("Melhores Parâmetros:", grid_search_lasso_exp.best_params_,"\n")
preds= grid_search_lasso_exp.predict(X_exp_test)
printRegStatistics(y_exp_test, preds)
models_exp.append((pearsonr(y_exp_test, preds), grid_search_lasso_exp.best_estimator_))

{'alpha': 0.1}
Melhores Parâmetros: {'alpha': 0.1} 

The RVE is:  0.9993828712513287
The rmse is:  0.47660906393492997
The Correlation Score is is: 0.9997 (p-value=0.000000e+00)

The Maximum Error is is:  1.8094328860308337
The Mean Absolute Error is:  0.13869870754119568


In [84]:
# DecisionTreeRegressor
params =[{"max_depth" : [4,8,12,15,20,25,30]}]

grid_search_dtr_exp = GridSearchCV(
    DecisionTreeRegressor(), params, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)


grid_search_dtr_exp.fit(X_exp_train, y_exp_train)
print(grid_search_dtr_exp.best_params_)
print("Melhores Parâmetros:", grid_search_dtr_exp.best_params_,"\n")
preds= grid_search_dtr_exp.predict(X_exp_test)
printRegStatistics(y_exp_test, preds)
models_exp.append((pearsonr(y_exp_test, preds), grid_search_dtr_exp.best_estimator_))

{'max_depth': 30}
Melhores Parâmetros: {'max_depth': 30} 

The RVE is:  0.9986150924553947
The rmse is:  0.5753026202882512
The Correlation Score is is: 0.9993 (p-value=0.000000e+00)

The Maximum Error is is:  3.207317070000002
The Mean Absolute Error is:  0.18594793223623743


In [85]:
# Melhor modelo é o Lasso
expectancy_model=max(models_exp, key= lambda x: x[0])[1]
expectancy_model

Lasso(alpha=0.1)

## Previsao de Valores de 2017

Escolher a sample de 10 países aleatórios

In [86]:
sample = list(fertility["Country Name"].sample(n=10, random_state=483))
print(sample)

['Lesotho', 'North America', 'New Zealand', 'Singapore', 'Europe & Central Asia', 'Qatar', 'Uzbekistan', 'Estonia', 'Angola', 'Japan']


### Populacao

In [87]:
PrevisionDataPop = []

for country in sample:
    x_prevision, delta_prevision = predictAndCalculateDelta(pop_model, pop_fin, country, 2017, lag=4)
    x_real, delta_real = determineRealResults(pop_2020, country, 2017)
    _, x_rel_error = calculateErrors(x_real, x_prevision)
    _, delta_rel_error = calculateErrors(delta_real, delta_prevision)
    data_entry =  (country, x_real, x_prevision, x_rel_error, delta_real, delta_prevision, delta_rel_error)
    PrevisionDataPop.append(data_entry)

# Criar dataframe
PopDF = DFCreate(PrevisionDataPop, 2017)
PopDF

Unnamed: 0,Country,2017 Real Value,2017 Prevision,Relative Error %,2017 Real Delta,2017 Delta Prevision,Relative Error %.1
0,Lesotho,2170617.0,2234945.143,2.964,26745.0,31124.143,16.374
1,North America,361731237.0,362874118.957,0.316,2485441.0,3416625.957,37.466
2,New Zealand,4813600.0,4738004.739,1.57,99500.0,44804.739,54.97
3,Singapore,5612253.0,5684561.57,1.288,4970.0,77278.57,1454.901
4,Europe & Central Asia,915855416.0,917150309.422,0.141,3480711.0,5303031.422,52.355
5,Qatar,2711755.0,2597547.239,4.212,116589.0,27743.239,76.204
6,Uzbekistan,32388600.0,32345347.655,0.134,540700.0,497447.655,7.999
7,Estonia,1317384.0,1318785.593,0.106,1594.0,2995.593,87.929
8,Angola,30208628.0,29206130.323,3.319,1053882.0,392667.323,62.741
9,Japan,126972000.0,127869540.883,0.707,-104000.0,875029.883,941.375


### Fertility Rate

In [88]:
PrevisionDataFert = []

for country in sample:
    x_prevision, delta_prevision = predictAndCalculateDelta(fertility_model, fert_fin, country, 2017, lag=4)
    x_real, delta_real = determineRealResults(fert_2020, country, 2017)
    _, x_rel_error = calculateErrors(x_real, x_prevision)
    _, delta_rel_error = calculateErrors(delta_real, delta_prevision)
    data_entry =  (country, x_real, x_prevision, x_rel_error, delta_real, delta_prevision, delta_rel_error)
    PrevisionDataFert.append(data_entry)

# Criar dataframe
FerDF = DFCreate(PrevisionDataFert, 2017)
FerDF

Unnamed: 0,Country,2017 Real Value,2017 Prevision,Relative Error %,2017 Real Delta,2017 Delta Prevision,Relative Error %.1
0,Lesotho,3.187,3.07,3.683,-0.052,-0.024,53.129
1,North America,1.743,1.789,2.66,-0.055,0.009,117.267
2,New Zealand,1.81,1.877,3.707,-0.06,0.007,111.843
3,Singapore,1.16,1.224,5.546,-0.04,0.024,160.842
4,Europe & Central Asia,1.735,1.76,1.466,-0.047,0.01,121.83
5,Qatar,1.839,1.913,4.032,0.012,0.006,48.716
6,Uzbekistan,2.419,2.447,1.16,-0.036,-0.008,77.946
7,Estonia,1.59,1.595,0.287,-0.01,0.015,245.638
8,Angola,5.6,5.603,0.049,-0.086,-0.091,6.092
9,Japan,1.43,1.458,1.97,-0.01,0.018,281.643


### Life Expectancy

In [89]:
PrevisionDataExp = []

for country in sample:
    x_prevision, delta_prevision = predictAndCalculateDelta(expectancy_model, exp_fin, country, 2017, lag=4)
    x_real, delta_real = determineRealResults(exp_2020, country, 2017)
    _, x_rel_error = calculateErrors(x_real, x_prevision)
    _, delta_rel_error = calculateErrors(delta_real, delta_prevision)
    data_entry =  (country, x_real, x_prevision, x_rel_error, delta_real, delta_prevision, delta_rel_error)
    PrevisionDataExp.append(data_entry)

# Criar dataframe
ExpDF = DFCreate(PrevisionDataExp, 2017)
ExpDF

Unnamed: 0,Country,2017 Real Value,2017 Prevision,Relative Error %,2017 Real Delta,2017 Delta Prevision,Relative Error %.1
0,Lesotho,53.064,54.582,2.861,0.8,0.408,48.988
1,North America,78.879,79.168,0.366,0.002,0.113,6308.166
2,New Zealand,81.659,81.725,0.081,0.046,0.112,143.346
3,Singapore,83.095,82.929,0.2,0.249,0.134,46.058
4,Europe & Central Asia,77.779,77.439,0.436,0.213,0.157,26.34
5,Qatar,80.717,78.341,2.944,0.284,0.157,44.825
6,Uzbekistan,71.01,71.52,0.718,0.255,0.206,19.23
7,Estonia,78.093,77.91,0.235,0.451,0.173,61.677
8,Angola,61.68,61.882,0.327,0.588,0.335,43.094
9,Japan,84.1,84.12,0.024,0.115,0.135,17.668
