## Aprendizagem Automática - Projeto

In [137]:
# Imports
import pandas as pd

# Models and selection methods
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
# Linear regression metrics
from sklearn.metrics import explained_variance_score, mean_squared_error, max_error, mean_absolute_error
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelBinarizer

In [138]:
# Funcoes auxiliares

def printRegStatistics(truth, preds):
    print("The RVE is: ", explained_variance_score(truth, preds))
    print("The rmse is: ", mean_squared_error(truth, preds, squared=False))
    corr, pval = pearsonr(truth, preds)
    print("The Correlation Score is is: %6.4f (p-value=%e)\n"%(corr,pval))
    print("The Maximum Error is is: ", max_error(truth, preds))
    print("The Mean Absolute Error is: ", mean_absolute_error(truth, preds))
    

#Devolve uma lista com quantos elementos há seguidos entre valores de NaN. p.e: [1,nan,2,3,nan,nan,2,nan,2,3,4] -> [1,2,1,3]
def count_not_nan(serie):
    conta=0
    lista_seguida=[]
    for i in range(len(serie)):
        if serie[i]==1:
            conta+=1
        else:
            if conta!=0:
                lista_seguida.append(conta)
                conta=0
            else:
                conta=0
                
        if i==len(serie)-1 and conta!=0:
            lista_seguida.append(conta)
    return lista_seguida

#Função que lida com os nan nas df. Se o país não tiver nenhum dado -> drop. Se o país não tiver pelo menos lag+1 dados seguidos
#-> drop. O resto fica

def drop_nan(df,lag):
    pais_nan=[]
    for pais in df["Country Name"].index:
        nan_val=list(pd.DataFrame(df.iloc[pais][4:]).count(axis=1)) #ver se os valores são nan
        if sum(nan_val)==0: #se for tudo nan
            pais_nan.append(pais)
        elif max(count_not_nan(nan_val))<=lag: #Se não houver mais do que lag valores seguidos, drop
            pais_nan.append(pais)

    df=df.drop(pais_nan, axis=0)
    df = df.reset_index(drop=True)
    return df

#Função que escolha o dataframe até um certo ano apenas, que remove os países que contenham nan nesse intervalo de tempo

def indexing(dataf, ano_i):
    X_list=dataf.drop(columns=dataf.iloc[:,[1,2,3]]+dataf.loc[:,[str(i) for i in range(1960,ano_i)]])
    X_list=X_list.T
    X_list=X_list.rename(columns=X_list.iloc[0]).drop(X_list.index[0])
    return X_list

#Cria um DataFrame com os dados de um país e respetivos lags

def lags(df, lag, país):
    new_df=pd.DataFrame()
    new_df["dados"]=df[país]
    colunas = [str(i) + "lag" for i in range(1,lag+1)]

    for i,k in enumerate(colunas):
        new_df[k] = new_df["dados"].shift(+i+1)
    new_df=new_df.dropna()
    return new_df

#Função que cria o dataset necessário para as timeseries. As time series são todas unidas numa só coluna e os países respetivos
# a essa série temporal recebem um 1. Também concatena os lags no início da df.

def time_series(df, lag):
    binario = LabelBinarizer().fit(df.columns)
    df_final= pd.DataFrame()
    for pais in df.columns:
        pais_bin=pd.DataFrame(data=list(binario.transform([pais]))*len(df[pais]),
                             columns=binario.classes_,
                             index=df.index)
        #Agora juntamos os lags e os dados respetivos
        df_mid=pd.DataFrame()
        df_mid=pd.concat((df_mid,lags(df, lag, pais),pais_bin),axis=1).dropna()

        #Finalmente, concatenamos tudo numa só DataFrame
        df_final = pd.concat((df_final, df_mid), sort=True)
        df_fina=df_final.dropna()
    return df_final

In [139]:
# Mais funcoes auxiliares

# Função para criar um dataframe X para um dado dataset, pais e ano
def createX(dataset : pd.DataFrame, country : str, year : int, lag) -> pd.DataFrame:
    previousYearCountryData = dataset.loc[(dataset[country] == 1) & (dataset.index.get_level_values(0) == str(year - 1))]
    newX = previousYearCountryData.copy()
    newX.index = [str(year)]
    for i in range(lag,2,-1):
        newX.at[str(year), str(i)+"lag"] = newX.at[str(year),str(i-1)+"lag"]
    newX.at[str(year), "1lag"] = newX.at[str(year),"dados"]
    newX.drop(["dados"], axis=1, inplace=True)
    return newX

# Função para prever dados novos para um dado dataset ano e pais a partir do modelo treinado dado
def predictAndCalculateDelta(model, dataset : pd.DataFrame, country : str, year : int, lag):
    previousYearCountryData = dataset.loc[(dataset[country] == 1) & (dataset.index.get_level_values(0) == str(year - 1))]
    newX = createX(dataset, country, year, lag)
    x = model.predict(newX)
    x_previous = previousYearCountryData.at[str(year-1),"dados"]
    delta = x - x_previous
    return x[0], delta[0]

# Vai buscar o valor para um dado ano e pais e calcula o delta
def determineRealResults(dataset : pd.DataFrame, country : str, year : int):
    real_data = dataset.at[str(year), country]
    previous_year_data = dataset.at[str(year-1), country]
    delta = real_data - previous_year_data
    return real_data, delta

# Erro relativo e absoluto
def calculateErrors(real, pred):
    abs_error = abs(pred - real)
    relative_error = abs_error / abs(real)
    return abs_error, (relative_error*100)

# Criar dataframe com os resultados finais
def DFCreate(PrevisionData, year):
    # Criar dataframe
    df = pd.DataFrame(PrevisionData, columns=['Country', str(year) + ' Real Value', str(year) + ' Prevision','Relative Error %',str(year) + ' Real ddDelta',str(year) + ' Delta Prevision','Relative Error %'])
    # Converter valores em notacao legivel
    df = df.loc[:, df.columns == 'Country'].join(df.loc[:, FertDF.columns != 'Country'].applymap(lambda x: '{:,.3f}'.format(x)))
    return df

Importação dos datasets

In [140]:
# Dados 1960-2016
fertility = pd.read_csv("fertility_rate.csv")
population= pd.read_csv("country_population.csv")
expectancy = pd.read_csv("life_expectancy.csv")

fertility=drop_nan(fertility,4)
population=drop_nan(population,4)
expectancy=drop_nan(expectancy,4)

# Dados 1960-2020
fertility_2020 = pd.read_csv("fertility_2020.csv")
population_2020 = pd.read_csv("population_2020.csv")
expectancy_2020 = pd.read_csv("life_2020.csv")

#Criar os data frames

fert=indexing(fertility, 1960)
pop=indexing(population, 1960)
exp=indexing(expectancy, 1960)

fert_2020 = indexing(fertility_2020, 1960)
pop_2020 = indexing(population_2020, 1960)
exp_2020 = indexing(expectancy_2020, 1960)



Vamos continuar a transformar o nosso dataset, de forma a que fiquem todas as séries temporais numa só coluna, a referência respetiva de que país se refere a série temporal e os respetivos lags para cada valor. 

In [141]:
#Versão final dos datasets
pop_fin=time_series(pop,4)
fert_fin=time_series(fert,4)
exp_fin=time_series(exp,4)

# Divisao em X e y
y_pop = pop_fin["dados"]
X_pop = pop_fin[list(pop_fin.columns)[:-1]]

y_fert = fert_fin["dados"]
X_fert = fert_fin[list(fert_fin.columns)[:-1]]

y_exp = exp_fin["dados"]
X_exp = exp_fin[list(exp_fin.columns)[:-1]]

Com isto feito, podemos passar ao treino dos modelos

## Escolha dos parametros e treino de dos modelos

### Populacao

Treino de 2 modelos, LinearRegression e DecisionTreeRegressor

In [142]:
# LinearRegression
params_linear = [
    {"alpha" : [x*0.1 for x in range(1,10)]}]

grid_search_lasso_pop = GridSearchCV(
    Lasso(), params_linear, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)

grid_search_lasso_pop.fit(X_pop, y_pop)
print(grid_search_lasso_pop.best_params_)
print("Melhores Parâmetros:", grid_search_lasso_pop.best_params_,"\n")
preds= grid_search_lasso_pop.predict(X_pop)
printRegStatistics(y_pop, preds)

{'alpha': 0.9}
Melhores Parâmetros: {'alpha': 0.9} 

The RVE is:  0.9999945436199671
The rmse is:  1619501.3254123626
The Correlation Score is is: 1.0000 (p-value=0.000000e+00)

The Maximum Error is is:  11948798.81523943
The Mean Absolute Error is:  530556.1048894892


In [143]:
# DecisionTreeRegressor
params =[{"max_depth" : [4,8,12,15,20]}]

grid_search_dtr_pop = GridSearchCV(
    DecisionTreeRegressor(), params, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)


grid_search_dtr_pop.fit(X_pop, y_pop)
print(grid_search_dtr_pop.best_params_)
print("Melhores Parâmetros:", grid_search_dtr_pop.best_params_,"\n")
preds= grid_search_dtr_pop.predict(X_pop)
printRegStatistics(y_pop, preds)

{'max_depth': 15}
Melhores Parâmetros: {'max_depth': 15} 

The RVE is:  0.9999999972393108
The rmse is:  36428.20396314393
The Correlation Score is is: 1.0000 (p-value=0.000000e+00)

The Maximum Error is is:  884875.6666666865
The Mean Absolute Error is:  15867.714161545682


In [144]:
# Melhor modelo eh o DecisionTreeRegressor
pop_model = grid_search_dtr_pop

### Fertility

Treino de 2 modelos, LinearRegression e DecisionTreeRegressor

In [145]:
# LinearRegression
params_linear = [
    {"alpha" : [x*0.1 for x in range(1,10)]}]

grid_search_lasso_fert = GridSearchCV(
    Lasso(), params_linear, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)

grid_search_lasso_fert.fit(X_fert, y_fert)
print(grid_search_lasso_fert.best_params_)
print("Melhores Parâmetros:", grid_search_lasso_fert.best_params_,"\n")
preds= grid_search_lasso_fert.predict(X_fert)
printRegStatistics(y_fert, preds)

{'alpha': 0.1}
Melhores Parâmetros: {'alpha': 0.1} 

The RVE is:  0.9979575888734916
The rmse is:  0.08871661995659094
The Correlation Score is is: 0.9993 (p-value=0.000000e+00)

The Maximum Error is is:  1.7454652135665294
The Mean Absolute Error is:  0.06636704938394361


In [146]:
# DecisionTreeRegressor
params =[{"max_depth" : [4,8,12,15,20]}]

grid_search_dtr_fert= GridSearchCV(
    DecisionTreeRegressor(), params, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)


grid_search_dtr_fert.fit(X_fert, y_fert)
print(grid_search_dtr_fert.best_params_)
print("Melhores Parâmetros:", grid_search_dtr_fert.best_params_,"\n")
preds= grid_search_dtr_fert.predict(X_fert)
printRegStatistics(y_fert, preds)

{'max_depth': 15}
Melhores Parâmetros: {'max_depth': 15} 

The RVE is:  0.9999435684450303
The rmse is:  0.014746685287636535
The Correlation Score is is: 1.0000 (p-value=0.000000e+00)

The Maximum Error is is:  0.22628988856521692
The Mean Absolute Error is:  0.00673252946155591


In [147]:
# Melhor modelo eh o DecisionTreeRegressor
fertility_model = grid_search_dtr_fert

### Life Expectancy

Treino de 2 modelos, LinearRegression e DecisionTreeRegressor

In [148]:
# LinearRegression
params_linear = [
    {"alpha" : [x*0.01 for x in range(1,20)]}]

grid_search_lasso_exp = GridSearchCV(
    Lasso(), params_linear, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)

grid_search_lasso_exp.fit(X_exp, y_exp)
print(grid_search_lasso_exp.best_params_)
print("Melhores Parâmetros:", grid_search_lasso_exp.best_params_,"\n")
preds= grid_search_lasso_exp.predict(X_exp)
printRegStatistics(y_exp, preds)

{'alpha': 0.01}
Melhores Parâmetros: {'alpha': 0.01} 

The RVE is:  0.9995256429514703
The rmse is:  0.23546571634644062
The Correlation Score is is: 0.9998 (p-value=0.000000e+00)

The Maximum Error is is:  3.4670980408972376
The Mean Absolute Error is:  0.11483406249588855


In [149]:
# DecisionTreeRegressor
params =[{"max_depth" : [4,8,12,15,20,25,30]}]

grid_search_dtr_exp = GridSearchCV(
    DecisionTreeRegressor(), params, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)


grid_search_dtr_exp.fit(X_exp, y_exp)
print(grid_search_dtr_exp.best_params_)
print("Melhores Parâmetros:", grid_search_dtr_exp.best_params_,"\n")
preds= grid_search_dtr_exp.predict(X_exp)
printRegStatistics(y_exp, preds)

{'max_depth': 25}
Melhores Parâmetros: {'max_depth': 25} 

The RVE is:  0.9999999226070806
The rmse is:  0.0030076383831732067
The Correlation Score is is: 1.0000 (p-value=0.000000e+00)

The Maximum Error is is:  0.09094303583331964
The Mean Absolute Error is:  0.0003359126094390135


In [150]:
# Melhor modelo eh o DecisionTreeRegressor
expectancy_model = grid_search_dtr_exp

## Previsao de Valores de 2017

Escolher a sample de 10 países aleatórios

In [151]:
sample = list(fertility["Country Name"].sample(n=10, random_state=483))
print(sample)

['Lesotho', 'North America', 'New Zealand', 'Singapore', 'Europe & Central Asia', 'Qatar', 'Uzbekistan', 'Estonia', 'Angola', 'Japan']


### Populacao

In [152]:
PrevisionDataPop = []

for country in sample:
    x_prevision, delta_prevision = predictAndCalculateDelta(pop_model, pop_fin, country, 2017, lag=4)
    x_real, delta_real = determineRealResults(pop_2020, country, 2017)
    _, x_rel_error = calculateErrors(x_real, x_prevision)
    _, delta_rel_error = calculateErrors(delta_real, delta_prevision)
    data_entry =  (country, x_real, x_prevision, x_rel_error, delta_real, delta_prevision, delta_rel_error)
    PrevisionDataPop.append(data_entry)

# Criar dataframe
PopDF = DFCreate(PrevisionDataPop, 2017)
PopDF

Unnamed: 0,Country,2017 Real Value,2017 Prevision,Relative Error %,2017 Real ddDelta,2017 Delta Prevision,Relative Error %.1
0,Lesotho,2170617.0,2230640.091,2.765,26745.0,26819.091,0.277
1,North America,361731237.0,369246309.0,2.078,2485441.0,9788816.0,293.846
2,New Zealand,4813600.0,4834581.762,0.436,99500.0,141381.762,42.092
3,Singapore,5612253.0,5694347.643,1.463,4970.0,87064.643,1651.804
4,Europe & Central Asia,915855416.0,923253553.0,0.808,3480711.0,11406275.0,227.7
5,Qatar,2711755.0,2690731.0,0.775,116589.0,120927.0,3.721
6,Uzbekistan,32388600.0,32548172.75,0.493,540700.0,700272.75,29.512
7,Estonia,1317384.0,1319176.818,0.136,1594.0,3386.818,112.473
8,Angola,30208628.0,29385348.875,2.725,1053882.0,571885.875,45.735
9,Japan,126972000.0,126994511.0,0.018,-104000.0,0.0,100.0


### Fertility Rate

In [153]:
PrevisionDataFert = []

for country in sample:
    x_prevision, delta_prevision = predictAndCalculateDelta(fertility_model, fert_fin, country, 2017, lag=4)
    x_real, delta_real = determineRealResults(fert_2020, country, 2017)
    _, x_rel_error = calculateErrors(x_real, x_prevision)
    _, delta_rel_error = calculateErrors(delta_real, delta_prevision)
    data_entry =  (country, x_real, x_prevision, x_rel_error, delta_real, delta_prevision, delta_rel_error)
    PrevisionDataFert.append(data_entry)

# Criar dataframe
FerDF = DFCreate(PrevisionDataFert, 2017)
FerDF

Unnamed: 0,Country,2017 Real Value,2017 Prevision,Relative Error %,2017 Real ddDelta,2017 Delta Prevision,Relative Error %.1
0,Lesotho,3.187,3.04,4.602,-0.052,-0.054,3.231
1,North America,1.743,1.749,0.37,-0.055,-0.031,44.12
2,New Zealand,1.81,1.838,1.527,-0.06,-0.032,46.059
3,Singapore,1.16,1.29,11.207,-0.04,0.09,325.0
4,Europe & Central Asia,1.735,1.757,1.26,-0.047,0.007,114.158
5,Qatar,1.839,1.892,2.856,0.012,-0.015,228.917
6,Uzbekistan,2.419,2.457,1.561,-0.036,0.002,104.886
7,Estonia,1.59,1.59,0.018,-0.01,0.01,197.177
8,Angola,5.6,5.609,0.161,-0.086,-0.085,1.163
9,Japan,1.43,1.453,1.633,-0.01,0.013,233.557


### Life Expectancy

In [154]:
PrevisionDataExp = []

for country in sample:
    x_prevision, delta_prevision = predictAndCalculateDelta(expectancy_model, exp_fin, country, 2017, lag=4)
    x_real, delta_real = determineRealResults(exp_2020, country, 2017)
    _, x_rel_error = calculateErrors(x_real, x_prevision)
    _, delta_rel_error = calculateErrors(delta_real, delta_prevision)
    data_entry =  (country, x_real, x_prevision, x_rel_error, delta_real, delta_prevision, delta_rel_error)
    PrevisionDataExp.append(data_entry)

# Criar dataframe
ExpDF = DFCreate(PrevisionDataExp, 2017)
ExpDF

Unnamed: 0,Country,2017 Real Value,2017 Prevision,Relative Error %,2017 Real ddDelta,2017 Delta Prevision,Relative Error %.1
0,Lesotho,53.064,54.709,3.1,0.8,0.535,33.125
1,North America,78.879,79.154,0.349,0.002,0.099,5539.643
2,New Zealand,81.659,81.612,0.056,0.046,0.0,100.0
3,Singapore,83.095,82.795,0.361,0.249,0.0,100.0
4,Europe & Central Asia,77.779,77.741,0.048,0.213,0.459,115.178
5,Qatar,80.717,78.438,2.824,0.284,0.254,10.59
6,Uzbekistan,71.01,71.459,0.632,0.255,0.145,43.319
7,Estonia,78.093,77.678,0.531,0.451,-0.059,112.973
8,Angola,61.68,61.899,0.355,0.588,0.352,40.136
9,Japan,84.1,84.278,0.212,0.115,0.293,155.202
