In [1]:
import pandas as pd 

# Load the CSV file produced during the data-visualisation step.
df = pd.read_csv('../data/df_global.csv')

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Country name,Regional indicator,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,South Asia,2008,3.724,7.37,0.451,50.8,0.718,0.168,0.882,0.518,0.258
1,Afghanistan,South Asia,2009,4.402,7.54,0.552,51.2,0.679,0.19,0.85,0.584,0.237
2,Afghanistan,South Asia,2010,4.758,7.647,0.539,51.6,0.6,0.121,0.707,0.618,0.275
3,Afghanistan,South Asia,2011,3.832,7.62,0.521,51.92,0.496,0.162,0.731,0.611,0.267
4,Afghanistan,South Asia,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.71,0.268


In [2]:
# The dataset has already been cleaned (no missing values, no duplicates).

# 'Country name' and 'year' are removed because they bring no particular
# value to the model.
columns_to_drop = ['Country name', 'year']
df_forML = df.drop(columns=columns_to_drop)
df_forML.head()

Unnamed: 0,Regional indicator,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,South Asia,3.724,7.37,0.451,50.8,0.718,0.168,0.882,0.518,0.258
1,South Asia,4.402,7.54,0.552,51.2,0.679,0.19,0.85,0.584,0.237
2,South Asia,4.758,7.647,0.539,51.6,0.6,0.121,0.707,0.618,0.275
3,South Asia,3.832,7.62,0.521,51.92,0.496,0.162,0.731,0.611,0.267
4,South Asia,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.71,0.268


In [3]:
# Persist the modelling dataset to CSV (the row index is not written).
df_forML.to_csv(path_or_buf='../data/df_ml.csv', index=False)

In [4]:
# Separate the target column ('Life Ladder') from the explanatory features.
target = df_forML['Life Ladder']
params = df_forML.drop(columns='Life Ladder')

In [5]:
# Build the training and test sets: 20% of the rows are held out for testing,
# with a fixed random_state so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    params, target, test_size=0.2, random_state=42
)

In [6]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Regional indicator,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
1591,Middle East and North Africa,10.798,0.82,65.4,0.82,-0.045,0.5065,0.724,0.327
1379,Sub-Saharan Africa,8.484,0.739,50.5,0.713,0.099,0.913,0.744,0.316
414,Sub-Saharan Africa,8.136,0.621,57.9,0.699,-0.092,0.738,0.588,0.448
704,Latin America and Caribbean,8.935,0.802,62.82,0.865,0.02,0.821,0.863,0.349
755,East Asia,10.887,0.846,76.82,0.894,0.235,0.245,0.734,0.196


In [7]:
# One-hot encode the categorical 'Regional indicator' column.
from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, where passing it raises TypeError. Prefer the current name
# and fall back to the legacy one only on releases that do not know it yet.
# drop="first" omits the first category to avoid a redundant dummy column.
try:
    ohe = OneHotEncoder(drop="first", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop="first", sparse=False)

# Fit the encoder on the training data only, then encode it.
encoded_train = ohe.fit_transform(X_train[['Regional indicator']])

# Encode the test data with the encoder fitted on the training data.
encoded_test = ohe.transform(X_test[['Regional indicator']])

# Wrap the encoded arrays in DataFrames that keep the original row index,
# so they can later be concatenated column-wise with the numeric features.
encoded_columns = ohe.get_feature_names_out(['Regional indicator'])
cat_train = pd.DataFrame(encoded_train, columns=encoded_columns, index=X_train.index)
cat_test = pd.DataFrame(encoded_test, columns=encoded_columns, index=X_test.index)



In [8]:
# Remove the raw 'Regional indicator' column from both the training and
# test feature sets.
X_train = X_train.drop(columns='Regional indicator')
X_test = X_test.drop(columns='Regional indicator')

In [9]:
# Append the one-hot encoded columns to each feature set (column-wise,
# aligned on the row index).
X_train = pd.concat((X_train, cat_train), axis="columns")
X_test = pd.concat((X_test, cat_test), axis="columns")

In [10]:
# Display the training features together with the one-hot encoded columns.
X_train

Unnamed: 0,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Regional indicator_Commonwealth of Independent States,Regional indicator_East Asia,Regional indicator_Latin America and Caribbean,Regional indicator_Middle East and North Africa,Regional indicator_North America and ANZ,Regional indicator_South Asia,Regional indicator_Southeast Asia,Regional indicator_Sub-Saharan Africa,Regional indicator_Western Europe
1591,10.798,0.820,65.400,0.820,-0.045,0.5065,0.724,0.3270,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1379,8.484,0.739,50.500,0.713,0.099,0.9130,0.744,0.3160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
414,8.136,0.621,57.900,0.699,-0.092,0.7380,0.588,0.4480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
704,8.935,0.802,62.820,0.865,0.020,0.8210,0.863,0.3490,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
755,10.887,0.846,76.820,0.894,0.235,0.2450,0.734,0.1960,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,7.449,0.611,52.400,0.718,0.074,0.8740,0.513,0.4380,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1095,11.592,0.952,71.700,0.908,0.096,0.4230,0.809,0.2160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1130,6.958,0.537,57.948,0.780,0.038,0.7290,0.687,0.2615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1294,7.686,0.818,58.200,0.618,0.291,0.9000,0.745,0.1530,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [11]:
# Display the test features together with the one-hot encoded columns.
X_test

Unnamed: 0,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Regional indicator_Commonwealth of Independent States,Regional indicator_East Asia,Regional indicator_Latin America and Caribbean,Regional indicator_Middle East and North Africa,Regional indicator_North America and ANZ,Regional indicator_South Asia,Regional indicator_Southeast Asia,Regional indicator_Sub-Saharan Africa,Regional indicator_Western Europe
1005,8.374,0.891,61.520,0.748,-0.155,0.932,0.681,0.151,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1078,10.046,0.930,63.140,0.567,-0.295,0.967,0.621,0.254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67,9.402,0.698,66.600,0.614,-0.147,0.865,0.625,0.437,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
867,10.515,0.909,72.580,0.739,0.150,0.849,0.698,0.409,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
650,9.585,0.671,64.300,0.783,-0.238,0.655,0.559,0.243,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,8.276,0.765,61.200,0.964,0.088,0.821,0.799,0.408,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1746,9.279,0.824,66.300,0.800,0.161,0.823,0.864,0.197,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
76,10.754,0.945,72.400,0.935,0.274,0.368,0.811,0.214,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
205,8.998,0.805,64.200,0.877,-0.054,0.868,0.790,0.382,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# On obtient un jeu final dans lequel la colonne 'Regional indicator' a bien été encodée.
# Le dataset est prêt pour l'entraînement des modèles.

In [13]:
# Save each train/test split to its own CSV file for the machine-learning
# steps (the row index is not written).
split_outputs = (
    (X_train, '../data/X_train.csv'),
    (X_test, '../data/X_test.csv'),
    (y_train, '../data/y_train.csv'),
    (y_test, '../data/y_test.csv'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)