In [1]:
import pandas as pd
import numpy as np

# Import des fichiers créés à l'étape précédente.
X_train=pd.read_csv('../data/X_train.csv')
X_test=pd.read_csv('../data/X_test.csv')
y_train=pd.read_csv('../data/y_train.csv')
y_test=pd.read_csv('../data/y_test.csv')

In [2]:
# Définition des métriques de performance d'une régression

# Erreur Quadratique Moyenne (MSE) : C'est la moyenne des carrés des écarts entre les valeurs prédites et les valeurs réelles. Plus ce nombre est petit, mieux c'est. 

# Erreur Absolue Moyenne (MAE) : C'est la moyenne des valeurs absolues des écarts entre les prédictions et les valeurs réelles. 

# Coefficient de Détermination (R^2) : Cette valeur indique la proportion de la variance de la variable dépendante qui est prévisible à partir des variables indépendantes.

# Erreur Quadratique Moyenne Racine (RMSE) : C'est la racine carrée de la MSE. Elle est dans les mêmes unités que la variable de réponse et peut être plus intuitive.

In [3]:
# Création et entraînement du modèle de régression linéaire
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lin_reg = LinearRegression() # Instanciation du modèle
lin_reg.fit(X_train, y_train) # Entraînement du modèle

# Prédiction sur l'ensemble de test
y_pred = lin_reg.predict(X_test)

# Calcul de l'erreur quadratique moyenne (MSE) et de la racine carrée de l'erreur quadratique moyenne (RMSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Affichage des résultats
print(f"Erreur quadratique moyenne (MSE) : {mse}")
print(f"Racine carrée de l'erreur quadratique moyenne (RMSE) : {rmse}")

# Coefficient de détermination R^2
r2_score = lin_reg.score(X_test, y_test)
print(f"Coefficient de détermination (R^2) : {r2_score}")

Erreur quadratique moyenne (MSE) : 0.28716596930311106
Racine carrée de l'erreur quadratique moyenne (RMSE) : 0.5358786889801749
Coefficient de détermination (R^2) : 0.7705291303436734


In [4]:
coeffs=list(lin_reg.coef_)
coeffs.insert(0, lin_reg.intercept_)

In [5]:
coeffs

[array([-0.71589978]),
 array([ 3.55728453e-01,  1.52919669e+00,  1.20249777e-02,  7.41787894e-01,
         4.37355321e-01, -4.64881471e-01,  1.21322648e+00, -8.37953836e-01,
        -8.56746322e-02, -3.96758024e-01,  3.37377609e-01, -9.88942658e-04,
         4.76158375e-01, -2.18512313e-01, -3.68040635e-01, -2.83343048e-01,
         3.06117555e-01])]

In [20]:
feats2=list(X_train.columns)
feats2.insert(0, 'intercept')

In [21]:
feats2

['intercept',
 'Log GDP per capita',
 'Social support',
 'Healthy life expectancy at birth',
 'Freedom to make life choices',
 'Generosity',
 'Perceptions of corruption',
 'Positive affect',
 'Negative affect',
 'Regional indicator_Commonwealth of Independent States',
 'Regional indicator_East Asia',
 'Regional indicator_Latin America and Caribbean',
 'Regional indicator_Middle East and North Africa',
 'Regional indicator_North America and ANZ',
 'Regional indicator_South Asia',
 'Regional indicator_Southeast Asia',
 'Regional indicator_Sub-Saharan Africa',
 'Regional indicator_Western Europe']

In [12]:
# ML-1.2/ affichage de l'intercept ainsi que les coefficients de chaque variable estimée par le modèle:

coeffs=list(lin_reg.coef_)
coeffs.insert(0, lin_reg.intercept_)

feats2=list(X_train.columns)
feats2.insert(0, 'intercept')

pd.DataFrame({'valeur estimée': coeffs}, index=feats2)

ValueError: Length of values (2) does not match length of index (18)

In [15]:
X_train

Unnamed: 0,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Regional indicator_Commonwealth of Independent States,Regional indicator_East Asia,Regional indicator_Latin America and Caribbean,Regional indicator_Middle East and North Africa,Regional indicator_North America and ANZ,Regional indicator_South Asia,Regional indicator_Southeast Asia,Regional indicator_Sub-Saharan Africa,Regional indicator_Western Europe
0,10.798,0.820,65.400,0.820,-0.045,0.5065,0.724,0.3270,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,8.484,0.739,50.500,0.713,0.099,0.9130,0.744,0.3160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,8.136,0.621,57.900,0.699,-0.092,0.7380,0.588,0.4480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,8.935,0.802,62.820,0.865,0.020,0.8210,0.863,0.3490,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10.887,0.846,76.820,0.894,0.235,0.2450,0.734,0.1960,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1673,7.449,0.611,52.400,0.718,0.074,0.8740,0.513,0.4380,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1674,11.592,0.952,71.700,0.908,0.096,0.4230,0.809,0.2160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1675,6.958,0.537,57.948,0.780,0.038,0.7290,0.687,0.2615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1676,7.686,0.818,58.200,0.618,0.291,0.9000,0.745,0.1530,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [16]:
# Après avoir entraîné le modèle lin_reg
intercept = lin_reg.intercept_
coefficients = lin_reg.coef_

# Créer une liste de noms de caractéristiques
feature_names = X_train.columns.tolist()

# Ajouter l'intercept
coeffs = [intercept] + coefficients.tolist()
feats2 = ['intercept'] + feature_names

# Créer le DataFrame
coefficients_df = pd.DataFrame({'valeur estimée': coeffs}, index=feats2)
print(coefficients_df)


ValueError: Length of values (2) does not match length of index (18)

In [17]:
X_train.columns

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect',
       'Regional indicator_Commonwealth of Independent States',
       'Regional indicator_East Asia',
       'Regional indicator_Latin America and Caribbean',
       'Regional indicator_Middle East and North Africa',
       'Regional indicator_North America and ANZ',
       'Regional indicator_South Asia', 'Regional indicator_Southeast Asia',
       'Regional indicator_Sub-Saharan Africa',
       'Regional indicator_Western Europe'],
      dtype='object')