In [None]:
# Le service marketing de Walmart nous a demandé de construire un modèle d'apprentissage automatique
# capable d'estimer les ventes hebdomadaires dans leurs magasins, avec la meilleure précision possible 
#sur les prédictions faites.

EDA

In [1]:
!pip install plotly 



In [2]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
)
pio.templates.default = "jedha"
pio.renderers.default = "iframe_connected"

In [3]:
# Import dataset
print("Loading dataset...")
dataset = pd.read_csv("Walmart_Store_sales.csv")
print("...Done.")
print()

Loading dataset...
...Done.



In [4]:
# Basic stats
print("Number of rows and columns: {}".format(dataset.shape))
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()
print("Percentage of missing values: ")
display(100*dataset.isnull().sum()/dataset.shape[0])

Number of rows and columns: (150, 8)

Display of dataset: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092



Basics statistics: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,132,136.0,138.0,132.0,136.0,138.0,135.0
unique,,85,,,,,,
top,,07-01-2011,,,,,,
freq,,4,,,,,,
mean,9.866667,,1249536.0,0.07971,61.398106,3.320853,179.898509,7.59843
std,6.231191,,647463.0,0.271831,18.378901,0.478149,40.274956,1.577173
min,1.0,,268929.0,0.0,18.79,2.514,126.111903,5.143
25%,4.0,,605075.7,0.0,45.5875,2.85225,131.970831,6.5975
50%,9.0,,1261424.0,0.0,62.985,3.451,197.908893,7.47
75%,15.75,,1806386.0,0.0,76.345,3.70625,214.934616,8.15



Percentage of missing values: 


Store            0.000000
Date            12.000000
Weekly_Sales     9.333333
Holiday_Flag     8.000000
Temperature     12.000000
Fuel_Price       9.333333
CPI              8.000000
Unemployment    10.000000
dtype: float64

In [5]:
corr_matrix = round(dataset.corr(),3)

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.values.tolist(),
                                  y = corr_matrix.index.values.tolist())
fig.show()


In [6]:
px.scatter(dataset, x = 'Unemployment', y = 'Weekly_Sales')

In [7]:
px.scatter(dataset, x = 'CPI', y = 'Weekly_Sales')

In [8]:
# Univariate analysis
from plotly.subplots import make_subplots

# Distribution of each numeric variable
num_features = ['Temperature', 'Fuel_Price','CPI','Unemployment']# il boucle sur les features quantitative et fait un histogramme
#pour chacun des features 
fig1 = make_subplots(rows = len(num_features), cols = 1, subplot_titles = num_features)
for i in range(len(num_features)):
    fig1.add_trace(
        go.Histogram(
            x = dataset[num_features[i]], nbinsx = 10),
        row = i + 1,
        col = 1)
fig1.update_layout(
        title = go.layout.Title(text = "Distribution of quantitative variables", x = 0.5), showlegend = False, 
            autosize=False, height=500)
fig1.show()

In [9]:
# Univariate analysis
from plotly.subplots import make_subplots

# Barplot of each qualitative variable
cat_features = ['Store', 'Holiday_Flag']# il boucle sur la liste des feautures catégorielles et pr chaque feature il va faire un 
#barplot 
fig2 = make_subplots(rows = len(cat_features), cols = 1, subplot_titles = cat_features)
for i in range(len(cat_features)):
    
    x_coords = dataset[cat_features[i]].value_counts().index.tolist()
    y_coords = dataset[cat_features[i]].value_counts().tolist()
    
    fig2.add_trace(
        go.Bar(
            x = x_coords,
            y = y_coords),
        row = i + 1,
        col = 1)
fig2.update_layout(
        title = go.layout.Title(text = "Barplot of qualitative variables", x = 0.5), showlegend = False, 
            autosize=False, height=500)
fig2.show()

Preprocessing - Pandas 🐼🐼

In [10]:
dataset['Holiday_Flag'].value_counts()

0.0    127
1.0     11
Name: Holiday_Flag, dtype: int64

In [11]:
dataset=dataset.drop(columns="Holiday_Flag",axis=1)
dataset

Unnamed: 0,Store,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,42.38,3.435,128.616064,7.470
2,17.0,27-07-2012,,,,130.719581,5.936
3,11.0,,1244390.03,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,78.89,2.759,212.412888,7.092
...,...,...,...,...,...,...,...
145,14.0,18-06-2010,2248645.59,72.62,2.780,182.442420,8.899
146,7.0,,716388.81,20.74,2.778,,
147,17.0,11-06-2010,845252.21,57.14,2.841,126.111903,
148,8.0,12-08-2011,856796.10,86.05,3.638,219.007525,


In [12]:
import datetime as dt  
dataset['Date'] = pd.to_datetime(dataset['Date'])
dataset['Date']

0     2011-02-18
1     2011-03-25
2     2012-07-27
3            NaT
4     2010-05-28
         ...    
145   2010-06-18
146          NaT
147   2010-11-06
148   2011-12-08
149   2012-04-20
Name: Date, Length: 150, dtype: datetime64[ns]

In [13]:
dataset['year'] = dataset['Date'].dt.year
dataset['month']=dataset['Date'].dt.month
dataset['day']=dataset['Date'].dt.day
dataset['dayofweek'] = dataset['Date'].dt.dayofweek
dataset=dataset.drop(columns="Date",axis=1)
dataset

Unnamed: 0,Store,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,year,month,day,dayofweek
0,6.0,1572117.54,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,13.0,1807545.43,42.38,3.435,128.616064,7.470,2011.0,3.0,25.0,4.0
2,17.0,,,,130.719581,5.936,2012.0,7.0,27.0,4.0
3,11.0,1244390.03,84.57,,214.556497,7.346,,,,
4,6.0,1644470.66,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
...,...,...,...,...,...,...,...,...,...,...
145,14.0,2248645.59,72.62,2.780,182.442420,8.899,2010.0,6.0,18.0,4.0
146,7.0,716388.81,20.74,2.778,,,,,,
147,17.0,845252.21,57.14,2.841,126.111903,,2010.0,11.0,6.0,5.0
148,8.0,856796.10,86.05,3.638,219.007525,,2011.0,12.0,8.0,3.0


In [14]:
def onehot_encoode(df,column,prefix):
    dummies=pd.get_dummies(dataset[column],prefix=prefix)
    df=pd.concat([df,dummies],axis=1)
    df=df.drop(column,axis=1)
    return df


In [15]:
dataset=onehot_encoode(dataset,column="Store",prefix="Sotre")
dataset

Unnamed: 0,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,year,month,day,dayofweek,Sotre_1.0,...,Sotre_11.0,Sotre_12.0,Sotre_13.0,Sotre_14.0,Sotre_15.0,Sotre_16.0,Sotre_17.0,Sotre_18.0,Sotre_19.0,Sotre_20.0
0,1572117.54,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
1,1807545.43,42.38,3.435,128.616064,7.470,2011.0,3.0,25.0,4.0,0,...,0,0,1,0,0,0,0,0,0,0
2,,,,130.719581,5.936,2012.0,7.0,27.0,4.0,0,...,0,0,0,0,0,0,1,0,0,0
3,1244390.03,84.57,,214.556497,7.346,,,,,0,...,1,0,0,0,0,0,0,0,0,0
4,1644470.66,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,2248645.59,72.62,2.780,182.442420,8.899,2010.0,6.0,18.0,4.0,0,...,0,0,0,1,0,0,0,0,0,0
146,716388.81,20.74,2.778,,,,,,,0,...,0,0,0,0,0,0,0,0,0,0
147,845252.21,57.14,2.841,126.111903,,2010.0,11.0,6.0,5.0,0,...,0,0,0,0,0,0,1,0,0,0
148,856796.10,86.05,3.638,219.007525,,2011.0,12.0,8.0,3.0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Drop lines containing outliers (using masks)
print('Dropping outliers for Temperature...')
to_keep = dataset['Temperature'] < dataset['Temperature'].mean() + 3*dataset['Temperature'].std()
dataset = dataset.loc[to_keep,:]
print('Done. Number of lines remaining : ', dataset.shape[0])
print()

print('Dropping outliers for Fuel_Price...')
to_keep = dataset['Fuel_Price'] < dataset['Fuel_Price'].mean() + 3*dataset['Fuel_Price'].std()
dataset = dataset.loc[to_keep,:]
print('Done. Number of lines remaining : ', dataset.shape[0])
print()

print('Dropping outliers for CPI...')
to_keep = dataset['CPI'] < dataset['CPI'].mean() + 3*dataset['CPI'].std()
dataset = dataset.loc[to_keep,:]
print('Done. Number of lines remaining : ', dataset.shape[0])
print()

print('Dropping outliers for Unemployment...')
to_keep = dataset['Unemployment'] < dataset['Unemployment'].mean() + 3*dataset['Unemployment'].std()
dataset = dataset.loc[to_keep,:]
print('Done. Number of lines remaining : ', dataset.shape[0])
print()
dataset.shape

Dropping outliers for Temperature...
Done. Number of lines remaining :  132

Dropping outliers for Fuel_Price...
Done. Number of lines remaining :  119

Dropping outliers for CPI...
Done. Number of lines remaining :  111

Dropping outliers for Unemployment...
Done. Number of lines remaining :  98



(98, 29)

In [17]:
dataset=dataset.dropna(subset=["Weekly_Sales"])
dataset

Unnamed: 0,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,year,month,day,dayofweek,Sotre_1.0,...,Sotre_11.0,Sotre_12.0,Sotre_13.0,Sotre_14.0,Sotre_15.0,Sotre_16.0,Sotre_17.0,Sotre_18.0,Sotre_19.0,Sotre_20.0
0,1572117.54,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
1,1807545.43,42.38,3.435,128.616064,7.470,2011.0,3.0,25.0,4.0,0,...,0,0,1,0,0,0,0,0,0,0
4,1644470.66,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
6,695396.19,69.80,4.069,134.855161,7.658,2011.0,3.0,6.0,6.0,0,...,0,0,0,0,1,0,0,0,0,0
7,2203523.20,39.93,3.617,213.023623,6.961,2012.0,3.0,2.0,4.0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,532739.77,50.60,3.804,197.588605,8.090,2012.0,5.0,25.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
143,396968.80,78.53,2.705,214.495838,7.343,2010.0,4.0,6.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
144,424513.08,73.44,3.594,226.968844,6.034,2012.0,10.0,19.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
145,2248645.59,72.62,2.780,182.442420,8.899,2010.0,6.0,18.0,4.0,0,...,0,0,0,1,0,0,0,0,0,0


In [18]:
display(100*dataset.isnull().sum()/dataset.shape[0])

Weekly_Sales     0.000000
Temperature      0.000000
Fuel_Price       0.000000
CPI              0.000000
Unemployment     0.000000
year            11.111111
month           11.111111
day             11.111111
dayofweek       11.111111
Sotre_1.0        0.000000
Sotre_2.0        0.000000
Sotre_3.0        0.000000
Sotre_4.0        0.000000
Sotre_5.0        0.000000
Sotre_6.0        0.000000
Sotre_7.0        0.000000
Sotre_8.0        0.000000
Sotre_9.0        0.000000
Sotre_10.0       0.000000
Sotre_11.0       0.000000
Sotre_12.0       0.000000
Sotre_13.0       0.000000
Sotre_14.0       0.000000
Sotre_15.0       0.000000
Sotre_16.0       0.000000
Sotre_17.0       0.000000
Sotre_18.0       0.000000
Sotre_19.0       0.000000
Sotre_20.0       0.000000
dtype: float64

In [19]:
# Separate target variable Y from features X
target_name = 'Weekly_Sales'

print("Separating labels from features...")
Y = dataset.loc[:,target_name]
X = dataset.loc[:,[c for c in dataset.columns if c!=target_name]] # All columns are kept, except the target
print("...Done.")
print(Y.head())
print()
print(X.head())
print()

Separating labels from features...
...Done.
0    1572117.54
1    1807545.43
4    1644470.66
6     695396.19
7    2203523.20
Name: Weekly_Sales, dtype: float64

   Temperature  Fuel_Price         CPI  Unemployment    year  month   day  \
0        59.61       3.045  214.777523         6.858  2011.0    2.0  18.0   
1        42.38       3.435  128.616064         7.470  2011.0    3.0  25.0   
4        78.89       2.759  212.412888         7.092  2010.0    5.0  28.0   
6        69.80       4.069  134.855161         7.658  2011.0    3.0   6.0   
7        39.93       3.617  213.023623         6.961  2012.0    3.0   2.0   

   dayofweek  Sotre_1.0  Sotre_2.0  ...  Sotre_11.0  Sotre_12.0  Sotre_13.0  \
0        4.0          0          0  ...           0           0           0   
1        4.0          0          0  ...           0           0           1   
4        4.0          0          0  ...           0           0           0   
6        6.0          0          0  ...           0          

In [21]:
# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X = X.values
Y = Y.tolist()
print("...Done")
print(X[0:5,:])
print()
print(Y[0:5])

Convert pandas DataFrames to numpy arrays...
...Done
[[5.96100000e+01 3.04500000e+00 2.14777523e+02 6.85800000e+00
  2.01100000e+03 2.00000000e+00 1.80000000e+01 4.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.23800000e+01 3.43500000e+00 1.28616064e+02 7.47000000e+00
  2.01100000e+03 3.00000000e+00 2.50000000e+01 4.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [7.88900000e+01 2.75900000e+00 2.12412888e+02 7.09200000e+00
  2.01000000e+0

Preprocessing - Scikit-Learn

In [22]:
# First : always divide dataset into train set & test set !!
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [23]:
# Create pipeline for numeric features
numeric_features = [0,1,2,3,4,5,6,7] # Positions of numeric columns in X_train/X_test
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # missing values will be replaced by columns' mean
    ('scaler', StandardScaler())
])


In [24]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = numeric_transformer 

In [25]:
# Preprocessings on train set (C'est là où on va appliquer les préproceessing)
print("Performing preprocessings on train set...")
print(X_train[0:5,:])
X_train =preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5,:])
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test[0:5,:])
X_test =preprocessor.transform(X_test) 
print('...Done.')
print(X_test[0:5,:])
print()

Performing preprocessings on train set...
[[6.17900000e+01 2.71100000e+00 1.89523128e+02 6.86800000e+00
  2.01000000e+03 9.00000000e+00 7.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.91700000e+01 3.59400000e+00 2.24019287e+02 5.42200000e+00
  2.01200000e+03 1.00000000e+01 1.90000000e+01 4.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.32600000e+01 3.78900000e+00 1.33958742e+02 7.77100000e+00
  2.01100000e+03 3.0000000

Model training

In [26]:
# Train model
print("Train model...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
print("...Done.")
#Le modèle détermine les meilleurs coefficients pr prédire Y . A ce moment là les coef sont figé pr pouvoir faire des prédictions 

Train model...
...Done.


Prédictions

In [27]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[ 636335.23233982  354479.23233982 1276463.23233982  869807.23233982
 1522991.23233982 1508271.23233982 1969711.23233982  638383.23233982
  910511.23233982 1119919.23233982 2117679.23233982  613039.23233982
 2133551.23233982  640431.23233982  488111.23233982  807727.23233982
  624815.23233982 1604271.23233982  194863.23233982  573871.23233982
 1855919.23233982 2104111.23233982 1130799.23233982 1457967.23233982
 2056623.23233982 1961135.23233982  440623.23233982 2040751.23233982
  984367.23233982 1604527.23233982 1997743.23233982 1656111.23233982
 1544751.23233982 1874095.23233982  341039.23233982  466607.23233982
  864175.23233982 1474863.23233982 2031535.23233982 2064175.23233982
  485807.23233982 1973935.23233982 1549231.23233982  405679.23233982
  265775.23233982  473775.23233982  415535.23233982 1808047.23233982
 1985583.23233982  409519.23233982 2077999.23233982 1871023.23233982
  835887.23233982 1557807.23233982  483759.23233982  374191.232

In [28]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[1575087.23233982  638127.23233982 1877423.23233982 1765295.23233982
  417455.23233982  232367.23233982 1247407.23233982 2097327.23233982
 2060079.23233982 1355183.23233982 1153455.23233982  968239.23233982
  439471.23233982  364207.23233982  482479.23233982 1376431.23233982
 2048559.23233982  518703.23233982]



In [29]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9863346233700726
R2 score on test set :  0.9261458137362162


In [30]:
regressor.coef_

array([ 8.06201186e+03, -5.28479555e+04,  9.39138396e+05,  4.73860993e+04,
       -2.43577026e+04, -6.85321826e+02, -4.39558426e+04, -1.65491223e+04,
       -4.18460141e+17, -3.46809477e+17, -4.18460141e+17, -3.02546182e+17,
       -3.46809477e+17, -3.02546182e+17, -3.84883159e+17, -3.46809477e+17,
       -3.02546182e+17, -3.02546182e+17, -1.77188560e+17,  1.24697570e+11,
       -4.18460141e+17, -3.46809477e+17, -3.02546182e+17, -2.48811542e+17,
       -3.02546182e+17, -3.46809477e+17, -3.46809477e+17, -3.46809477e+17])

In [31]:
coefs = pd.DataFrame( index=dataset.columns[1:29],data = regressor.coef_.transpose(), columns=["coefficients"])
coefs

Unnamed: 0,coefficients
Temperature,8062.012
Fuel_Price,-52847.96
CPI,939138.4
Unemployment,47386.1
year,-24357.7
month,-685.3218
day,-43955.84
dayofweek,-16549.12
Sotre_1.0,-4.184601e+17
Sotre_2.0,-3.468095e+17


In [32]:
coefs.apply(lambda x : abs(x)).sort_values(by="coefficients", ascending=False)

Unnamed: 0,coefficients
Sotre_3.0,4.184601e+17
Sotre_1.0,4.184601e+17
Sotre_13.0,4.184601e+17
Sotre_7.0,3.848832e+17
Sotre_5.0,3.468095e+17
Sotre_8.0,3.468095e+17
Sotre_2.0,3.468095e+17
Sotre_20.0,3.468095e+17
Sotre_14.0,3.468095e+17
Sotre_18.0,3.468095e+17


In [33]:
# Perform 5-fold cross-validation to evaluate the generalized R2 score obtained with a Ridge model
print("5-fold cross-validation...")
regressor = Ridge()
scores = cross_val_score(regressor, X_train, Y_train, cv=5)
print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())
# on va faire une validation croisé en laissant la valeur de alpha par  défaut 1, la VC nous permet juste de voir combien varie 
# le score en modifiant le test set . Je fais un premer train  test split et j'entarine mon modèle rdige, je fais un deuxième train test split j'entaine 
# mon modèle ridge je fais ça 5 fois je me retrouve avec une liste de 5 score validation après on calcule le score moyen et l'écart type sur ces scores 
# Je sais que maintenant sur un modèle de regression régularisé ridge sur ces data vont me donner un R² score d'environ 0.93 et que ça va varier de 0.07

5-fold cross-validation...
The cross-validated R2-score is :  0.9348903288351738
The standard deviation is :  0.07024519005078439


In [34]:
# Perform grid search
print("Grid search...")
regressor = Ridge()
# Grid of values to be tested
params = {
    'alpha': [0.0, 0.1,0.2,0.5, 1.0] # 0 corresponds to no regularization
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 5) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best R2 score : ", gridsearch.best_score_)
#dans notre cas le meilleur modèle est non régularisé 

Grid search...
...Done.
Best hyperparameters :  {'alpha': 0.1}
Best R2 score :  0.9399724051482421


In [35]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[ 620335.97296356  309403.92729337 1306863.44536843  872810.60715526
 1555291.0827927  1520689.3393035  1996886.33394813  623781.24043161
  909475.20561268 1075630.37936053 2130519.34567297  622176.28611263
 2101921.71090565  628434.01138872  494971.64336836  853056.43648582
  628919.19254436 1569333.64433264  234410.14408508  574786.08228052
 1904824.22938533 2094220.91120991 1142430.17060881 1468439.31819446
 2035127.38099079 1970777.82635563  385273.05758797 2055990.86717098
  984816.01940495 1609540.45224659 2026254.86956464 1665542.57247662
 1547456.8515523  1865510.05885405  371716.48694771  466382.30294176
  840528.53539449 1448669.58626302 2038652.20904495 2001116.54273861
  492154.53769134 1974385.05687767 1565911.2972906   401679.80209836
  294711.14382616  482542.68309695  431387.73676426 1763793.3371738
 2006707.91298359  379152.84477892 2065028.86656973 1916896.82980436
  813620.29766416 1551800.90495085  480918.23403255  358551.6512

In [36]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[1569789.46712244  630430.60662674 1775920.35969349 1768118.71584925
  396823.82159258  262568.04484524 1210562.80397291 2094161.87956313
 2075375.91855058 1375227.74315882 1130723.43698748  991845.03797375
  421097.92345462  402873.94504976  516569.35115762 1361408.37284943
 2059406.89813874  508268.84484293]



In [37]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9848891707402065
R2 score on test set :  0.9307070546185502


In [31]:
# sans le grid search le R² vallait 0.9348903288351738 après le grid search mon R² sur le train set s'est amélioré 0.9848891707402065
#vmais mon erreur généralisée n'est que de 0.9307070546185502

In [38]:
lasso1 = Lasso(alpha = 1)
lasso001 = Lasso(alpha = 0.01)
lasso00001 = Lasso(alpha = 0.0001)

In [39]:
lasso1.fit(X_train, Y_train)
lasso001.fit(X_train, Y_train)
lasso00001.fit(X_train, Y_train)


Objective did not converge. You might want to increase the number of iterations. Duality gap: 210653586753.1913, tolerance: 3009808860.5723906


Objective did not converge. You might want to increase the number of iterations. Duality gap: 211351880754.41736, tolerance: 3009808860.5723906


Objective did not converge. You might want to increase the number of iterations. Duality gap: 211358838035.44498, tolerance: 3009808860.5723906



Lasso(alpha=0.0001)

In [40]:
lasso1.score(X_train, Y_train),
lasso001.score(X_train, Y_train),
lasso00001.score(X_train,Y_train)

0.9859553265534599

In [41]:
lasso1.score(X_test, Y_test),
lasso001.score(X_test, Y_test),
lasso00001.score(X_test,Y_test)

0.9290277855532766