In [11]:
# chargement des données préparées d'entrainement
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# chargement des données d'entrainement préparées
df_train = pd.read_csv('datasets/housing_train.csv')
df_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household,ocean_1,ocean_2,ocean_3,ocean_4,ocean_5,median_house_value
0,-1.308572,1.058769,0.828834,-1.188193,-1.4085,-0.823023,-0.245501,-1.261839,1.32211,-2.155521,0.0,0.0,0.0,1.0,0.0,237500.0
1,0.567871,-0.635401,1.048089,0.058527,-0.136146,0.127886,-0.193363,-0.146672,0.156685,-0.701974,1.0,0.0,0.0,0.0,0.0,250000.0
2,1.382643,-1.582079,-0.736547,1.393413,1.388602,1.56561,-0.393989,-0.216869,-0.088894,-0.287003,1.0,0.0,0.0,0.0,0.0,135700.0
3,-1.185122,0.890334,-0.243567,0.579701,0.367176,0.547581,0.833637,0.152921,-0.305451,-0.402009,1.0,0.0,0.0,0.0,0.0,287600.0
4,-0.17283,0.660399,1.406965,-0.671689,-0.045317,-0.356069,-1.907723,-0.961184,1.69304,0.888965,0.0,1.0,0.0,0.0,0.0,49600.0


In [12]:
# extraction de X_train et y_train
X_train = df_train.drop("median_house_value", axis=1) 
y_train = df_train["median_house_value"].to_numpy() 
print('X_train:', X_train.shape, '; y_train:', np.shape(y_train))

X_train: (16512, 15) ; y_train: (16512,)


In [16]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
print(SGDRegressor().get_params())
print(LogisticRegression().get_params())
print(KNeighborsRegressor().get_params())
print(GradientBoostingRegressor().get_params())
print(LinearRegression().get_params())
print(DecisionTreeRegressor().get_params())
print(RandomForestRegressor().get_params())
print(Ridge().get_params())
print(RidgeCV().get_params())
print(Lasso().get_params())

{'alpha': 0.0001, 'average': False, 'early_stopping': False, 'epsilon': 0.1, 'eta0': 0.01, 'fit_intercept': True, 'l1_ratio': 0.15, 'learning_rate': 'invscaling', 'loss': 'squared_loss', 'max_iter': 1000, 'n_iter_no_change': 5, 'penalty': 'l2', 'power_t': 0.25, 'random_state': None, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease

In [20]:
# K-fold cross-validation et GridSearchCV
pipelines = []
params = []
names = []

#
# ajouter LinearRegression
pipelines.append(Pipeline([('clf', LinearRegression())])) ### LinearRegression
params.append({'clf__normalize':[True]})
names.append('LinearRegression')

# ajouter DecisionTreeRegressor
pipelines.append(Pipeline([('clf', DecisionTreeRegressor())])) ## DecisionTreeRegressor
params.append({'clf__max_depth':np.linspace(5, 15, 5)})
names.append('DecisionTreeRegressor') 


pipelines.append(Pipeline([('clf', RandomForestRegressor())])) ## RandomForestRegressor
params.append({'clf__n_estimators': [50,100,200]})
names.append('RandomForestRegressor')

# ajouter SGDRegressor
pipelines.append(Pipeline([('clf', SGDRegressor())])) ### SGDRegressor
params.append({'clf__average': [True]})
names.append('SGDRegressor')

# ajouter KNeighborsRegressor
pipelines.append(Pipeline([('clf', KNeighborsRegressor())])) ### KNeighborsRegressor
params.append({'clf__n_neighbors':np.array([10])})
names.append('KNeighborsRegressor')

# ajouter GradientBoostingRegressor
pipelines.append(Pipeline([('clf', GradientBoostingRegressor())])) ### GradientBoostingRegressor
params.append({'clf__n_estimators':[50,100,200]})
names.append('GradientBoostingRegressor')

# ajouter Ridge
pipelines.append(Pipeline([('clf', Ridge())])) ### Ridge
params.append({'clf__alpha':[0.1]})
names.append('Ridge')

# ajouter RidgeCV
pipelines.append(Pipeline([('clf', RidgeCV())])) ### RidgeCV
params.append({'clf__alphas':np.logspace(-6, 6, 13)})
names.append('RidgeCV')

# ajouter Lasso
pipelines.append(Pipeline([('clf', Lasso())])) ### Lasso
params.append({'clf__alpha':[0.1]})
names.append('Lasso')



In [21]:
# l'enraînement avec cross-validation

#n_jobs = -1 signifie que le calcul sera distribué sur tous les CPU de l'ordinateur.

from sklearn.model_selection import KFold, GridSearchCV, cross_val_score

def model(pipeline, parameters, name, X, y):    
    cv = KFold(n_splits=5, shuffle=True, random_state=32)
    grid_obj = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=cv, scoring='r2', n_jobs=-1)
    grid_obj.fit(X,y)  
    print(name, 'R2:', grid_obj.best_score_)
    estimator = grid_obj.best_estimator_
    estimator.fit(X,y) # training sur tout training dataset
    return estimator 
estimators = []
for i in range(len(pipelines)):    
    estimators.append(model(pipelines[i], params[i], names[i], X_train, y_train))

LinearRegression R2: 0.6189073881937721
DecisionTreeRegressor R2: 0.7173801873943024
RandomForestRegressor R2: 0.8085316704082766
SGDRegressor R2: 0.6171302090515194
KNeighborsRegressor R2: 0.7450882414985346
GradientBoostingRegressor R2: 0.8083450492918451
Ridge R2: 0.6188805228855699


RuntimeError: Cannot clone object RidgeCV(alphas=1e-06), as the constructor either does not set or modifies parameter alphas

In [10]:
from sklearn.metrics import mean_squared_error, r2_score 

# chargement des données d'entrainement préparées
df_test = pd.read_csv('datasets/housing_test.csv')

# extraction de X_test et y_test
X_test= df_test.drop("median_house_value", axis=1) 
y_test = df_test["median_house_value"].to_numpy() 
print('X_test:', X_test.shape, '; y_test:', np.shape(y_test))

# Evaluation

for i, estimator in enumerate(estimators):
    print('\nPerformance :', names[i])
    y_pred = estimator.predict(X_test)
    print('\n mean_squared_error :', mean_squared_error(y_test, y_pred))    
    print('\n r2_score :', r2_score(y_test, y_pred))

X_test: (4128, 15) ; y_test: (4128,)


In [28]:
# Serialize final models 
import joblib  
for i, estimator in enumerate(estimators):
    joblib.dump(estimator, names[i]+".pkl")


# chargement du modèle linear regression
# model = joblib.load(names[0]+"pkl")