# Pipeline et modèle

In [221]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler, PolynomialFeatures, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedShuffleSplit, ShuffleSplit
from sklearn.model_selection import train_test_split, cross_val_predict

import pickle

In [222]:
# Load the raw dataset; the path is resolved relative to the current working directory.
df = pd.read_csv("./data.csv")

In [223]:
# Cast the construction year to str so it is handled as a categorical value
# (the column is listed in `cat_var` and one-hot encoded by the pipeline).
# NOTE(review): depending on the pandas version, astype(str) can turn missing
# values into the literal string 'nan' — confirm this column has no NaN.
df['f_annee_construction'] = df['f_annee_construction'].astype(str)

In [224]:
# Features are every column except the target; the target is the "f_prix" column.
X = df.drop(columns="f_prix")
y = df["f_prix"]

In [225]:
# Columns fed to the model, grouped by the preprocessing they receive.
num_var = [
    "f_cheminee",
    "f_surface_habitable",
    "f_nb_places",
    "f_surface_sous_sol",
    "f_surface_1er_etage",
    "f_nb_salle_bain",
]
cat_var = [
    "f_qual",
    "f_annee_construction",
    "f_CuisineQual",
    "f_Quartier",
]

# 90 % / 10 % shuffled split with a fixed seed, stratified on the
# construction-year column of `df`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.90,
    shuffle=True,
    random_state=42,
    stratify=df['f_annee_construction'],
)

# Categorical columns are one-hot encoded (categories unseen during fit are
# ignored at transform time); numeric columns are standardised.
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()

encoder = ColumnTransformer(transformers=[
    ('cat', cat_transformer, cat_var),
    ('num', num_transformer, num_var),
])

# Encoding -> degree-2 polynomial expansion -> Lasso regression.
pipeline = Pipeline(steps=[
    ('enc', encoder),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Lasso(alpha=205.43354335433543)),
])

In [226]:
# Fit on the training split, then display the score on the held-out split.
# Pipeline.fit returns the pipeline itself, so the two calls can be chained.
pipeline.fit(X_train, y_train).score(X_test, y_test)

0.8878016367226198

In [227]:
# Predictions of `pipeline` on the training and test splits.
predict_train  = pipeline.predict(X_train)
predict_test  = pipeline.predict(X_test)

# Mean Absolute Error on train and test data
print('MAE on train data: ', mean_absolute_error(y_train, predict_train))
print('MAE on test data: ',  mean_absolute_error(y_test, predict_test))

MAE on train data:  16878.931025280774
MAE on test data:  18122.14329803069


In [228]:
# Disabled experiment, kept for reference: a randomized hyper-parameter search
# over Lasso, Ridge, ElasticNet and LinearRegression, each wrapped in the same
# encoder + PolynomialFeatures pipeline. It samples `poly__degree` (plus
# `model__alpha` for every model except LinearRegression) with 5-fold CV and
# stores, per model, the best parameters, the best CV score ('score_train')
# and the score on the test split ('score_test').
# NOTE(review): the alpha grid below spans 0.1-100, so the alpha hard-coded in
# the active pipeline (~205.43) was presumably obtained from a different run —
# confirm before re-enabling.
# NOTE(review): if re-enabled as-is, the loop rebinds the global `pipeline`.

# import numpy as np

# models = [Lasso(tol=0.01), Ridge(), ElasticNet(), LinearRegression()]

# result = {}
# for model in models:
#     pipeline = Pipeline([
#         ('enc', encoder),
#         ('poly', PolynomialFeatures(degree=2)),
#         ('model', model),
#     ])
#     param_grid = {
#         "poly__degree":[1,2],
#         "model__alpha":np.linspace(0.1, 100.0, num=1000),
#     }
#     if str(model) == "LinearRegression()":
#         param_grid = {
#             "poly__degree":[1,2]
#         }
      
#     grid_search = RandomizedSearchCV(pipeline, param_grid, cv = 5, n_iter=1000)
#     grid_search.fit(X_train, y_train)
#     result[f'{str(model)}'] = grid_search.best_params_
#     result[f'{str(model)}']['score_train'] = grid_search.best_score_
#     result[f'{str(model)}']['score_test'] = grid_search.score(X_test, y_test)
#     print(str(model) + " : " + str(result[str(model)]))