# Pipeline et modèle

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,PolynomialFeatures,FunctionTransformer,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

pd.set_option('display.max_columns', None)

import pickle

In [2]:
df = pd.read_csv("../data/AmesHousing.csv")

Il ne faut selectionner que 10 features, on supprime Neighborhood qui est trop complexe à traiter, ainsi que bsmt Qual et garage Finish qui sont trop proches d'autres variables

In [3]:
numeric_features = ["Year Built", "Total Bsmt SF", "1st Flr SF", "Gr Liv Area", "Garage Area", "Overall Qual", "Full Bath"]
ordinal_features = [ "Exter Qual",  "Kitchen Qual"]
cat_feature = ["Neighborhood"]
all_col = numeric_features.copy()
all_col.extend(ordinal_features)
all_col.extend(cat_feature)

X = df[all_col]
y = df[["SalePrice"]]

X["Total Bsmt SF"].fillna(1052, inplace=True)
X["Garage Area"].fillna(472, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Total Bsmt SF"].fillna(1052, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Garage Area"].fillna(472, inplace=True)


In [4]:
X.columns = ['Year_Built', 'Total_Bsmt_SF', '1st_Flr_SF', 'Gr_Liv_Area','Garage_Area', 'Overall_Qual', 'Full_Bath', 'Exter_Qual',
       'Kitchen_Qual', 'Neighborhood']

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
numeric_transformer = SimpleImputer()

exter_cat = [ 'Po', 'Fa','TA', 'Gd','Ex']
kitchen_cat = [ 'Po', 'Fa','TA', 'Gd',"Ex"]

ordinal_transformer = OrdinalEncoder(categories=[exter_cat, kitchen_cat])

categorical_transformer = OneHotEncoder()

In [None]:
# poly_transformer = StandardScaler()

# numeric_imputer = SimpleImputer() 
# numeric_transformer = Pipeline([
#     ('imputer', numeric_imputer),
#     ('poly', poly_transformer)
# ])

# exter_cat = [ 'Po', 'Fa','TA', 'Gd','Ex']
# kitchen_cat = [ 'Po', 'Fa','TA', 'Gd',"Ex"]

# ordinal_transformer = OrdinalEncoder(categories=[exter_cat, kitchen_cat])

# categorical_transformer = OneHotEncoder()

In [7]:
numeric_features = ["Year_Built", "Total_Bsmt_SF", "1st_Flr_SF", "Gr_Liv_Area", "Garage_Area", "Overall_Qual", "Full_Bath"]
ordinal_features = [ "Exter_Qual",  "Kitchen_Qual"]
cat_feature = ["Neighborhood"]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, cat_feature)
    ]
)

In [8]:
reg = LinearRegression()

In [9]:
pipe = Pipeline([
     ('preprocessor', preprocessor),
     ('reg', reg)
])

pipe.fit(X_train, y_train) 

In [10]:
pipe.score(X_test,y_test)

predict_train  = pipe.predict(X_train)
predict_test  = pipe.predict(X_test)

# Root Mean Squared Error on train and test date
print('MAE on train data: ', mean_absolute_error(y_train, predict_train))
print('MAE on test data: ',  mean_absolute_error(y_test, predict_test))

MAE on train data:  20289.33639741511
MAE on test data:  21324.92012401977


In [11]:
filename = '../real_estate_app/main_app/static/models/finalized_model.pkl'
pickle.dump(pipe, open(filename, 'wb'))

In [12]:
pipe[:-1].get_feature_names_out()

array(['num__Year_Built', 'num__Total_Bsmt_SF', 'num__1st_Flr_SF',
       'num__Gr_Liv_Area', 'num__Garage_Area', 'num__Overall_Qual',
       'num__Full_Bath', 'ord__Exter_Qual', 'ord__Kitchen_Qual',
       'cat__Neighborhood_Blmngtn', 'cat__Neighborhood_Blueste',
       'cat__Neighborhood_BrDale', 'cat__Neighborhood_BrkSide',
       'cat__Neighborhood_ClearCr', 'cat__Neighborhood_CollgCr',
       'cat__Neighborhood_Crawfor', 'cat__Neighborhood_Edwards',
       'cat__Neighborhood_Gilbert', 'cat__Neighborhood_Greens',
       'cat__Neighborhood_GrnHill', 'cat__Neighborhood_IDOTRR',
       'cat__Neighborhood_Landmrk', 'cat__Neighborhood_MeadowV',
       'cat__Neighborhood_Mitchel', 'cat__Neighborhood_NAmes',
       'cat__Neighborhood_NPkVill', 'cat__Neighborhood_NWAmes',
       'cat__Neighborhood_NoRidge', 'cat__Neighborhood_NridgHt',
       'cat__Neighborhood_OldTown', 'cat__Neighborhood_SWISU',
       'cat__Neighborhood_Sawyer', 'cat__Neighborhood_SawyerW',
       'cat__Neighborhood_Som

In [13]:
pipe[-1].coef_[0]

array([ 2.51531081e+02,  1.64097640e+01,  6.70358468e+00,  4.65307922e+01,
        3.65359717e+01,  1.30815790e+04, -4.22489357e+03,  8.61830196e+03,
        1.41005464e+04, -2.29019844e+04, -3.57892977e+04, -2.96299365e+04,
       -2.01198435e+03,  1.51793760e+04, -7.22868200e+03,  1.85227164e+04,
       -1.05616635e+04, -4.61249038e+03, -1.76428810e+04,  7.00093129e+04,
       -1.18955926e+04, -8.72451935e+03, -1.60721428e+04, -5.02270104e+03,
       -6.56589100e+03, -1.75121358e+04, -8.46259494e+03,  4.21959161e+04,
        3.26745822e+04, -1.73871011e+04, -1.11588342e+04, -3.85752339e+03,
       -1.41174641e+04, -3.11859739e+03,  5.44955130e+04,  1.07981065e+04,
        1.03984943e+04])

In [14]:
print(f"La valeur de l'intercept est : {pipe[-1].intercept_[0]}")

La valeur de l'intercept est : -554673.7178723412


In [15]:
pd.DataFrame(dict(zip(pipe[:-1].get_feature_names_out(), pipe[-1].coef_[0])),index=["Coef"]).T.sort_values(by="Coef",ascending=False)

Unnamed: 0,Coef
cat__Neighborhood_GrnHill,70009.312851
cat__Neighborhood_StoneBr,54495.512965
cat__Neighborhood_NoRidge,42195.91613
cat__Neighborhood_NridgHt,32674.582242
cat__Neighborhood_Crawfor,18522.716399
cat__Neighborhood_ClearCr,15179.375997
ord__Kitchen_Qual,14100.546362
num__Overall_Qual,13081.578961
cat__Neighborhood_Timber,10798.106539
cat__Neighborhood_Veenker,10398.494346


In [16]:
dico = {'Year_Built': 2000, 'Total_Bsmt_SF': 1, '1st_Flr_SF': 1, 'Gr_Liv_Area': 1, 'Garage_Area': 1, 'Overall_Qual': 1, 'Full_Bath': 1, 'Exter_Qual': 'Po', 'Kitchen_Qual': 'Po', 'Neighborhood': 'Gilbert'}

In [17]:
pipe.predict(pd.DataFrame(dico,index=[0]))

array([[-47261.18055516]])