# Model

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Cleaning" data-toc-modified-id="Cleaning-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Cleaning</a></span></li><li><span><a href="#Pipeline" data-toc-modified-id="Pipeline-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Pipeline</a></span><ul class="toc-item"><li><span><a href="#Categorical-features" data-toc-modified-id="Categorical-features-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Categorical features</a></span></li><li><span><a href="#Binary-features" data-toc-modified-id="Binary-features-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Binary features</a></span></li><li><span><a href="#Numerical-features" data-toc-modified-id="Numerical-features-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Numerical features</a></span></li><li><span><a href="#preprocessing-pipe" data-toc-modified-id="preprocessing-pipe-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>preprocessing pipe</a></span></li><li><span><a href="#output-processing" data-toc-modified-id="output-processing-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>output processing</a></span></li><li><span><a href="#regression-model" data-toc-modified-id="regression-model-2.6"><span class="toc-item-num">2.6&nbsp;&nbsp;</span>regression model</a></span></li></ul></li><li><span><a href="#Training-and-evaluation" data-toc-modified-id="Training-and-evaluation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Training and evaluation</a></span><ul class="toc-item"><li><span><a href="#Mean-regressor-baseline" data-toc-modified-id="Mean-regressor-baseline-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Mean regressor baseline</a></span></li><li><span><a href="#Cross-validation" data-toc-modified-id="Cross-validation-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Cross validation</a></span></li><li><span><a href="#Valid-evaluation" data-toc-modified-id="Valid-evaluation-3.3"><span 
class="toc-item-num">3.3&nbsp;&nbsp;</span>Valid evaluation</a></span></li><li><span><a href="#Train-evaluation" data-toc-modified-id="Train-evaluation-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Train evaluation</a></span></li><li><span><a href="#Error-analysis" data-toc-modified-id="Error-analysis-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span>Error analysis</a></span><ul class="toc-item"><li><span><a href="#over-estimated" data-toc-modified-id="over-estimated-3.5.1"><span class="toc-item-num">3.5.1&nbsp;&nbsp;</span>over estimated</a></span></li><li><span><a href="#under-estimated" data-toc-modified-id="under-estimated-3.5.2"><span class="toc-item-num">3.5.2&nbsp;&nbsp;</span>under estimated</a></span></li></ul></li></ul></li></ul></div>

In [None]:
from datetime import datetime
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from joblib import dump, load
from pandas import DataFrame, read_csv
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (StandardScaler, LabelEncoder, PolynomialFeatures,
                                   OneHotEncoder, OrdinalEncoder, FunctionTransformer,
                                   PowerTransformer)
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [None]:
# Random seed for the notebook; passed as `random_state` to the train/validation split.
SEED=42

In [None]:
# Run the project's scrapper script with its `csv` sub-command.
# Presumably this (re)writes ./fulltrain.csv, which the next cell reads — TODO confirm.
!../scrapper/scrapper.py csv --file ./fulltrain.csv

In [None]:
# Load the listings; the `idannonce` column becomes the DataFrame index.
df_full = read_csv('fulltrain.csv', index_col='idannonce')

## Cleaning

In [None]:
# Columns available before any cleaning is applied.
df_full.columns

In [None]:
# A heating-type code of 0 is treated as "not provided": turn it into a real
# missing value so the imputer in the categorical pipeline can handle it.
chauffageNArows = df_full['idtypechauffage'].eq(0)
df_full['idtypechauffage'] = df_full['idtypechauffage'].mask(chauffageNArows)

In [None]:
# Some listings carry the truncated postcode 33; normalise it to 33000.
codpostal33rows = df_full['codepostal'].eq(33)
df_full['codepostal'] = df_full['codepostal'].mask(codpostal33rows, 33000)

In [None]:
# A kitchen-type code of 0 is treated as "not provided": replace it with NaN
# so it is imputed later instead of being encoded as its own category.
cuisineNArows = df_full['idtypecuisine'].eq(0)
df_full['idtypecuisine'] = df_full['idtypecuisine'].mask(cuisineNArows)

In [None]:
# Surfaces are written with a decimal comma (e.g. "45,5"); convert them to floats.
# Casting to `str` first keeps the cell re-runnable: on a second run the column is
# already float and the `.str` accessor alone would raise. `regex=False` makes the
# comma replacement a literal one regardless of the pandas version's default.
df_full['surface'] = (
    df_full['surface']
    .astype(str)
    .str.replace(",", ".", regex=False)
    .astype(float)
)

In [None]:
# Discard listings whose surface is exactly 0 (rows with a missing surface are kept).
nonzero_surface = df_full['surface'].ne(0)
df_full = df_full[nonzero_surface]

In [None]:
# Remove flat-share ("colocation") listings, detected from the description text.
#
# * `na=False` makes `str.contains` return a plain boolean mask, so listings with a
#   missing description are kept instead of being coerced to True and dropped.
# * The pattern no longer uses a capturing group, which pandas warns about in
#   `str.contains`.
# * No cast through the non-portable 'Bool' dtype name is needed any more.
is_colocation = df_full['description'].str.contains(r"[Cc]oloc", na=False)
notcolocation_rows = ~is_colocation
df_full = df_full.loc[notcolocation_rows,:]

In [None]:
# Columns that are not fed to the model are removed from the frame.
unused_columns = ['id','ville', 'codeinsee','nb_photos', 'dpeL', 'description']
df_full = df_full.drop(columns=unused_columns)

In [None]:
# Columns remaining once the unused ones have been dropped.
df_full.columns

In [None]:
# (rows, columns) of the cleaned dataset.
df_full.shape

In [None]:
#df_full.to_csv('dataset_clean.csv', header=True, index_label=id)

## Pipeline

In [None]:
# Column groups consumed by the preprocessing ColumnTransformer.
# The commented-out lists are earlier variants that still included columns
# which the cleaning section now drops (ville, codeinsee, nb_photos).
#categoricals = ['typedebien', 'ville','idtypechauffage', 'idtypecuisine','codepostal','codeinsee']
categoricals = ['typedebien', 'idtypechauffage', 'idtypecuisine', 'codepostal']
binaries = ['si_balcon','si_sdbain','si_sdEau']
#numericals = ['nb_chambres', 'nb_pieces', 'nb_photos', 'etage', 'surface', 'dpeC']
numericals = ['nb_chambres', 'nb_pieces', 'etage', 'surface', 'dpeC']
# NOTE(review): `text` is not referenced by any pipeline in this notebook, and the
# `description` column is dropped during cleaning — confirm whether it is still needed.
text = ['description']

### Categorical features

In [None]:
# Print the distinct values of every categorical column to spot odd codes.
for _, column_values in df_full[categoricals].items():
    print(column_values.unique())

In [None]:
# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# `handle_unknown="ignore"` asks the encoder to ignore categories it did not see
# during fit instead of raising at prediction time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipe = Pipeline(steps=categorical_steps)

### Binary features

In [None]:
# Print the distinct values of every binary flag column.
for _, column_values in df_full[binaries].items():
    print(column_values.unique())

In [None]:
# Binary flags only need their gaps filled (most frequent value); no encoding step.
binary_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
]
binary_pipe = Pipeline(steps=binary_steps)

### Numerical features

In [None]:
# Numerical columns: mean imputation followed by a power transform.
# Polynomial features (degree 2) and a StandardScaler were tried as extra steps
# and are currently disabled.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('power', PowerTransformer()),
]
numerical_pipe = Pipeline(steps=numerical_steps)

### preprocessing pipe

In [None]:
# Route each column group to its dedicated sub-pipeline.
# Note that the binary pipeline is registered under the name 'ord'.
column_routes = [
    ('cat', categorical_pipe, categoricals),
    ('num', numerical_pipe, numericals),
    ('ord', binary_pipe, binaries),
]
preprocess_pipe = ColumnTransformer(transformers=column_routes)

### output processing

In [None]:
# Target transformation: natural log (inverted with exp), then standardisation.
log_target = FunctionTransformer(func=np.log, inverse_func=np.exp)
output_pipe = Pipeline(steps=[
    ('log', log_target),
    ('scaler', StandardScaler()),
])

### regression model

In [None]:
# Candidate regressors. Only one of them is plugged into `model` (currently the SVR);
# the others are kept so the estimator can be swapped easily.
rdgRegressor = Ridge()
xgbRegressor = XGBRegressor(booster="gbtree")
svrRegressor = SVR(kernel='rbf', C=0.8)

In [None]:
# Feature preprocessing followed by the selected regressor (RBF-kernel SVR).
model_steps = [
    ('pre', preprocess_pipe),
    ('reg', svrRegressor),
]
model = Pipeline(steps=model_steps)

In [None]:
# Wrap the model so the target is transformed with `output_pipe` before fitting;
# predictions are mapped back to the original price scale.
full_pipe = TransformedTargetRegressor(regressor=model, transformer=output_pipe)

In [None]:
# Persist the pipeline object to disk.
# NOTE(review): `full_pipe` has not been fitted at this point in the notebook, so this
# file holds the untrained pipeline definition only — confirm that this is intended.
dump(full_pipe,'pipeline-model.joblib')

In [None]:
# Separate the features from the target and hold out 20 % of the listings for
# validation. `random_state=SEED` keeps the split reproducible across runs.
# (`train_test_split` is imported in the notebook's import cell.)
target_column = "prix"

X_fulltrain = df_full.drop(target_column, axis=1)
y_fulltrain = df_full[target_column]

X_train, X_valid, y_train, y_valid = train_test_split(
    X_fulltrain, y_fulltrain, test_size=0.2, random_state=SEED
)

## Training and evaluation

In [None]:
# Fit preprocessing, target transform and regressor on the training split.
# The trailing `;` suppresses the estimator repr in the cell output.
full_pipe.fit(X_train,y_train);

### Mean regressor baseline

In [None]:
# Baseline: predict the mean training price for every validation listing.
# The predictions live in their own variable so they can never be mistaken for the
# model's `y_valid_pred` used by the evaluation and error-analysis cells.
y_baseline_pred = np.full(y_valid.shape[0], np.mean(y_train))

r2 = r2_score(y_valid, y_baseline_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_baseline_pred))
# `msle` is the MSE of the natural logs (np.log), computed once and reused for rmsle.
msle = mean_squared_error(np.log(y_valid), np.log(y_baseline_pred))
rmsle = np.sqrt(msle)
mape = np.mean(np.abs((y_valid - y_baseline_pred) / y_valid))
print(f'r2 = {r2}\nrmse = {rmse}\nmsle = {msle}\nrmsle = {rmsle}\nmape = {mape}')

### Cross validation

In [None]:
# 10-fold cross-validation of the whole pipeline on the full dataset.
# No `scoring` is passed, so the estimator's own score is used (reported as R2 below).
scores = cross_val_score(full_pipe, X_fulltrain, y=y_fulltrain, cv=10)
print(f'mean R2 = {np.mean(scores)} +/- {np.std(scores)}')
# Display the individual fold scores as the cell output.
scores

### Valid evaluation

In [None]:
# Score the fitted pipeline on the hold-out validation split.
y_valid_pred = full_pipe.predict(X_valid)

r2 = r2_score(y_valid, y_valid_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
# `msle` is the MSE of the natural logs (np.log), computed once and reused for rmsle.
msle = mean_squared_error(np.log(y_valid), np.log(y_valid_pred))
rmsle = np.sqrt(msle)
mape = np.mean(np.abs((y_valid - y_valid_pred) / y_valid))
print(f'r2 = {r2}\nrmse = {rmse}\nmsle = {msle}\nrmsle = {rmsle}\nmape = {mape}')

# Actual vs. predicted price. The data is passed as x=/y= keywords because recent
# seaborn releases no longer accept it positionally.
grid = sns.JointGrid(x=y_valid, y=y_valid_pred)
grid = grid.plot(sns.regplot, sns.distplot)
grid.set_axis_labels('prix', 'predicted prix')
# Red diagonal = perfect prediction.
grid.ax_joint.plot([0,2500], [0,2500], 'r');

### Train evaluation

In [None]:
# Score the fitted pipeline on the training split (to compare with validation
# and gauge over-fitting).
y_train_pred = full_pipe.predict(X_train)

r2 = r2_score(y_train, y_train_pred)
rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
# `msle` is the MSE of the natural logs (np.log), computed once and reused for rmsle.
msle = mean_squared_error(np.log(y_train), np.log(y_train_pred))
rmsle = np.sqrt(msle)
mape = np.mean(np.abs((y_train - y_train_pred) / y_train))
print(f'r2 = {r2}\nrmse = {rmse}\nmsle = {msle}\nrmsle = {rmsle}\nmape = {mape}')

# Actual vs. predicted price. The data is passed as x=/y= keywords because recent
# seaborn releases no longer accept it positionally.
grid = sns.JointGrid(x=y_train, y=y_train_pred)
grid = grid.plot(sns.regplot, sns.distplot)
grid.set_axis_labels('prix', 'predicted prix')
# Red diagonal = perfect prediction.
grid.ax_joint.plot([0,2500], [0,2500], 'r');

### Error analysis

In [None]:
# Relative prediction error on the validation split, plotted against the actual price.
# Positive values are over-estimates, negative values under-estimates.
# The data is passed as x=/y= keywords because recent seaborn releases no longer
# accept it positionally.
grid = sns.JointGrid(x=y_valid, y=(y_valid_pred - y_valid) / y_valid)
grid = grid.plot(sns.regplot, sns.distplot)
grid.set_axis_labels('prix', 'relative error')
# Red horizontal line = zero error.
grid.ax_joint.plot([0,2500], [0,0], 'r');

#### over estimated

In [None]:
# Validation listings whose predicted price exceeds the actual price by more than 50 %.
errors = ((y_valid_pred - y_valid) / y_valid) > 0.5

# Build the table with `.assign` on the selected rows: this creates a new frame rather
# than writing through `.loc` into a slice of `X_valid` (SettingWithCopyWarning).
# `y_valid_pred` is a positional array, hence the plain boolean array used to index it.
error_df = X_valid.loc[errors].assign(
    prix=y_valid.loc[errors],
    pred=y_valid_pred[errors.values],
)
error_df

#### under estimated

In [None]:
# Validation listings whose predicted price is more than 40 % below the actual price.
errors = ((y_valid_pred - y_valid) / y_valid) < -0.4

# Build the table with `.assign` on the selected rows: this creates a new frame rather
# than writing through `.loc` into a slice of `X_valid` (SettingWithCopyWarning).
# `y_valid_pred` is a positional array, hence the plain boolean array used to index it.
error_df = X_valid.loc[errors].assign(
    prix=y_valid.loc[errors],
    pred=y_valid_pred[errors.values],
)
error_df

In [None]:
# Build a timestamped file name for the trained model, e.g.
# realestate-model-2020-01-31_12:34:56.789012.pkl
model_dir = './'
now_str = datetime.now().isoformat(sep='_')
model_file_name = f'realestate-model-{now_str}.pkl'
# Show the resulting path as the cell output.
model_dir + model_file_name

In [None]:
# Save the fitted pipeline under the timestamped name, inside `model_dir`.
# Using the full path keeps the save location in sync with `model_dir` if it is changed.
dump(full_pipe, model_dir + model_file_name)

In [None]:
# Most recent saved model: the timestamp in the file name sorts chronologically,
# so the lexicographic maximum is the latest file.
# (`glob` is imported in the notebook's import cell.)
max(glob('realestate-model-*.pkl'))