# Model

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Cleaning" data-toc-modified-id="Cleaning-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Cleaning</a></span></li><li><span><a href="#Pipeline" data-toc-modified-id="Pipeline-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Pipeline</a></span><ul class="toc-item"><li><span><a href="#Categorical-features" data-toc-modified-id="Categorical-features-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Categorical features</a></span></li><li><span><a href="#Binary-features" data-toc-modified-id="Binary-features-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Binary features</a></span></li><li><span><a href="#Numerical-features" data-toc-modified-id="Numerical-features-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Numerical features</a></span></li><li><span><a href="#preprocessing-pipe" data-toc-modified-id="preprocessing-pipe-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>preprocessing pipe</a></span></li><li><span><a href="#regression-model" data-toc-modified-id="regression-model-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>regression model</a></span></li></ul></li><li><span><a href="#Training-and-evaluation" data-toc-modified-id="Training-and-evaluation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Training and evaluation</a></span></li></ul></div>

In [None]:
import numpy as np
import seaborn as sns
from pandas import DataFrame, read_csv
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (StandardScaler, LabelEncoder,
                                   OneHotEncoder, OrdinalEncoder, FunctionTransformer,
                                   PowerTransformer)

In [None]:
# NOTE(review): disabled shell command. It looks like it regenerates
# ./fulltrain.csv with the project's scrapper script -- confirm before
# re-enabling it (remove the leading '#') to refresh the data.
#!../scrapper/scrapper.py csv --file ./fulltrain.csv 

In [None]:
# Load fulltrain.csv (path relative to the working directory) into df_full.
df_full = read_csv(filepath_or_buffer='fulltrain.csv')

## Cleaning

In [None]:
# Show the column labels of the loaded frame before any cleaning is applied.
df_full.columns

In [None]:
# The code 0 in `idtypechauffage` is treated as "missing" and replaced by NaN.
# Series.mask builds a new, correctly typed column instead of writing NaN in
# place: if the column was parsed as integers, the in-place `.loc` assignment
# is the incompatible-dtype write that pandas >= 2.1 deprecates.
chauffageNArows = df_full['idtypechauffage'] == 0
df_full['idtypechauffage'] = df_full['idtypechauffage'].mask(chauffageNArows, np.nan)

In [None]:
# Rows whose postal code was recorded as the bare value 33 are rewritten
# to 33000.
codpostal33rows = df_full['codepostal'].eq(33)
df_full.loc[codpostal33rows, 'codepostal'] = 33_000

In [None]:
# The code 0 in `idtypecuisine` is treated as "missing" and replaced by NaN.
# Series.mask builds a new, correctly typed column instead of writing NaN in
# place: if the column was parsed as integers, the in-place `.loc` assignment
# is the incompatible-dtype write that pandas >= 2.1 deprecates.
cuisineNArows = df_full['idtypecuisine'] == 0
df_full['idtypecuisine'] = df_full['idtypecuisine'].mask(cuisineNArows, np.nan)

In [None]:
# `surface` uses a decimal comma in the CSV; convert it to float.
# astype(str) first makes the cell safe to re-run (the column is already
# float the second time, and `.str` would raise on it) and also covers a
# file in which the column was parsed as numeric. Missing values become the
# text 'nan', which converts back to NaN.
# regex=False: ',' is a literal character, not a pattern, and the default of
# `regex` differs between pandas versions.
df_full['surface'] = (
    df_full['surface']
    .astype(str)
    .str.replace(",", ".", regex=False)
    .astype(float)
)

## Pipeline

In [None]:
# Column groups, one per preprocessing branch.
# Columns that go through imputation + one-hot encoding.
categoricals = [
    'typedebien',
    'ville',
    'idtypechauffage',
    'idtypecuisine',
    'codepostal',
    'codeinsee',
]
# Flag columns that are imputed and otherwise passed through unchanged.
binaries = [
    'si_balcon',
    'si_sdbain',
    'si_sdEau',
]
# Columns that are imputed and standardised.
numericals = [
    'nb_chambres',
    'nb_pieces',
    'nb_photos',
    'etage',
    'surface',
    'dpeC',
]
# Free-text column; not referenced by the preprocessing steps visible in
# this notebook.
texte = [
    'description',
]

### Categorical features

In [None]:
# Print the distinct values of every categorical column, one array per column.
for column_name in categoricals:
    distinct_values = df_full[column_name].unique()
    print(distinct_values)

In [None]:
# Categorical branch: fill gaps with the most frequent value of the column,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown="ignore")),
    ]
)

### Binary features

In [None]:
# Print the distinct values of every binary flag column, one array per column.
for flag_name in binaries:
    flag_values = df_full[flag_name].unique()
    print(flag_values)

In [None]:
# Binary branch: only fill gaps with the most frequent value; the flags are
# neither scaled nor encoded.
binary_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
    ]
)

### Numerical features

In [None]:
# Numerical branch: fill gaps with the most frequent value of the column,
# then standardise.
# NOTE(review): 'most_frequent' (the mode) is an unusual choice for
# continuous columns such as `surface`; 'median' may be what was intended --
# confirm before changing, since it alters the fitted model.
numerical_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('scaler', StandardScaler()),
    ]
)

### preprocessing pipe

In [None]:
# Route each column group to its own branch. Columns that belong to none of
# the three groups are dropped (remainder='drop' is the ColumnTransformer
# default, spelled out here).
preprocess_pipe = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipe, categoricals),
        ('num', numerical_pipe, numericals),
        ('ord', binary_pipe, binaries),
    ],
    remainder='drop',
)

### regression model

In [None]:
# L2-regularised linear regression; alpha=1.0 is the library default,
# spelled out so the regularisation strength is visible.
regressor = Ridge(alpha=1.0)

In [None]:
# Full model: preprocessing followed by the regressor, so both are fitted
# together on whatever data is passed to `model.fit`.
model = Pipeline(
    steps=[
        ('pre', preprocess_pipe),
        ('reg', regressor),
    ]
)

In [None]:
# Separate the target from the features, then hold out 20 % of the rows for
# validation. The fixed random_state makes the split reproducible.
target_column = "prix"

X_fulltrain = df_full.drop(columns=target_column)
y_fulltrain = df_full[target_column]

X_train, X_valid, y_train, y_valid = train_test_split(
    X_fulltrain, y_fulltrain, test_size=0.2, random_state=42
)

## Training and evaluation

In [None]:
# Fit preprocessing and regressor on the training split; the trailing ';'
# suppresses the estimator repr in the cell output.
model.fit(X=X_train, y=y_train);

In [None]:
# Score the fitted model on the held-out validation split.
y_valid_pred = model.predict(X_valid)
r2 = r2_score(y_valid, y_valid_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
print(f'r2={r2}\nrmse={rmse}')

# Predicted vs. actual target values. x and y are passed by keyword:
# seaborn >= 0.12 no longer accepts them positionally.
ax = sns.scatterplot(x=y_valid, y=y_valid_pred)
ax.set(xlabel='actual prix', ylabel='predicted prix',
       title='Validation set: predicted vs. actual');