In [147]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_validate

## Baseline model

### Dummy

In [3]:
data_baseline=pd.read_csv('data_baseline.csv')

In [4]:
data_baseline.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,-117.8,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,-120.19,36.6,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,-118.32,34.1,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND


In [50]:
X = data_baseline.drop(['median_house_value'],axis=1)
y=data_baseline['median_house_value']

In [61]:
dummy = DummyRegressor(strategy='mean')
dummy.fit(X, y)

scores = cross_validate(dummy, X, y, cv=5, scoring=('r2', 'neg_root_mean_squared_error','neg_mean_absolute_error'))

average_r2 = scores['test_r2'].mean()
average_rmse = -scores['test_neg_root_mean_squared_error'].mean()
average_mae = -scores['test_neg_mean_absolute_error'].mean()
print("Average R2: ", average_r2, "Average RMSE: ", average_rmse, "Average MAE:", average_mae)

Average R2:  -0.00031332179918077687 Average RMSE:  115262.44901589816 Average MAE: 91120.04679548813


### Linear Regression

In [62]:
X = data_baseline.drop(['median_house_value','ocean_proximity'],axis=1)

In [63]:
reg = LinearRegression().fit(X, y)

scores = cross_validate(reg, X, y, cv=5, scoring=('r2', 'neg_root_mean_squared_error','neg_mean_absolute_error'))

average_r2 = scores['test_r2'].mean()
average_rmse = -scores['test_neg_root_mean_squared_error'].mean()
average_mae = -scores['test_neg_mean_absolute_error'].mean()
print("Average R2: ", average_r2, "Average RMSE: ", average_rmse, "Average MAE:", average_mae)

Average R2:  0.6363041307108181 Average RMSE:  69494.4332387663 Average MAE: 50700.8826433365


## Iteration 1

In [64]:
data_iter1=pd.read_csv('data_iter1.csv')

In [65]:
X = data_iter1.drop(['median_house_value'],axis=1)
y=data_iter1['median_house_value']

In [68]:
reg = LinearRegression().fit(X, y)

scores = cross_validate(reg, X, y, cv=5, scoring=('r2', 'neg_root_mean_squared_error','neg_mean_absolute_error'))

average_r2 = scores['test_r2'].mean()
average_rmse = -scores['test_neg_root_mean_squared_error'].mean()
average_mae = -scores['test_neg_mean_absolute_error'].mean()
print("Average R2: ", average_r2, "// Average RMSE: ", average_rmse, "// Average MAE:", average_mae)

Average R2:  0.6321248967054596 Average RMSE:  69870.59689546263 Average MAE: 51151.91355804453


In [69]:
#On remarque que le modèle a de moins bonnes performances. Il semblerait que retirer la colonne bedrooms ait impacté negativement
#Le modèle. Il faudra donc voir une autre façon de traiter cette colonne.

## Iteration 2

In [121]:
data_iter2=pd.read_csv('data_iter2.csv')

In [122]:
X = data_iter2.drop(['median_house_value'],axis=1)
y=data_iter2['median_house_value']

In [123]:
reg = LinearRegression().fit(X, y)

scores = cross_validate(reg, X, y, cv=5, scoring=('r2', 'neg_root_mean_squared_error','neg_mean_absolute_error'))

average_r2 = scores['test_r2'].mean()
average_rmse = -scores['test_neg_root_mean_squared_error'].mean()
average_mae = -scores['test_neg_mean_absolute_error'].mean()
print("Average R2: ", average_r2, "// Average RMSE: ", average_rmse, "// Average MAE:", average_mae)

Average R2:  0.6423123790933041 // Average RMSE:  68896.15761606033 // Average MAE: 50068.734467461225


In [84]:
#Le modèle gagne en performance après l'encodage de la variable qualitative

## Iteration 3

In [102]:
data_iter3=pd.read_csv('data_iter3.csv')

In [103]:
X = data_iter3.drop(['median_house_value'],axis=1)
y=data_iter3['median_house_value']

In [108]:
reg = LinearRegression().fit(X, y)

scores = cross_validate(reg, X, y, cv=5, scoring=('r2', 'neg_root_mean_squared_error','neg_mean_absolute_error'))

average_r2 = scores['test_r2'].mean()
average_rmse = -scores['test_neg_root_mean_squared_error'].mean()
average_mae = -scores['test_neg_mean_absolute_error'].mean()
print("Average R2: ", average_r2, "// Average RMSE: ", average_rmse, "// Average MAE:", average_mae)

Average R2:  0.6421881685556471 // Average RMSE:  68907.95688603455 // Average MAE: 50106.793836737634


In [107]:
#On obtient des résultats quasi-similaires, très légèrement en dessous de l'itération précédente.
#C'est peut-être donc une mauvaise piste

## Iteration 4

In [109]:
data_iter4=pd.read_csv('data_iter4.csv')

In [110]:
X = data_iter4.drop(['median_house_value'],axis=1)
y=data_iter4['median_house_value']

In [111]:
reg = LinearRegression().fit(X, y)

scores = cross_validate(reg, X, y, cv=5, scoring=('r2', 'neg_root_mean_squared_error','neg_mean_absolute_error'))

average_r2 = scores['test_r2'].mean()
average_rmse = -scores['test_neg_root_mean_squared_error'].mean()
average_mae = -scores['test_neg_mean_absolute_error'].mean()
print("Average R2: ", average_r2, "// Average RMSE: ", average_rmse, "// Average MAE:", average_mae)

Average R2:  0.6167969157984367 // Average RMSE:  59162.231002672146 // Average MAE: 43411.20537170415


In [107]:
#On obtient des résultats bien différents. Le R2 a beaucoup diminué, ce qui est une mauvaise nouvelle,
#Mais les RMSE et MAE ont beaucoup diminué, ce qui ets positif. On peut dire qu'avec cette modification, 
#notre modèle fais plus d'erreurs, mais fait des erreurs moins grandes.

#C'est donc un résultat assez encourageant. Peut-être qu'en trouvant une autre façon de traiter ces outliers, on pourrait
#améliorer toutes nos métriques

## Iteration 5

In [114]:
data_iter5=pd.read_csv('data_iter5.csv')

In [115]:
X = data_iter5.drop(['median_house_value'],axis=1)
y=data_iter5['median_house_value']

In [117]:
reg = LinearRegression().fit(X, y)

scores = cross_validate(reg, X, y, cv=5, scoring=('r2', 'neg_root_mean_squared_error','neg_mean_absolute_error'))

average_r2 = scores['test_r2'].mean()
average_rmse = -scores['test_neg_root_mean_squared_error'].mean()
average_mae = -scores['test_neg_mean_absolute_error'].mean()
print("Average R2: ", average_r2, "// Average RMSE: ", average_rmse, "// Average MAE:", average_mae)

Average R2:  0.6462588361990826 // Average RMSE:  68514.25032590503 // Average MAE: 49721.11821678291


In [138]:
#Meilleur résultat en traitant les outliers en bins. à garder

## Iteration 6

In [139]:
data_iter6_log=pd.read_csv('data_iter6_log.csv')

In [140]:
X = data_iter6_log.drop(['median_house_value'],axis=1)
y=data_iter6_log['median_house_value']

In [141]:
reg = LinearRegression().fit(X, y)

scores = cross_validate(reg, X, y, cv=5)
predictions = reg.predict(X)
predictions_original_scale = np.exp(predictions)
y_original_scale = np.exp(y)
average_r2=scores['test_score'].mean()
rmse = np.sqrt(mean_squared_error(y_original_scale, predictions_original_scale))
mae = mean_absolute_error(y_original_scale, predictions_original_scale)
print(" R2: ", average_r2, "//  RMSE: ", rmse, "//  MAE:", mae)

 R2:  0.6940770927481404 //  RMSE:  65520.571793641655 //  MAE: 45838.75451350636


In [107]:
#C'est une très bonne amélioration du modèle. On va donc la favoriser au box-cox

In [128]:
data_iter6_log=pd.read_csv('data_iter6_log.csv')

In [129]:
X = data_iter6_log.drop(['median_house_value'],axis=1)
y=data_iter6_log['median_house_value']

In [130]:
reg = LinearRegression().fit(X, y)

scores = cross_validate(reg, X, y, cv=5)
predictions = reg.predict(X)
predictions_original_scale = np.exp(predictions)
y_original_scale = np.exp(y)
average_r2=scores['test_score'].mean()
rmse = np.sqrt(mean_squared_error(y_original_scale, predictions_original_scale))
mae = mean_absolute_error(y_original_scale, predictions_original_scale)
print(" R2: ", average_r2, "//  RMSE: ", rmse, "//  MAE:", mae)

 R2:  0.6901168651421175 //  RMSE:  66526.48181120909 //  MAE: 46223.85786860529


## Iteration 7

In [142]:
data_iter7_log=pd.read_csv('data_iter7_log.csv')

In [143]:
X = data_iter7_log.drop(['median_house_value'],axis=1)
y=data_iter7_log['median_house_value']

In [148]:
reg = LinearRegression().fit(X, y)

scores = cross_validate(reg, X, y, cv=5)
predictions = reg.predict(X)
predictions_original_scale = np.exp(predictions)
y_original_scale = np.exp(y)
average_r2=scores['test_score'].mean()
rmse = np.sqrt(mean_squared_error(y_original_scale, predictions_original_scale))
mae = mean_absolute_error(y_original_scale, predictions_original_scale)
print(" R2: ", average_r2, "//  RMSE: ", rmse, "//  MAE:", mae)

 R2:  0.7068596784243815 //  RMSE:  63740.94139671754 //  MAE: 44609.07982362292


In [None]:
#On obtient la meilleure augmentation de R2 jusqu'ici, avec la transformation du log.

## Essai d'un autre modèle : random forest

In [149]:
from sklearn.ensemble import RandomForestRegressor

forest_regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
forest_regressor.fit(X, y)

In [150]:
scores = cross_validate(forest_regressor, X, y, cv=7)
predictions = forest_regressor.predict(X)
predictions_original_scale = np.exp(predictions)
y_original_scale = np.exp(y)
average_r2=scores['test_score'].mean()
rmse = np.sqrt(mean_squared_error(y_original_scale, predictions_original_scale))
mae = mean_absolute_error(y_original_scale, predictions_original_scale)
print(" R2: ", average_r2, "//  RMSE: ", rmse, "//  MAE:", mae)

 R2:  0.8421088433733603 //  RMSE:  18577.02011195424 //  MAE: 11122.446061741979
