In [94]:
import pandas as pd 
import numpy as np
import dill

import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'vscode+colab' #permette il rendering dei plot sia su vscode che su colab




In [3]:
X_train= pd.read_csv('dataset/X_train_Final.csv')
y_train= pd.read_csv('dataset/y_train.csv')

In [4]:
X_test= pd.read_csv('dataset/X_test_Final.csv')
y_test= pd.read_csv('dataset/y_test.csv')

In [5]:
y_test_id = X_test["id"]

In [6]:
X_train=X_train.drop("Sale_Price", axis=1)
X_test=X_test.drop("Sale_Price", axis=1)

In [7]:
X_train=X_train.drop("id", axis=1)
X_test=X_test.drop("id", axis=1)

In [8]:
y_train=y_train.squeeze() #converte in series
y_test=y_test.squeeze() #converte in series

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from datetime import datetime



In [8]:
results = pd.DataFrame()

In [10]:
results = pd.read_csv('results/prediction.csv')

In [16]:
def model_evaluation(model_name, model ,X_train, y_train, X_test, y_test, log_scale):
    
    if log_scale is True :
        model.fit(X_train, np.log(y_train))
        y_predicted = np.exp(model.predict(X_test))
        
    else:
        model.fit(X_train, np.log(y_train))
        y_predicted = np.exp(model.predict(X_test))
    
    mse = mean_squared_error(y_test, y_predicted)
    rmse = mse**.5
    r2 = r2_score(y_test, y_predicted)
    mae = mean_absolute_error(y_test, y_predicted)
    
    print(f'Model Name : {model_name}')
    print(f'RMSE : {rmse}')
    print(f'R2 : {r2}')
    print(f'MAE : {mae}')
    
    results[model_name] = y_predicted   

In [97]:
def plot_importance(model):

    feature_names = X_train.columns

    feature_importance_df = pd.DataFrame(model.feature_importances_, feature_names)
    feature_importance_df = feature_importance_df.reset_index()
    feature_importance_df.columns = ['Feature', 'Importance']
    feature_importance_df_top_10 = feature_importance_df.sort_values(by='Importance', ascending=False).head(10)
    fig=px.bar(feature_importance_df,feature_importance_df_top_10.Feature, feature_importance_df_top_10.Importance)
    fig.show()

# RANDOM FOREST REGRESSOR

In [53]:
rf = RandomForestRegressor()
model_evaluation("Simple RandomForest", rf , X_train, y_train, X_test, y_test, False )

Model Name : Simple RandomForest
RMSE : 23159.098659623243
R2 : 0.9244134776085555
MAE : 15413.549030218432


In [101]:
plot_importance(rf)

## FACCIAMO TUNING DEI PARAMETRI DELLA RANDOM FOREST
### 1 TUNING

In [54]:
## Define Grid 
param_grid = {'n_estimators': [2000,4000,5000], #numero di alberi nella foresta
              'min_samples_split': [2], #numero minimo di campioni in un nodo interno per permettere lo split
              'bootstrap' : [True], #campioni con reinserimento
              'max_depth' : [5,6,7,None], #estensione del'albero, None --> estensione fino a foglie pure
              'max_features': ['sqrt', 'log2', 0.3], #numero di feature su cui sono allenati gli alberi
              } 

grid_search = model_selection.GridSearchCV(rf, #modello utilizzato
                                           param_grid, #griglia dei parametri
                                           scoring=["r2","neg_root_mean_squared_error"], #MSE MEAN SQUARE ERROR
                                           refit ="neg_root_mean_squared_error"
                                           verbose=3,
                                           n_jobs=-1, #multi processor
                                           #cv = 5 <-- default cross validation
                                           return_train_score=True)

best_forest = grid_search.fit(X_train, y_train)#Allena tanti modelli quante sono le combinazioni tra i parametri definiti in param_grid
print('Optimum parameters', best_forest.best_params_)
## show end time
print(datetime.now())

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Optimum parameters {'bootstrap': True, 'max_depth': None, 'max_features': 0.3, 'min_samples_split': 2, 'n_estimators': 5000}
2023-01-02 13:07:41.384171


In [55]:
rf_tuned_1 = best_forest.best_estimator_
model_evaluation("First Tuned RandomForest", rf_tuned_1, X_train, y_train, X_test, y_test, False)

Model Name : First Tuned RandomForest
RMSE : 23305.545642306537
R2 : 0.9234545096337934
MAE : 15246.369139768196


In [100]:
plot_importance(rf_tuned_1)

## Tuned Random Forest - Visualizziamo i valori predetti vs i valori reali osservati

In [56]:
fig =px.scatter(results, x="Observed", y="First Tuned RandomForest", hover_data=['id'],color_discrete_sequence=['red'],trendline="ols",trendline_color_override='blue',height=1000, width=1000)
fig.show()

### 2° TUNING

In [57]:
rf_tuned_2=RandomForestRegressor()

In [62]:
## Define Grid 
param_grid = {'n_estimators': [2000,4000,5000], #numero di alberi nella foresta
              'min_samples_split': [2,4,6], #numero minimo di campioni in un nodo interno per permettere lo split
              'bootstrap' : [True], #campioni con reinserimento
              'max_depth' : [None], #estensione del'albero, None --> estensione fino a foglie pure
              'max_features': [0.3], #numero di feature su cui sono allenati gli alberi
              } 

grid_search = model_selection.GridSearchCV(rf, #modello utilizzato
                                           param_grid, #griglia dei parametri
                                           scoring=["neg_root_mean_squared_error"], #MSE MEAN SQUARE ERROR
                                           refit ="neg_root_mean_squared_error",
                                           verbose=3,
                                           n_jobs=-1, #multi processor
                                           #cv = 5 <-- default cross validation
                                           return_train_score=True)

best_forest2 = grid_search.fit(X_train, y_train)#Allena tanti modelli quante sono le combinazioni tra i parametri definiti in param_grid
print('Optimum parameters', best_forest2.best_params_)
## show end time
print(datetime.now())

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Optimum parameters {'bootstrap': True, 'max_depth': None, 'max_features': 0.3, 'min_samples_split': 2, 'n_estimators': 5000}
2023-01-02 13:57:10.907581


In [64]:
rf_tuned_2 = best_forest2.best_estimator_
model_evaluation("Second Tuned RandomForest", rf_tuned_2, X_train, y_train, X_test, y_test, False)

Model Name : Second Tuned RandomForest
RMSE : 23332.64318262654
R2 : 0.9232764060712798
MAE : 15258.951195014877


In [99]:
plot_importance(rf_tuned_2)

## 2nd Tuned Random Forest - Visualizziamo i valori predetti vs i valori reali osservati

In [65]:
fig =px.scatter(results, x="Observed", y="Second Tuned RandomForest", hover_data=['id'], color_discrete_sequence=['red'],trendline="ols",trendline_color_override='blue',height=1000, width=1000)
fig.show()

# TRASFORMAZIONE LOG E PROVA DELLA RANDOM FOREST SU DATASET SENZA OUTLIERS

A questo punto come avevamo constatato in fase di visualizzazione della distribuzione di SalePrice proviamo a transformala in scala logaritmica e vediamo se la nostra miglior random forest avrà un RMSE migliore.

In [66]:
y_train_log = np.log(y_train) #trasformazione logaritmica della variabile risposta di train
y_test_log = np.log(y_test) #trasformazione logaritmica della variabile risposta di test

In [67]:
rf_log = best_forest2.best_estimator_
model_evaluation("Log RandomForest", rf_log, X_train, y_train, X_test, y_test, True)

Model Name : Log RandomForest
RMSE : 23362.31828935204
R2 : 0.9230811235516725
MAE : 15271.621443140339


## Dataset con outliers

In [68]:
X_train_out = pd.read_csv('dataset/train_data_FE_encoded_With_outliers.csv') #lettura dataset con outliers rimossi con IsolationForest
y_train_out = pd.read_csv('dataset/y_train_With_outliers.csv').squeeze()
X_train_out=X_train_out.drop(["Sale_Price", "id"], axis=1)

In [69]:
rf_out = RandomForestRegressor(bootstrap=True, max_depth=None, min_samples_split=2, n_estimators=5000, max_features=0.3)
model_evaluation("Outliers RandomForest", rf_out, X_train_out, y_train_out, X_test, y_test, False)


Model Name : Outliers RandomForest
RMSE : 23171.01342783914
R2 : 0.9243356829089742
MAE : 15151.767164334224


## Outliers Random Forest - Visualizziamo i valori predetti vs i valori reali osservati

In [70]:
fig =px.scatter(results, x="Observed", y="Outliers RandomForest", hover_data=["id"],color_discrete_sequence=['red'],trendline="ols",trendline_color_override='blue',height=1000, width=1000)
fig.show()

In [12]:
results.to_csv('results/prediction.csv', index=False)

In [11]:
#results = results[['id','Observed','Simple RandomForest','First Tuned RandomForest','Second Tuned RandomForest', 'Log RandomForest', 'Outliers RandomForest']]

Il modelli non sembrano essere migliorati molto rispetto alle altre RandomForest.  
Questo ci fa capire che essendo le random forest basate su alberi decisionali indipendenti, addestrati su campioni casuali sia delle osservazioni che delle variabili, non necessitano di una trasformazione della variabile risposta. Grazie al Bagging quindi la random forest risulta essere robusta ai valori outliers e stabile a prescindere dalle trasformazioni delle variabili

# XGBOOST REGRESSOR

In [77]:
import xgboost as xgbx
from xgboost.sklearn import XGBRegressor
from xgboost import plot_importance

# SIMPLE XGBOOST REGRESSOR

In [103]:
xgb_model = XGBRegressor(learning_rate = 0.1)
model_evaluation("Simple XGBRegressor", xgb_model ,X_train, y_train, X_test, y_test, False)


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.



Model Name : Simple XGBRegressor
RMSE : 22388.985014475802
R2 : 0.9293568802711545
MAE : 14228.529496717258


In [104]:
plot_importance(xgb_model)

In [105]:
fig =px.scatter(results, x="Observed", y="Simple XGBRegressor", hover_data=["id"],color_discrete_sequence=['green'],trendline="ols",trendline_color_override='red',height=1000, width=1000)
fig.show()

# TUNING DEI PARAMETRI DI XGBOOST

In [73]:
xgb_model_tuned = XGBRegressor()

In [74]:

param_grid = { 
              'n_estimators':[5000],#numero di alberi
              'max_depth' : [None, 3], #estensione del'albero, None --> estensione fino a foglie pure
              'colsample_bytree': [0.3], #numero di feature su cui sono allenati gli alberi
              'learning_rate' : [0.01]
              } 

grid_search = model_selection.GridSearchCV(xgb_model_tuned, #modello utilizzato
                                           param_grid, #griglia dei parametri
                                           scoring=["r2","neg_root_mean_squared_error"], #MSE MEAN SQUARE ERROR
                                           refit="neg_root_mean_squared_error", #In the multi-metric setting, you need to set this so that the final model can be fitted to that,
                                                                                #because the best hyper-parameters for the model will be decided based on a single metric only.
                                           verbose=1,
                                           n_jobs=-1, #multi processor
                                           #cv = 5 <-- default cross validation
                                           return_train_score=True)

best_xgboost_model = grid_search.fit(X_train, y_train)#Allena tanti modelli quante sono le combinazioni tra i parametri definiti in param_grid
print('Optimum parameters', best_xgboost_model.best_params_)
print('Best score:', (best_xgboost_model.best_score_))
## show end time
print(datetime.now())

Fitting 5 folds for each of 2 candidates, totalling 10 fits



pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.



Optimum parameters {'colsample_bytree': 0.3, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5000}
Best score: -20197.110230697854
2023-01-02 14:11:44.345156


In [75]:
xgb_model_tuned = best_xgboost_model.best_estimator_
model_evaluation("Tuned XGBRegressor", xgb_model_tuned, X_train, y_train, X_test, y_test, False)


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.



Model Name : Tuned XGBRegressor
RMSE : 20375.812575003853
R2 : 0.9414898874889327
MAE : 13504.83685943895


In [98]:
plot_importance(xgb_model_tuned)

In [83]:
fig =px.scatter(results, x="Observed", y="Tuned XGBRegressor", hover_data=["id"],color_discrete_sequence=['green'],trendline="ols",trendline_color_override='blue',height=1000, width=1000)
fig.show()

Notiamo che xgb_model_tuned riesce ad avere predizioni migliori sulle case con prezzo più alto rispetto alle random forest allenate in precedenza

# KNN - K NEAREST NEIGHBORS

Per curiosità vediamo come si comporta l'algoritmo di KNN sul nostro dataset.

In [33]:
from sklearn.neighbors import KNeighborsRegressor

In [34]:
knn_model_tuned = KNeighborsRegressor()

In [107]:
param_grid = { 
              'n_neighbors': [1,3,5, 7, 9, 12,15,17],
              'weights' : ['uniform','distance'],
              'metric': ['euclidean', 'manhattan', 'cosine']
              } 

grid_search = model_selection.GridSearchCV(knn_model_tuned, #modello utilizzato
                                           param_grid, #griglia dei parametri
                                           scoring=["neg_root_mean_squared_error"], #MSE MEAN SQUARE ERROR
                                           refit="neg_root_mean_squared_error", #In the multi-metric setting, you need to set this so that the final model can be fitted to that,
                                                                                #because the best hyper-parameters for the model will be decided based on a single metric only.
                                           verbose=1,
                                           n_jobs=-1, #multi processor
                                           #cv = 5 <-- default cross validation
                                           return_train_score=True)

best_knn_model = grid_search.fit(X_train, y_train)#Allena tanti modelli quante sono le combinazioni tra i parametri definiti in param_grid
print('Optimum parameters', best_knn_model.best_params_)
print('Best score:', (best_knn_model.best_score_))
## show end time
print(datetime.now())

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Optimum parameters {'metric': 'manhattan', 'n_neighbors': 12, 'weights': 'distance'}
Best score: -40089.96631195866
2023-01-02 14:55:37.103342


In [112]:
knn_model_tuned = best_knn_model.best_estimator_
model_evaluation("KNN Regressor", knn_model_tuned, X_train, y_train, X_test, y_test, False)

Model Name : KNN Regressor
RMSE : 42378.86315055835
R2 : 0.7468956184877495
MAE : 27200.587571726584


In [113]:
fig =px.scatter(results, x="Observed", y="KNN Regressor", hover_data=["id"],color_discrete_sequence=['green'],trendline="ols",trendline_color_override='blue',height=1000, width=1000)
fig.show()

In [11]:
import dill

def save_session() :
    dill.dump_session('data_prediction.db')
    
def load_session():
    dill.load_session('data_prediction.db')

In [84]:
#save_session()

In [12]:
#load_session()