DROPNA Method

In [397]:
import pandas as pd

In [398]:
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
import numpy as np


In [399]:
df = pd.read_csv("house_prices_selection.csv")

In [400]:
df.shape

(1460, 32)

Spliting - numerical / categorical data

In [401]:
df_dropna = df.copy()

In [402]:
numerical_columns = df_dropna.select_dtypes(include=["number"]).columns.tolist()

In [403]:
categorical_columns = df_dropna.select_dtypes(include=["object", "category"]).columns.tolist()

In [404]:
print(numerical_columns)

['EVI', 'ZoningScore', 'Price_per_m^2', 'PDI', 'SalePrice', 'LotFrontage', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'FullBath', 'GarageCars', 'GarageArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF']


In [405]:
print(categorical_columns)

['KitchenQual', 'PavedDrive', 'BsmtQual', 'CentralAir', 'Foundation']


In [406]:
df_dropna.isnull().any()


EVI                  False
ZoningScore          False
MSZoning_RL          False
LotConfig_CulDSac    False
LandContour_HLS      False
LotShape_IR2         False
Condition1_Norm      False
Condition1_Feedr     False
LotShape_Reg         False
MSZoning_RM          False
Price_per_m^2        False
PDI                  False
SalePrice            False
LotFrontage           True
LotArea              False
WoodDeckSF           False
OpenPorchSF          False
FullBath             False
GarageCars           False
GarageArea           False
KitchenQual          False
PavedDrive           False
OverallQual          False
YearBuilt            False
YearRemodAdd         False
MasVnrArea            True
GrLivArea            False
TotalBsmtSF          False
1stFlrSF             False
BsmtQual              True
CentralAir           False
Foundation           False
dtype: bool

In [407]:
df_dropna.isnull().sum()


EVI                    0
ZoningScore            0
MSZoning_RL            0
LotConfig_CulDSac      0
LandContour_HLS        0
LotShape_IR2           0
Condition1_Norm        0
Condition1_Feedr       0
LotShape_Reg           0
MSZoning_RM            0
Price_per_m^2          0
PDI                    0
SalePrice              0
LotFrontage          259
LotArea                0
WoodDeckSF             0
OpenPorchSF            0
FullBath               0
GarageCars             0
GarageArea             0
KitchenQual            0
PavedDrive             0
OverallQual            0
YearBuilt              0
YearRemodAdd           0
MasVnrArea             8
GrLivArea              0
TotalBsmtSF            0
1stFlrSF               0
BsmtQual              37
CentralAir             0
Foundation             0
dtype: int64

In [408]:

df_dropna = df_dropna.dropna(axis=0)

In [409]:
df_dropna.isnull().sum()


EVI                  0
ZoningScore          0
MSZoning_RL          0
LotConfig_CulDSac    0
LandContour_HLS      0
LotShape_IR2         0
Condition1_Norm      0
Condition1_Feedr     0
LotShape_Reg         0
MSZoning_RM          0
Price_per_m^2        0
PDI                  0
SalePrice            0
LotFrontage          0
LotArea              0
WoodDeckSF           0
OpenPorchSF          0
FullBath             0
GarageCars           0
GarageArea           0
KitchenQual          0
PavedDrive           0
OverallQual          0
YearBuilt            0
YearRemodAdd         0
MasVnrArea           0
GrLivArea            0
TotalBsmtSF          0
1stFlrSF             0
BsmtQual             0
CentralAir           0
Foundation           0
dtype: int64

In [410]:
df_dropna.shape

(1164, 32)

In [411]:
X_dropna = df_dropna.drop('SalePrice', axis=1)
y_dropna = df_dropna['SalePrice']


In [412]:
X_train, X_test, y_train, y_test = train_test_split(
    X_dropna, y_dropna, test_size=0.2, random_state=42
)

In [413]:
numerical_features = X_train.select_dtypes(include=["number"]).columns.tolist()
categorical_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

In [414]:
categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])






In [415]:
preprocessor = ColumnTransformer([
    ('num', 'passthrough', numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

In [416]:
model = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', LinearRegression())
])

In [417]:

model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [418]:
rmse_dropna = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE:", rmse_dropna)

RMSE: 35777.823749415664


SimpleImputer Method

In [419]:
df_simple_imputer = df.copy()

In [420]:
df.shape

(1460, 32)

In [421]:
df_simple_imputer.shape

(1460, 32)

In [422]:
df.isnull().sum()


EVI                    0
ZoningScore            0
MSZoning_RL            0
LotConfig_CulDSac      0
LandContour_HLS        0
LotShape_IR2           0
Condition1_Norm        0
Condition1_Feedr       0
LotShape_Reg           0
MSZoning_RM            0
Price_per_m^2          0
PDI                    0
SalePrice              0
LotFrontage          259
LotArea                0
WoodDeckSF             0
OpenPorchSF            0
FullBath               0
GarageCars             0
GarageArea             0
KitchenQual            0
PavedDrive             0
OverallQual            0
YearBuilt              0
YearRemodAdd           0
MasVnrArea             8
GrLivArea              0
TotalBsmtSF            0
1stFlrSF               0
BsmtQual              37
CentralAir             0
Foundation             0
dtype: int64

In [423]:
df_simple_imputer.isnull().sum()


EVI                    0
ZoningScore            0
MSZoning_RL            0
LotConfig_CulDSac      0
LandContour_HLS        0
LotShape_IR2           0
Condition1_Norm        0
Condition1_Feedr       0
LotShape_Reg           0
MSZoning_RM            0
Price_per_m^2          0
PDI                    0
SalePrice              0
LotFrontage          259
LotArea                0
WoodDeckSF             0
OpenPorchSF            0
FullBath               0
GarageCars             0
GarageArea             0
KitchenQual            0
PavedDrive             0
OverallQual            0
YearBuilt              0
YearRemodAdd           0
MasVnrArea             8
GrLivArea              0
TotalBsmtSF            0
1stFlrSF               0
BsmtQual              37
CentralAir             0
Foundation             0
dtype: int64

In [424]:

X = df_simple_imputer.drop('SalePrice', axis=1)  
y = df_simple_imputer['SalePrice']               




In [425]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [426]:
numerical_columns = X_train.select_dtypes(include=["number"]).columns.tolist()
categorical_columns = X_train.select_dtypes(include=["object", "category"]).columns.tolist()


In [427]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [428]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)


In [429]:

model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('regressor', LinearRegression())
])


In [430]:
model.fit(X_train, y_train)
predictions = model.predict(X_test)


In [431]:
rmse_simple_imputer = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE:", rmse_simple_imputer)

RMSE: 33925.979571605385


Conclusions

In [432]:
print(rmse_dropna)

35777.823749415664


In [433]:
print(rmse_simple_imputer)

33925.979571605385


In conclusion, the method using SimpleImputer performed better, because the RMSE is lower, meaning it is closer to the real values. It is also a better method than using the dropna one, because it doesn't remove missing data, but uses values to fill them (no data from the original dataset is lost, which helps the RMSE to stay low).

LAB - November 18th. Adding DecisionTreeRegressor and RandomForestRegressor + Cross Validation (GridSearch). Comparing all 3 models (Linear Regresion, DecisionTreeRegressor and RandomForestRegressor).

In [434]:
df = pd.read_csv("house_prices_selection.csv")

In [435]:
df_backup = df.copy()


In [436]:
df_backup.shape

(1460, 32)

In [437]:
X = df_backup.drop('SalePrice', axis=1)  
y = df_backup['SalePrice']


In [438]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [439]:
numerical_columns = X_train.select_dtypes(include=["number"]).columns.tolist()
categorical_columns = X_train.select_dtypes(include=["object", "category"]).columns.tolist()


In [440]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [441]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)


In [442]:
from sklearn.tree import DecisionTreeRegressor


In [443]:
decision_tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),          
    ('regressor', DecisionTreeRegressor(random_state=42))
])

In [444]:
decision_tree_pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [445]:
decision_tree_predictions = decision_tree_pipeline.predict(X_test)


In [446]:
y_pred = decision_tree_pipeline.predict(X_test)


In [447]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [448]:
rmse_decision_tree = np.sqrt(mean_squared_error(y_test, y_pred))
mae_decision_tree = mean_absolute_error(y_test, y_pred)
r2_decision_tree = r2_score(y_test, y_pred)

print("RMSE:", rmse_decision_tree)
print("MAE:", mae_decision_tree)
print("R²:", r2_decision_tree)

RMSE: 33311.39541688194
MAE: 21482.640410958906
R²: 0.855332219954333


Hyperparameter tuning - DecisionTree

In [449]:
"""
param_grid_dt = {
    # Limitează cât de adânc poate crește arborele pentru a evita memorarea
    'regressor__max_depth': [None, 5, 10, 15, 20], #primele 2 
    
    # Numărul minim de mostre necesare pentru a împărți un nod intern
    'regressor__min_samples_split': [2, 5, 10], #primele 2
    
    # Cel mai important pentru regularizare: numărul minim de mostre într-o frunză
    'regressor__min_samples_leaf': [1, 2, 4, 8], #primele 2
    
    # Câte feature-uri să ia în calcul la fiecare split (reduce varianța)
    'regressor__max_features': ['sqrt', 'log2', None], # pastrez una
    
    # Funcția de măsurare a calității split-ului
    'regressor__criterion': ['squared_error'] # pastrez una 
    
# de sters None

"""

"\nparam_grid_dt = {\n    # Limitează cât de adânc poate crește arborele pentru a evita memorarea\n    'regressor__max_depth': [None, 5, 10, 15, 20], #primele 2 \n\n    # Numărul minim de mostre necesare pentru a împărți un nod intern\n    'regressor__min_samples_split': [2, 5, 10], #primele 2\n\n    # Cel mai important pentru regularizare: numărul minim de mostre într-o frunză\n    'regressor__min_samples_leaf': [1, 2, 4, 8], #primele 2\n\n    # Câte feature-uri să ia în calcul la fiecare split (reduce varianța)\n    'regressor__max_features': ['sqrt', 'log2', None], # pastrez una\n\n    # Funcția de măsurare a calității split-ului\n    'regressor__criterion': ['squared_error'] # pastrez una \n\n# de sters None\n\n"

In [450]:
# adjusting/choosing the parameters
param_grid_dt = {
    # Limitează cât de adânc poate crește arborele pentru a evita memorarea
    'regressor__max_depth': [5, 10],
    
    # Numărul minim de mostre necesare pentru a împărți un nod intern
    'regressor__min_samples_split': [2, 5],
    
    # Cel mai important pentru regularizare: numărul minim de mostre într-o frunză
    'regressor__min_samples_leaf': [1, 2],
    
    # Câte feature-uri să ia în calcul la fiecare split (reduce varianța)
    'regressor__max_features': ['sqrt'],
    
    # Funcția de măsurare a calității split-ului
    'regressor__criterion': ['squared_error'] 
} 





In [451]:
from sklearn.model_selection import GridSearchCV


In [452]:
grid_dt = GridSearchCV(
    estimator=decision_tree_pipeline,
    param_grid=param_grid_dt,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=1
)


In [453]:
grid_dt.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'regressor__criterion': ['squared_error'], 'regressor__max_depth': [5, 10], 'regressor__max_features': ['sqrt'], 'regressor__min_samples_leaf': [1, 2], ...}"
,scoring,'neg_mean_squared_error'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [454]:
print("Optimal Decision Tree parameters:", grid_dt.best_params_)


Optimal Decision Tree parameters: {'regressor__criterion': 'squared_error', 'regressor__max_depth': 10, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 2}


In [455]:
y_pred_dt_best = grid_dt.predict(X_test)


In [456]:
rmse_decision_tree_tuned = np.sqrt(mean_squared_error(y_test, y_pred_dt_best))
mae_decision_tree_tuned = mean_absolute_error(y_test, y_pred_dt_best)
r2_decision_tree_tuned = r2_score(y_test, y_pred_dt_best)

In [457]:
print("Decision Tree - tuned - RMSE:",rmse_decision_tree_tuned)
print("Decision Tree - tuned - MAE:",mae_decision_tree_tuned)
print("Decision Tree - tuned - R2:", r2_decision_tree_tuned)

Decision Tree - tuned - RMSE: 39485.37755365791
Decision Tree - tuned - MAE: 26115.958851228843
Decision Tree - tuned - R2: 0.7967368012205225


RANDOM FOREST

In [458]:
from sklearn.ensemble import RandomForestRegressor


In [459]:
random_forest_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [460]:
random_forest_pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [461]:
y_pred = random_forest_pipeline.predict(X_test)


In [462]:
rmse_random_forest = np.sqrt(mean_squared_error(y_test, y_pred))
mae_random_forest = mean_absolute_error(y_test, y_pred)
r2_random_forest = r2_score(y_test, y_pred)

print("Random Forest - RMSE:", rmse_random_forest)
print("Random Forest - MAE:", mae_random_forest)
print("Random Forest - R2:", r2_random_forest)

Random Forest - RMSE: 27705.792144116458
Random Forest - MAE: 16296.519760273974
Random Forest - R2: 0.8999246058643854


Hyperparameter tuning - RandomForestRegressor

In [463]:
"""
param_grid_rf = {
    # Numărul de arbori (mai mulți sunt de obicei mai buni, dar durează mai mult)
    'regressor__n_estimators': [100, 300, 500],
    
    # Adâncimea maximă a fiecărui arbore
    'regressor__max_depth': [None, 10, 20, 30],
    
    # Previne crearea de frunze cu prea puține date (ex: o singură casă cu preț atipic)
    'regressor__min_samples_leaf': [1, 2, 4],
    
    # Câte date sunt necesare pentru un nou split
    'regressor__min_samples_split': [2, 5, 10],
    
    # Critic pentru Random Forest: 'sqrt' este adesea cel mai bun pentru a decorrela arborii
    'regressor__max_features': ['sqrt', 'log2', None],
   
    # Dacă folosim bootstrapping (eșantionare cu înlocuire)
    'regressor__bootstrap': [True]
}
"""

"\nparam_grid_rf = {\n    # Numărul de arbori (mai mulți sunt de obicei mai buni, dar durează mai mult)\n    'regressor__n_estimators': [100, 300, 500],\n\n    # Adâncimea maximă a fiecărui arbore\n    'regressor__max_depth': [None, 10, 20, 30],\n\n    # Previne crearea de frunze cu prea puține date (ex: o singură casă cu preț atipic)\n    'regressor__min_samples_leaf': [1, 2, 4],\n\n    # Câte date sunt necesare pentru un nou split\n    'regressor__min_samples_split': [2, 5, 10],\n\n    # Critic pentru Random Forest: 'sqrt' este adesea cel mai bun pentru a decorrela arborii\n    'regressor__max_features': ['sqrt', 'log2', None],\n\n    # Dacă folosim bootstrapping (eșantionare cu înlocuire)\n    'regressor__bootstrap': [True]\n}\n"

In [464]:

# adjusting parameters

param_grid_rf = {
    # Numărul de arbori (mai mulți sunt de obicei mai buni, dar durează mai mult)
    'regressor__n_estimators': [100, 300],
    
    # Adâncimea maximă a fiecărui arbore
    'regressor__max_depth': [ 10, 20],
    
    # Previne crearea de frunze cu prea puține date (ex: o singură casă cu preț atipic)
    'regressor__min_samples_leaf': [1, 2],
    
    # Câte date sunt necesare pentru un nou split
    'regressor__min_samples_split': [2, 5],
    
    # Critic pentru Random Forest: 'sqrt' este adesea cel mai bun pentru a decorrela arborii
    'regressor__max_features': ['sqrt'],
   
    # Dacă folosim bootstrapping (eșantionare cu înlocuire)
    'regressor__bootstrap': [True]
}


In [465]:
grid_rf = GridSearchCV(
    estimator=random_forest_pipeline,
    param_grid=param_grid_rf,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=1 )

In [466]:
grid_rf.fit(X_train, y_train)


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'regressor__bootstrap': [True], 'regressor__max_depth': [10, 20], 'regressor__max_features': ['sqrt'], 'regressor__min_samples_leaf': [1, 2], ...}"
,scoring,'neg_mean_squared_error'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [467]:
print("Optimal parameters:", grid_rf.best_params_)


Optimal parameters: {'regressor__bootstrap': True, 'regressor__max_depth': 20, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 300}


In [468]:
y_pred_rf_best = grid_rf.predict(X_test)


In [469]:
rmse_rf_tuned = np.sqrt(mean_squared_error(y_test, y_pred_rf_best))
mae_rf_tuned = mean_absolute_error(y_test, y_pred_rf_best)
r2_rf_tuned = r2_score(y_test, y_pred_rf_best)

In [470]:
print("Random Forest - tuned - RMSE:", rmse_rf_tuned)
print("Random Forest - tuned - MAE:", mae_rf_tuned)
print("Random Forest - tuned - R2:", r2_rf_tuned)

Random Forest - tuned - RMSE: 27399.965182365624
Random Forest - tuned - MAE: 16321.287355386632
Random Forest - tuned - R2: 0.90212175183744


Comparison between the 3 models (Linear Regression, Decision Tree, Random Forest)

In [471]:

print(" Linear Model - RMSE:", rmse_simple_imputer)

 Linear Model - RMSE: 33925.979571605385


In [472]:
#I added MAE abd R2 score for the Linear Regression model, so I can compare it to the other two; I only had the results for the RMSE 'til now.

mae_simple_imputer = mean_absolute_error(y_test, predictions)
print(" Linear Model - MAE:", mae_simple_imputer)



 Linear Model - MAE: 21538.231889169117


In [473]:

r2_simple_imputer = r2_score(y_test, predictions)
print(" Linear Model - R2:", r2_simple_imputer)

 Linear Model - R2: 0.849944831705525


In [478]:
print("Decision Tree - RMSE:", rmse_decision_tree)
print("Decision Tree - MAE:", mae_decision_tree)
print("Decision Tree - R2:", r2_decision_tree)

Decision Tree - RMSE: 33311.39541688194
Decision Tree - MAE: 21482.640410958906
Decision Tree - R2: 0.855332219954333


In [475]:
print("Decision Tree - tuned - RMSE:",rmse_decision_tree_tuned)
print("Decision Tree - tuned - MAE:",mae_decision_tree_tuned)
print("Decision Tree - tuned - R2:", r2_decision_tree_tuned)

Decision Tree - tuned - RMSE: 39485.37755365791
Decision Tree - tuned - MAE: 26115.958851228843
Decision Tree - tuned - R2: 0.7967368012205225


In [476]:
print("Random Forest - RMSE:", rmse_random_forest)
print("Random Forest - MAE:", mae_random_forest)
print("Random Forest - R2:", r2_random_forest)

Random Forest - RMSE: 27705.792144116458
Random Forest - MAE: 16296.519760273974
Random Forest - R2: 0.8999246058643854


In [477]:
print("Random Forest - tuned - RMSE:", rmse_rf_tuned)
print("Random Forest - tuned - MAE:", mae_rf_tuned)
print("Random Forest - tuned - R2:", r2_rf_tuned)

Random Forest - tuned - RMSE: 27399.965182365624
Random Forest - tuned - MAE: 16321.287355386632
Random Forest - tuned - R2: 0.90212175183744


In what accounts the Linear Regression Model (the Simple Imputer version - chosen because it performed better than the Dropna method), I can say it brought one of the worst results.

Decision Tree: the hypertuned results were worse than the initial ones before hypertuning. RMSE and MAE increased and R2 decreased, it could suggest overfitting or that the hyperparameters were not the best choices, this could make sense as I reduced some values so the model could run in a reasonable amount of time and not hours, for example. The tuned version of the Decision Tree fared worse than the Linear Regression.

Random Forest: the initial results and the ones after the hypertuning the Random Forest model were pretty similar, but if we were to choose, the tuned one is the best model of the two and even compared to the other models - Linear Regression, Decision Tree.