In [96]:
# Import all the tools we need

# Regular EDA and plotting lib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Model from sklearn 
from sklearn.ensemble import RandomForestRegressor

# Model Evalutions
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

## Housing dataset

1. Get the data ready

In [69]:
housing = pd.read_csv("housing.csv")
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [70]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [71]:
# Check is any null
housing.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [72]:
housing.dropna(inplace=True)

In [73]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [74]:
corr_matrix=housing.corr()

In [75]:
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688355
total_rooms           0.133294
housing_median_age    0.106432
households            0.064894
total_bedrooms        0.049686
population           -0.025300
longitude            -0.045398
latitude             -0.144638
Name: median_house_value, dtype: float64

In [76]:
housing.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [77]:
housing["rooms_per_household"]=housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"]=housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [78]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


In [79]:
housing.isna().sum()

longitude                   0
latitude                    0
housing_median_age          0
total_rooms                 0
total_bedrooms              0
population                  0
households                  0
median_income               0
median_house_value          0
ocean_proximity             0
rooms_per_household         0
bedrooms_per_room           0
population_per_household    0
dtype: int64

In [80]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20433 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   longitude                 20433 non-null  float64
 1   latitude                  20433 non-null  float64
 2   housing_median_age        20433 non-null  float64
 3   total_rooms               20433 non-null  float64
 4   total_bedrooms            20433 non-null  float64
 5   population                20433 non-null  float64
 6   households                20433 non-null  float64
 7   median_income             20433 non-null  float64
 8   median_house_value        20433 non-null  float64
 9   ocean_proximity           20433 non-null  object 
 10  rooms_per_household       20433 non-null  float64
 11  bedrooms_per_room         20433 non-null  float64
 12  population_per_household  20433 non-null  float64
dtypes: float64(12), object(1)
memory usage: 2.2+ MB


In [86]:
# Split into X & Y
x = housing.drop("median_house_value",axis=1)
y = housing["median_house_value"]

In [87]:
categorical_feature = ["ocean_proximity"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_feature)],
                                 remainder="passthrough")

transformed_x = transformer.fit_transform(x)

In [89]:
pd.DataFrame(transformed_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,0.0,1.0,0.0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,6.984127,0.146591,2.555556
1,0.0,0.0,0.0,1.0,0.0,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,6.238137,0.155797,2.109842
2,0.0,0.0,0.0,1.0,0.0,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,8.288136,0.129516,2.802260
3,0.0,0.0,0.0,1.0,0.0,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,5.817352,0.184458,2.547945
4,0.0,0.0,0.0,1.0,0.0,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20428,0.0,1.0,0.0,0.0,0.0,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,5.045455,0.224625,2.560606
20429,0.0,1.0,0.0,0.0,0.0,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,6.114035,0.215208,3.122807
20430,0.0,1.0,0.0,0.0,0.0,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,5.205543,0.215173,2.325635
20431,0.0,1.0,0.0,0.0,0.0,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,5.329513,0.219892,2.123209


In [98]:
from sklearn.model_selection import train_test_split


#split the data into test and train
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(transformed_x,
                                                    y,
                                                    test_size=0.2)

In [99]:
model = RandomForestRegressor()

# Make sure to use transformed (filled and one-hot encoded X data)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.8114076093247061

In [100]:
y_preds = model.predict(x_test)
mea = mean_absolute_error(y_test,y_preds)
mea

32829.54830437974

In [101]:
df = pd.DataFrame(data={"actual values": y_test,
                        "predicted values": y_preds})
df["differences"] = df["predicted values"] - df["actual values"]
df

Unnamed: 0,actual values,predicted values,differences
14416,245800.0,240034.05,-5765.95
16383,137900.0,149494.00,11594.00
7731,218200.0,196699.00,-21501.00
1410,220800.0,126707.00,-94093.00
1335,170500.0,168727.00,-1773.00
...,...,...,...
8291,500001.0,403869.15,-96131.85
6274,157900.0,163806.00,5906.00
2997,100200.0,86201.00,-13999.00
13440,127700.0,108463.00,-19237.00


In [102]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print("Regression model metrics on the test set")
print(f"R^2: {r2_score(y_test, y_preds)}")
print(f"MAE: {mean_absolute_error(y_test, y_preds)}")
print(f"MSE: {mean_squared_error(y_test, y_preds)}")

Regression model metrics on the test set
R^2: 0.8114076093247061
MAE: 32829.54830437974
MSE: 2579035037.3577685


In [112]:
gs_grid={'n_estimators':[10,100,1000],
         'max_depth':[None,5],
         'bootstrap':[False],
         'max_features':[2,3,4],
         'min_samples_split':[2,4] }

In [113]:
clf = RandomForestRegressor()

gs_clf = GridSearchCV(estimator=clf,
                      param_grid=gs_grid,
                      cv=5,
                      verbose=2)
gs_clf.fit(x_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=10, total=   0.4s
[CV] bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=10, total=   0.4s
[CV] bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=10 
[CV]  bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=10, total=   0.4s
[CV] bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=10 
[CV]  bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=10, total=   0.4s
[CV] bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=10 
[CV]  bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=10, total=   0.4s
[CV] bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=100 
[CV]  bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=100, total=   3.5s
[CV] bootstrap=False, max_depth=None, max_features=2, min_samples_split=2, n_estimators=100 
[C

[CV]  bootstrap=False, max_depth=None, max_features=3, min_samples_split=2, n_estimators=1000, total=  44.3s
[CV] bootstrap=False, max_depth=None, max_features=3, min_samples_split=2, n_estimators=1000 
[CV]  bootstrap=False, max_depth=None, max_features=3, min_samples_split=2, n_estimators=1000, total=  44.5s
[CV] bootstrap=False, max_depth=None, max_features=3, min_samples_split=2, n_estimators=1000 
[CV]  bootstrap=False, max_depth=None, max_features=3, min_samples_split=2, n_estimators=1000, total=  44.4s
[CV] bootstrap=False, max_depth=None, max_features=3, min_samples_split=4, n_estimators=10 
[CV]  bootstrap=False, max_depth=None, max_features=3, min_samples_split=4, n_estimators=10, total=   0.4s
[CV] bootstrap=False, max_depth=None, max_features=3, min_samples_split=4, n_estimators=10 
[CV]  bootstrap=False, max_depth=None, max_features=3, min_samples_split=4, n_estimators=10, total=   0.4s
[CV] bootstrap=False, max_depth=None, max_features=3, min_samples_split=4, n_estimators

[CV]  bootstrap=False, max_depth=None, max_features=4, min_samples_split=4, n_estimators=100, total=   4.9s
[CV] bootstrap=False, max_depth=None, max_features=4, min_samples_split=4, n_estimators=100 
[CV]  bootstrap=False, max_depth=None, max_features=4, min_samples_split=4, n_estimators=100, total=   4.9s
[CV] bootstrap=False, max_depth=None, max_features=4, min_samples_split=4, n_estimators=1000 
[CV]  bootstrap=False, max_depth=None, max_features=4, min_samples_split=4, n_estimators=1000, total=  48.6s
[CV] bootstrap=False, max_depth=None, max_features=4, min_samples_split=4, n_estimators=1000 
[CV]  bootstrap=False, max_depth=None, max_features=4, min_samples_split=4, n_estimators=1000, total=  48.6s
[CV] bootstrap=False, max_depth=None, max_features=4, min_samples_split=4, n_estimators=1000 
[CV]  bootstrap=False, max_depth=None, max_features=4, min_samples_split=4, n_estimators=1000, total=  49.4s
[CV] bootstrap=False, max_depth=None, max_features=4, min_samples_split=4, n_estim

[CV]  bootstrap=False, max_depth=5, max_features=3, min_samples_split=2, n_estimators=100, total=   1.6s
[CV] bootstrap=False, max_depth=5, max_features=3, min_samples_split=2, n_estimators=100 
[CV]  bootstrap=False, max_depth=5, max_features=3, min_samples_split=2, n_estimators=100, total=   1.6s
[CV] bootstrap=False, max_depth=5, max_features=3, min_samples_split=2, n_estimators=100 
[CV]  bootstrap=False, max_depth=5, max_features=3, min_samples_split=2, n_estimators=100, total=   1.6s
[CV] bootstrap=False, max_depth=5, max_features=3, min_samples_split=2, n_estimators=100 
[CV]  bootstrap=False, max_depth=5, max_features=3, min_samples_split=2, n_estimators=100, total=   1.3s
[CV] bootstrap=False, max_depth=5, max_features=3, min_samples_split=2, n_estimators=100 
[CV]  bootstrap=False, max_depth=5, max_features=3, min_samples_split=2, n_estimators=100, total=   1.3s
[CV] bootstrap=False, max_depth=5, max_features=3, min_samples_split=2, n_estimators=1000 
[CV]  bootstrap=False, m

[CV]  bootstrap=False, max_depth=5, max_features=4, min_samples_split=4, n_estimators=10, total=   0.2s
[CV] bootstrap=False, max_depth=5, max_features=4, min_samples_split=4, n_estimators=10 
[CV]  bootstrap=False, max_depth=5, max_features=4, min_samples_split=4, n_estimators=10, total=   0.2s
[CV] bootstrap=False, max_depth=5, max_features=4, min_samples_split=4, n_estimators=100 
[CV]  bootstrap=False, max_depth=5, max_features=4, min_samples_split=4, n_estimators=100, total=   1.6s
[CV] bootstrap=False, max_depth=5, max_features=4, min_samples_split=4, n_estimators=100 
[CV]  bootstrap=False, max_depth=5, max_features=4, min_samples_split=4, n_estimators=100, total=   1.6s
[CV] bootstrap=False, max_depth=5, max_features=4, min_samples_split=4, n_estimators=100 
[CV]  bootstrap=False, max_depth=5, max_features=4, min_samples_split=4, n_estimators=100, total=   1.6s
[CV] bootstrap=False, max_depth=5, max_features=4, min_samples_split=4, n_estimators=100 
[CV]  bootstrap=False, max_d

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 31.2min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'bootstrap': [False], 'max_depth': [None, 5],
                         'max_features': [2, 3, 4], 'min_samples_split': [2, 4],
                         'n_estimators': [10, 100, 1000]},
             verbose=2)

In [114]:
gs_clf.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 4,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [115]:
gs_clf.score(x_test, y_test)

0.8303242009301568

In [117]:
import pickle

# Save an extisting model to file
pickle.dump(gs_clf, open("housing_random_forest_model.pkl", "wb"))

In [118]:
# Load a saved model
loaded_pickle_model = pickle.load(open("housing_random_forest_model.pkl", "rb"))