In [17]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import joblib

In [18]:
# Using baseestimator for parameters tuning and transformer mixin for fit and transform for modularity
class QuantileClipper(BaseEstimator, TransformerMixin):
    def __init__(self, low = 0.01, high = 0.99):
        
        self.low = low
        self.high = high
        self.q_high = None
        self.q_low = None
        self.columns = None
        
    def fit(self, X, y = None):
        df = pd.DataFrame(X).copy()
        self.columns = df.columns
        self.q_high = df.quantile(self.high)
        self.q_low = df.quantile(self.low)
        return self
    
    def transform(self, X):
        df = pd.DataFrame(X, columns=self.columns).copy()
        df = df.clip(self.q_low,self.q_high, axis = 1)
        return df.values

In [19]:
df = pd.read_csv('./data/eda_ready_boston_house_prediction.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0,-0.486575,0.289983,-1.294952,0,0.538,0.424404,-0.120442,0.151909,-0.982843,-0.66693,-1.465973,-1.083791,24.0
1,1,-0.484591,-0.489582,-0.595203,0,0.469,0.197414,0.367109,0.579619,-0.867883,-0.989077,-0.304252,-0.495454,21.6
2,2,-0.484594,-0.489582,-0.595203,0,0.469,1.323519,-0.266352,0.579619,-0.867883,-0.989077,-0.304252,-1.218147,34.7
3,3,-0.483859,-0.489582,-1.314063,0,0.458,1.047889,-0.810843,1.113635,-0.752922,-1.10839,0.113968,-1.372303,33.4
4,4,-0.478553,-0.489582,-1.314063,0,0.458,1.267508,-0.511907,1.113635,-0.752922,-1.10839,0.113968,-1.034292,36.2


In [20]:
df.columns

Index(['Unnamed: 0', 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
       'RAD', 'TAX', 'PTRATIO', 'LSTAT', 'MEDV'],
      dtype='object')

In [21]:
df.drop(['Unnamed: 0'], axis = 1, inplace=True)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,-0.486575,0.289983,-1.294952,0,0.538,0.424404,-0.120442,0.151909,-0.982843,-0.66693,-1.465973,-1.083791,24.0
1,-0.484591,-0.489582,-0.595203,0,0.469,0.197414,0.367109,0.579619,-0.867883,-0.989077,-0.304252,-0.495454,21.6
2,-0.484594,-0.489582,-0.595203,0,0.469,1.323519,-0.266352,0.579619,-0.867883,-0.989077,-0.304252,-1.218147,34.7
3,-0.483859,-0.489582,-1.314063,0,0.458,1.047889,-0.810843,1.113635,-0.752922,-1.10839,0.113968,-1.372303,33.4
4,-0.478553,-0.489582,-1.314063,0,0.458,1.267508,-0.511907,1.113635,-0.752922,-1.10839,0.113968,-1.034292,36.2


In [22]:
# train-test-split
X = df.drop(['MEDV'], axis = 1)
y = df['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [23]:
# importing rf model
rf = RandomForestRegressor(
    n_estimators= 100,
    random_state=42,
    )

In [24]:
pipe = Pipeline(steps=[
    ('winsor', QuantileClipper()),
    ('scale', StandardScaler()),
    ('model', rf),
])

In [25]:
pipe.fit(X_train, y_train)

In [26]:
y_pred = pipe.predict(X_test)
print('Random Forest Regressor')
print(f'RMSE: {root_mean_squared_error(y_test, y_pred)}\nMSE: {mean_squared_error(y_test, y_pred)}\nMAE: {mean_absolute_error(y_test, y_pred)}')
print("r2 score for rf: ",r2_score(y_test, y_pred))
print('|----------------------------------------------|')


Random Forest Regressor
RMSE: 3.1214967890336474
MSE: 9.74374220394737
MAE: 2.082072368421053
r2 score for rf:  0.8692344999266243
|----------------------------------------------|


In [27]:
df.describe

<bound method NDFrame.describe of          CRIM        ZN     INDUS  CHAS    NOX        RM       AGE       DIS  \
0   -0.486575  0.289983 -1.294952     0  0.538  0.424404 -0.120442  0.151909   
1   -0.484591 -0.489582 -0.595203     0  0.469  0.197414  0.367109  0.579619   
2   -0.484594 -0.489582 -0.595203     0  0.469  1.323519 -0.266352  0.579619   
3   -0.483859 -0.489582 -1.314063     0  0.458  1.047889 -0.810843  1.113635   
4   -0.478553 -0.489582 -1.314063     0  0.458  1.267508 -0.511907  1.113635   
..        ...       ...       ...   ...    ...       ...       ...       ...   
501 -0.479482 -0.489582  0.119246     0  0.573  0.450935  0.018349 -0.633876   
502 -0.481993 -0.489582  0.119246     0  0.573 -0.246247  0.288816 -0.727064   
503 -0.479752 -0.489582  0.119246     0  0.573  1.015461  0.797720 -0.785581   
504 -0.472688 -0.489582  0.119246     0  0.573  0.747201  0.737220 -0.677617   
505 -0.481683 -0.489582  0.119246     0  0.573 -0.378904  0.434725 -0.621002   

     

In [28]:
new_data_df = pd.DataFrame([{
    "CRIM": 0.06905,
    "ZN": 0.0,
    "INDUS": 2.18,
    "CHAS": 0,
    "NOX": 0.458,
    "RM": 7.147,
    "AGE": 54.2,
    "DIS": 6.0622,
    "RAD": 3,
    "TAX": 222.0,
    "PTRATIO": 18.7,
    "LSTAT": 5.33
}])

In [29]:
import warnings
warnings.filterwarnings("ignore")

In [30]:
# checking on new dataset
y_new = pipe.predict(new_data_df)[0]
print("Predicted MEDV ($1000s):", y_new)

Predicted MEDV ($1000s): 25.109000000000016


In [31]:
# I have tried predicting for the 4th sample
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,-0.486575,0.289983,-1.294952,0,0.538,0.424404,-0.120442,0.151909,-0.982843,-0.66693,-1.465973,-1.083791,24.0
1,-0.484591,-0.489582,-0.595203,0,0.469,0.197414,0.367109,0.579619,-0.867883,-0.989077,-0.304252,-0.495454,21.6
2,-0.484594,-0.489582,-0.595203,0,0.469,1.323519,-0.266352,0.579619,-0.867883,-0.989077,-0.304252,-1.218147,34.7
3,-0.483859,-0.489582,-1.314063,0,0.458,1.047889,-0.810843,1.113635,-0.752922,-1.10839,0.113968,-1.372303,33.4
4,-0.478553,-0.489582,-1.314063,0,0.458,1.267508,-0.511907,1.113635,-0.752922,-1.10839,0.113968,-1.034292,36.2


In [32]:
# exporting the pipeline
import os

file_name = 'bhp_pipeline.joblib'

if os.path.exists(file_name):
    print('File already exists')
else:
    joblib.dump(pipe, 'bhp_pipeline.joblib')
    
# later we import this pipeline as
# pipe = joblib.load('bhp_pipeline.joblib')

File already exists
