In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, mean_squared_error, r2_score
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import StandardScaler


In [2]:
df = pd.read_csv('./data/eda_ready_boston_house_prediction.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0,-0.486575,0.289983,-1.294952,0,0.538,0.424404,-0.120442,0.151909,-0.982843,-0.66693,-1.465973,-1.083791,24.0
1,1,-0.484591,-0.489582,-0.595203,0,0.469,0.197414,0.367109,0.579619,-0.867883,-0.989077,-0.304252,-0.495454,21.6
2,2,-0.484594,-0.489582,-0.595203,0,0.469,1.323519,-0.266352,0.579619,-0.867883,-0.989077,-0.304252,-1.218147,34.7
3,3,-0.483859,-0.489582,-1.314063,0,0.458,1.047889,-0.810843,1.113635,-0.752922,-1.10839,0.113968,-1.372303,33.4
4,4,-0.478553,-0.489582,-1.314063,0,0.458,1.267508,-0.511907,1.113635,-0.752922,-1.10839,0.113968,-1.034292,36.2


In [3]:
df.columns

Index(['Unnamed: 0', 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
       'RAD', 'TAX', 'PTRATIO', 'LSTAT', 'MEDV'],
      dtype='object')

In [4]:
df.drop(['Unnamed: 0'], axis = 1, inplace=True)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,-0.486575,0.289983,-1.294952,0,0.538,0.424404,-0.120442,0.151909,-0.982843,-0.66693,-1.465973,-1.083791,24.0
1,-0.484591,-0.489582,-0.595203,0,0.469,0.197414,0.367109,0.579619,-0.867883,-0.989077,-0.304252,-0.495454,21.6
2,-0.484594,-0.489582,-0.595203,0,0.469,1.323519,-0.266352,0.579619,-0.867883,-0.989077,-0.304252,-1.218147,34.7
3,-0.483859,-0.489582,-1.314063,0,0.458,1.047889,-0.810843,1.113635,-0.752922,-1.10839,0.113968,-1.372303,33.4
4,-0.478553,-0.489582,-1.314063,0,0.458,1.267508,-0.511907,1.113635,-0.752922,-1.10839,0.113968,-1.034292,36.2


In [5]:
# train-test-split
X = df.drop(['MEDV'], axis = 1)
y = df['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [6]:
# importing models

lin_reg = LinearRegression()
rf = RandomForestRegressor(max_depth=100)

lin_reg.fit(X_train, y_train)
rf.fit(X_train, y_train)

In [7]:
y_pred_lin_reg = lin_reg.predict(X_test) 
y_pred_rf = rf.predict(X_test)

In [8]:
print('Linear Regression')
print('|----------------------------------------------|')
print(f'RMSE: {root_mean_squared_error(y_test, y_pred_lin_reg)}\nMSE: {mean_squared_error(y_test, y_pred_lin_reg)}\nMAE: {mean_absolute_error(y_test, y_pred_lin_reg)}')
print('----------------------------------------------')
print('Random Forest Regressor')
print(f'RMSE: {root_mean_squared_error(y_test, y_pred_rf)}\nMSE: {mean_squared_error(y_test, y_pred_rf)}\nMAE: {mean_absolute_error(y_test, y_pred_rf)}')
print('|----------------------------------------------|')

Linear Regression
|----------------------------------------------|
RMSE: 4.563891008481587
MSE: 20.829101137299077
MAE: 3.159062104399312
----------------------------------------------
Random Forest Regressor
RMSE: 3.1317759475298224
MSE: 9.808020585526316
MAE: 2.0588092105263165
|----------------------------------------------|


In [9]:
print("r2 score for lr: ",r2_score(y_test, y_pred_lin_reg))
print("r2 score for rf: ",r2_score(y_test, y_pred_rf))

r2 score for lr:  0.7204638865348469
r2 score for rf:  0.8683718544937769


In [10]:
df.describe

<bound method NDFrame.describe of          CRIM        ZN     INDUS  CHAS    NOX        RM       AGE       DIS  \
0   -0.486575  0.289983 -1.294952     0  0.538  0.424404 -0.120442  0.151909   
1   -0.484591 -0.489582 -0.595203     0  0.469  0.197414  0.367109  0.579619   
2   -0.484594 -0.489582 -0.595203     0  0.469  1.323519 -0.266352  0.579619   
3   -0.483859 -0.489582 -1.314063     0  0.458  1.047889 -0.810843  1.113635   
4   -0.478553 -0.489582 -1.314063     0  0.458  1.267508 -0.511907  1.113635   
..        ...       ...       ...   ...    ...       ...       ...       ...   
501 -0.479482 -0.489582  0.119246     0  0.573  0.450935  0.018349 -0.633876   
502 -0.481993 -0.489582  0.119246     0  0.573 -0.246247  0.288816 -0.727064   
503 -0.479752 -0.489582  0.119246     0  0.573  1.015461  0.797720 -0.785581   
504 -0.472688 -0.489582  0.119246     0  0.573  0.747201  0.737220 -0.677617   
505 -0.481683 -0.489582  0.119246     0  0.573 -0.378904  0.434725 -0.621002   

     

In [11]:
new_data_df = pd.DataFrame([{
    "CRIM": 0.06905,
    "ZN": 0.0,
    "INDUS": 2.18,
    "CHAS": 0,
    "NOX": 0.458,
    "RM": 7.147,
    "AGE": 54.2,
    "DIS": 6.0622,
    "RAD": 3,
    "TAX": 222.0,
    "PTRATIO": 18.7,
    "LSTAT": 5.33
}])

In [12]:
# applying winsorization and scaling on new dataset
new_data_df_wins = new_data_df.copy()
for col in new_data_df_wins.columns:
    new_data_df_wins[col] = winsorize(new_data_df_wins[col], limits=[0.01, 0.01])
        
new_data_df_wins

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT
0,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,5.33


In [13]:
# scaling with the SAME scaler I used on training data
cols_to_scale = ['CRIM','ZN','INDUS', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']
scaler = StandardScaler()
new_data_df_wins[cols_to_scale] = scaler.fit_transform(new_data_df_wins[cols_to_scale])
new_data_df_wins

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT
0,0.0,0.0,0.0,0,0.458,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
import warnings
warnings.filterwarnings("ignore")

In [15]:
# checking the data prediction based on data from the dataset for now
y_pred_lr = lin_reg.predict(new_data_df_wins)
y_pred_rf = rf.predict(new_data_df_wins)
print("Predicted price (LR):", y_pred_lr[0])
print("Predicted price (RF):", y_pred_rf[0])

Predicted price (LR): 23.850818921221727
Predicted price (RF): 21.872999999999998


In [16]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,-0.486575,0.289983,-1.294952,0,0.538,0.424404,-0.120442,0.151909,-0.982843,-0.66693,-1.465973,-1.083791,24.0
1,-0.484591,-0.489582,-0.595203,0,0.469,0.197414,0.367109,0.579619,-0.867883,-0.989077,-0.304252,-0.495454,21.6
2,-0.484594,-0.489582,-0.595203,0,0.469,1.323519,-0.266352,0.579619,-0.867883,-0.989077,-0.304252,-1.218147,34.7
3,-0.483859,-0.489582,-1.314063,0,0.458,1.047889,-0.810843,1.113635,-0.752922,-1.10839,0.113968,-1.372303,33.4
4,-0.478553,-0.489582,-1.314063,0,0.458,1.267508,-0.511907,1.113635,-0.752922,-1.10839,0.113968,-1.034292,36.2
