## Data reading and Cleaning

In [185]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from scipy.stats import randint
import seaborn as sns
from sklearn.feature_selection import chi2, VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder , LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
# reading
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train.csv") 

In [186]:
data.head()
#data = data.set_index('Id')

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Analysing if there is any null values

pd.options.display.max_rows=None ## shows all the rows
data.isnull().sum()

In [188]:
## Percentage of missing values in Alley,PoolQC,Fence,MiscFeature
 
P1 =data[['Alley','PoolQC','Fence','MiscFeature','FireplaceQu']]
per =P1.isnull().sum()/len(P1)*100
per

Alley          93.767123
PoolQC         99.520548
Fence          80.753425
MiscFeature    96.301370
FireplaceQu    47.260274
dtype: float64

In [189]:
#Percentage of missing values is more then 80%

data=data.drop(['Alley','PoolQC','Fence','MiscFeature', 'Id'], axis = 1)
data.shape

(1460, 76)

In [None]:
data['MSZoning'].apply(str)

In [191]:
#sns.distplot(data.GarageYrBlt)
#sns.distplot(data.LotFrontage)
#plt.show()

In [192]:
#y = data['SalePrice'].copy()
#X = data.drop(columns=['SalePrice']).copy()

In [193]:
X= data
y= X.pop('SalePrice')

PreProcessing

In [194]:
X_num_col = X.select_dtypes(include="number").copy().columns
X_cat_col = X.select_dtypes(exclude="number").copy().columns

In [195]:
num_pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    MinMaxScaler())
    #StandardScaler())

cat_pipe = make_pipeline(
    #SimpleImputer(strategy="constant", fill_value='NaN'),
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(drop="first", handle_unknown="ignore", sparse=False))
    #OneHotEncoder(drop="first"))    

In [196]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", num_pipe, X_num_col),
        ("cat_pipe", cat_pipe, X_cat_col)])

In [197]:
X_new = preprocessor.fit_transform(X)

In [198]:
X_new.shape

(1460, 236)

In [199]:
X_df = pd.DataFrame(X_new)
X_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,226,227,228,229,230,231,232,233,234,235
0,0.235294,0.150685,0.03342,0.666667,0.5,0.949275,0.883333,0.1225,0.125089,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.202055,0.038795,0.555556,0.875,0.753623,0.433333,0.0,0.173281,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.235294,0.160959,0.046507,0.666667,0.5,0.934783,0.866667,0.10125,0.086109,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.294118,0.133562,0.038561,0.666667,0.5,0.311594,0.333333,0.0,0.038271,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.235294,0.215753,0.060576,0.777778,0.5,0.927536,0.833333,0.21875,0.116052,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [200]:
my_scaler = MinMaxScaler()

X_scaled = my_scaler.fit_transform(X_df)

X_scaled = pd.DataFrame(X_scaled, columns=X_df.columns)
X_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,226,227,228,229,230,231,232,233,234,235
0,0.235294,0.150685,0.03342,0.666667,0.5,0.949275,0.883333,0.1225,0.125089,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.202055,0.038795,0.555556,0.875,0.753623,0.433333,0.0,0.173281,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.235294,0.160959,0.046507,0.666667,0.5,0.934783,0.866667,0.10125,0.086109,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.294118,0.133562,0.038561,0.666667,0.5,0.311594,0.333333,0.0,0.038271,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.235294,0.215753,0.060576,0.777778,0.5,0.927536,0.833333,0.21875,0.116052,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [None]:
X_scaled.var().sort_values()

In [218]:
VAR= VarianceThreshold(threshold = 0.001)  #Removing both constant and quasi-constant
VAR.fit(X_scaled)                      # dropping coloumn based on the low variance(threshold)

VAR.get_support()
#array([False,  True,  True,  True,  True,  True,  True,  True, False])
X_var = VAR.fit_transform(X_scaled)
X_var.shape

(1460, 220)

In [220]:
X_var_scaled = pd.DataFrame(X_var)

In [221]:
print("shape before:", X_scaled.shape)
print("shape after:", X_var_scaled.shape)

shape before: (1460, 236)
shape after: (1460, 220)


## Spliting data

In [222]:
X_train, X_test, y_train, y_test = train_test_split(X_var_scaled, y, train_size=.8, random_state=1230000)

##Liner Regression

In [233]:
LR= LinearRegression()
LR.fit(X_train,y_train)  ## fitting the training data

X_test_pred_LR=LR.predict(X_test)  ## predicted x test

In [234]:
X_train_pred_LR=LR.predict(X_train) ##predicted x train

X_train_pred_LR

array([109608.06312597, 193995.95920187, 133190.95908146, ...,
       190143.53372109,  51180.52654893, 116924.59292536])

In [237]:
print('LR trainind score is',LR.score(X_train,y_train))
print('LR testing score is',LR.score(X_test,y_test))

LR trainind score is 0.9152740790802847
LR testing score is 0.8537847913884321


Evaluation Matrix for LR

In [240]:
from sklearn.metrics import r2_score
train_score = r2_score(y_train,X_train_pred_LR)

print('LR r2_score for training is',train_score)

LR r2_score for training is 0.9152740790802847


In [241]:
test_score = r2_score(y_test,X_test_pred_LR)

print('LR r2_score for testing is',test_score)

LR r2_score for testing is 0.8537847913884321


Training score is more than testing score so the model is underfitting.

## Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor , DecisionTreeClassifier

# initialize transformers &amp; model
dtree = DecisionTreeClassifier()

dtree=DecisionTreeRegressor()
dtree.fit(X_train,y_train)

X_test_pred_dtree = dtree.predict(X_test) ## predicted x test

X_test_pred_dtree  ## predicted x test
 

In [265]:
x_train_pred_dtree = dtree.predict(X_train)  ## predicted x train

x_train_pred_dtree

array([113000., 188000., 128200., ..., 175000.,  73000., 119000.])

In [266]:
print('Training score for Decision Tree Regressor is',dtree.score(X_train,y_train))

Training score for Decision Tree Regressor is 1.0


In [267]:
print('Testing score for Decision Tree Regressor is',dtree.score(X_test,y_test))

Testing score for Decision Tree Regressor is 0.7925835046676964


Training score is more than the Testing score,so the decision tree model is underfitting,So we can do hyper parametric tuning

In [268]:
parameters={"splitter":["best","random"],
            "max_depth" : [2,4,6,8],
           "min_samples_leaf":[1,2,3,4,5,],
           "max_features":["auto","sqrt"],
           "max_leaf_nodes":[5,10,15] }

In [269]:
hyper_tuning_dtree = RandomizedSearchCV(estimator=dtree, param_distributions = parameters,
                               cv = 2, n_iter = 10, n_jobs=-1)

In [270]:
hyper_tuning_dtree.fit(X_train, y_train)

RandomizedSearchCV(cv=2, estimator=DecisionTreeRegressor(), n_jobs=-1,
                   param_distributions={'max_depth': [2, 4, 6, 8],
                                        'max_features': ['auto', 'sqrt'],
                                        'max_leaf_nodes': [5, 10, 15],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'splitter': ['best', 'random']})

In [271]:
hyper_tuning_dtree.best_params_

{'max_depth': 4,
 'max_features': 'auto',
 'max_leaf_nodes': 10,
 'min_samples_leaf': 1,
 'splitter': 'best'}

In [273]:
hypertuning_dtree = DecisionTreeRegressor(splitter= 'best',
                                          min_samples_leaf=5,
                                          max_leaf_nodes=15,
                                          max_features='sqrt',
                                          max_depth=6)

hypertuning_dtree

DecisionTreeRegressor(max_depth=6, max_features='sqrt', max_leaf_nodes=15,
                      min_samples_leaf=5)

In [274]:
hypertuning_dtree.fit(X_train,y_train)

DecisionTreeRegressor(max_depth=6, max_features='sqrt', max_leaf_nodes=15,
                      min_samples_leaf=5)

In [None]:
dtree_hyper_pred_Xtest =hypertuning_dtree.predict(X_test)

dtree_hyper_pred_Xtest

In [278]:
dtree_hyper_pred_Xtrain =hypertuning_dtree.predict(X_train)

dtree_hyper_pred_Xtrain

array([135229.19178082, 174218.0776699 , 110288.73255814, ...,
       188349.58552632, 110288.73255814, 135229.19178082])

In [279]:
print('Training score for hyper parametric tuning of Decision tree regressor is',hypertuning_dtree.score(X_train,y_train))

Training score for hyper parametric tuning of Decision tree regressor is 0.7021017365802271


In [280]:
print('Testing score for hyper parametric tuning of Decision tree regressor is',hypertuning_dtree.score(X_test,y_test))

Testing score for hyper parametric tuning of Decision tree regressor is 0.6522797001365142


## RandomForestRegressor


In [242]:
from sklearn.ensemble import RandomForestRegressor

RF=RandomForestRegressor()
RF.fit(X_train,y_train) ## fitting the data

x_test_pred_RF = RF.predict(X_test)  ## predicted x test

In [245]:
X_train_pred_RF = LR.predict(X_train)  ## predicted x train
X_train_pred_RF

array([109608.06312597, 193995.95920187, 133190.95908146, ...,
       190143.53372109,  51180.52654893, 116924.59292536])

In [246]:
print('RF score for training is',RF.score(X_train,y_train))
print('RF score for testing is',RF.score(X_test,y_test))

RF score for training is 0.9774477392907993
RF score for testing is 0.8735962229418293


Training score is more than Testing score,so the Random Forest Regressor model is underfitting.So we can do hyper parametric tuning.

Hyper parametric Tuning -- Random Forest Regressor

In [248]:
from sklearn.model_selection import RandomizedSearchCV

In [249]:
random_grid = {'n_estimators': [100,200,300,400,500,600],  ## no. of trees
               'max_features': ['auto', 'sqrt'],  
               'max_depth': [10, 15,20,25], ## maxinum number of levels in trees 
               'min_samples_split':  [2, 5, 10], ## minimum number of samples required to split a node
               'min_samples_leaf': [1, 2, 4], ## Minimum number of samples required at each leaf node
               'bootstrap': [True, False]}  ##  Method of selecting samples for training each tree

In [250]:
hyper_tuning = RandomizedSearchCV(estimator=RF,
                                  param_distributions=random_grid,
                                  n_iter=10,
                                  cv=5,
                                  verbose=5,
                                  random_state=2)

In [None]:
hyper_tuning.fit(X_train,y_train)

In [253]:
hyper_tuning.best_params_ 

{'bootstrap': True,
 'max_depth': 20,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 100}

In [256]:
RF_hyper_tuning=RandomForestRegressor(n_estimators=100,
                                      min_samples_split=5,
                                      min_samples_leaf= 4,
                                      max_features='auto',
                                      max_depth=20,
                                      bootstrap= True)  ## implementing the best parameters

In [257]:
RF_hyper_tuning.fit(X_train,y_train)

RandomForestRegressor(max_depth=20, min_samples_leaf=4, min_samples_split=5)

In [None]:
RF_hyper_pred_test = RF_hyper_tuning.predict(X_test) ## predicted hyperparametric tuning x test

RF_hyper_pred_test

In [None]:
RF_hyper_pred_train = RF_hyper_tuning.predict(X_train)  ## predicted hyper parametric tuning x train

RF_hyper_pred_train

In [260]:
print('RF hyper Parametric Tunning score for training is ',RF_hyper_tuning.score(X_train,y_train))

RF hyper Parametric Tunning score for training is  0.9425364883735512


In [261]:
print('RF hyper Parametric Tunning score for testing is ',RF_hyper_tuning.score(X_test,y_test))

RF hyper Parametric Tunning score for testing is  0.8681653449311096


In [285]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

In [342]:
XGB=XGBRegressor()
XGB.fit(X_train,y_train) 

Xtest_XGB_pred=XGB.predict(X_test)



In [None]:
Xtest_XGB_pred

In [345]:
Xtrain_XGB_pred=XGB.predict(X_train)

In [346]:
print('Training score for XGB is',XGB.score(X_train,y_train))
print('Training score for XGB is',XGB.score(X_test,y_test))

Training score for XGB is 0.9670979276837492
Training score for XGB is 0.8979746026807861


In [337]:
# creating 5 different models
RF = RandomForestRegressor().fit(X_train, y_train)
DT = DecisionTreeRegressor().fit(X_train, y_train)
GBR = GradientBoostingRegressor().fit(X_train, y_train)
LR = LinearRegression().fit(X_train, y_train)
XGB = XGBRegressor().fit(X_train, y_train)



In [338]:
# the evaluation metrics
models = [LR, DT, RF, GBR, XGB]
RMSE = [mean_squared_error(y_test, mod.predict(X_test))**0.5 for mod in models]
MAPE = [mean_absolute_percentage_error(y_test, mod.predict(X_test)) for mod in models]
R2_Score = [r2_score(y_test, mod.predict(X_test)) for mod in models]

###  the evaluation metrics for train
models = [LR, DT, RF, GBR, XGB]

RMSE = [mean_squared_error(y_train, mod.predict(X_train))**0.5 for mod in models]

MAPE = [mean_absolute_percentage_error(y_train, mod.predict(X_train)) for mod in models]

R2_Score = [r2_score(y_train, mod.predict(X_train)) for mod in models]

	Models	RMSE	MAPE	R2_Score
0	Linear Regression	23290.665819	0.088084	**0.915274**

1	Decision Tree	0.000000	0.000000	**1.000000**

2	Random Forest	11702.259284	0.038881  **0.978611**

3	Gradient Boosting	13821.185287	0.063030	**0.970164**

4	XgBoost	14513.941936	0.064766	**0.967098**

In [339]:
# comparing 5 models
Models = ['Linear Regression','Decision Tree','Random Forest','Gradient Boosting','XgBoost']
evaluation = pd.DataFrame({'Models':Models,'RMSE':RMSE,'MAPE':MAPE, 'R2_Score':R2_Score})

In [340]:
evaluation

Unnamed: 0,Models,RMSE,MAPE,R2_Score
0,Linear Regression,29389.952749,0.107984,0.853785
1,Decision Tree,37135.615004,0.14565,0.76656
2,Random Forest,27388.376791,0.095856,0.873022
3,Gradient Boosting,24317.093815,0.086227,0.899904
4,XgBoost,24550.297604,0.088531,0.897975
