In [1]:
# Importing necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer,mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor,BaggingRegressor
from sklearn.linear_model import LinearRegression
import math
from mlxtend.regressor import StackingRegressor

In [2]:
#Loading the house prices dataset for the mini-challenge
df = pd.read_csv('../data/house_prices.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,SC60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,0,No,No,No,0,Feb,2008,WD,Normal,208500
1,1,SC20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,0,No,No,No,0,May,2007,WD,Normal,181500
2,2,SC60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,0,No,No,No,0,Sep,2008,WD,Normal,223500
3,3,SC70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,0,No,No,No,0,Feb,2006,WD,Abnorml,140000
4,4,SC60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,0,No,No,No,0,Dec,2008,WD,Normal,250000


# Random Forest Solutions

<img src="../images/icon/ppt-icons.png" alt="Concept-Alert" style="width: 100px;float:left; margin-right:15px"/>
<br /> 

##  Mini-Challenge - 1
LabelEncode the categorical features.

In [3]:
categorical_features = list(set(df.columns)-set(df._get_numeric_data().columns))
label_enc = LabelEncoder()
for feature in categorical_features:
    df[feature] = label_enc.fit_transform(df[feature].astype(str))
categorical_features = list(set(df.columns)-set(df._get_numeric_data().columns))

<img src="../images/icon/ppt-icons.png" alt="Concept-Alert" style="width: 100px;float:left; margin-right:15px"/>
<br /> 

##  Mini-Challenge - 2
Split the features and target variable and then split the data into train and test.
The 'SalePrice' feature is the target variable.

In [4]:
X = df.drop('SalePrice',axis=1)
y = df['SalePrice']
X_train,X_test,y_train,y_test = train_test_split(X,y)

<img src="../images/icon/ppt-icons.png" alt="Concept-Alert" style="width: 100px;float:left; margin-right:15px"/>
<br /> 

##  Mini-Challenge - 3
Fit a RandomForest Regressor and tune the parameters using GridSearchCV.
Search for the best set of features amongst the following list of features and their value.
'n_estimators': [10,20,30],
'max_depth': [6,8],
'min_samples_split': [10,15],

In [5]:
#RandomForestRegressor tuned using GridSearchCV
regressor = RandomForestRegressor(random_state=9)
params = {
            'n_estimators': [10,20,30],
            'max_depth': [6,8],
            'min_samples_split': [10,15]
             }

search = GridSearchCV(regressor,param_grid=params,cv=3)
search.fit(X_train,y_train)

GridSearchCV(cv=3, estimator=RandomForestRegressor(random_state=9),
             param_grid={'max_depth': [6, 8], 'min_samples_split': [10, 15],
                         'n_estimators': [10, 20, 30]})

<img src="../images/icon/ppt-icons.png" alt="Concept-Alert" style="width: 100px;float:left; margin-right:15px"/>
<br /> 

##  Mini-Challenge - 4
Predict on the test data and calculate the mean-squared-error of your model.

In [6]:
y_pred = search.predict(X_test)
mean_squared_error(y_test,y_pred)

1036901173.5468004

# Ensemble Methods Solutions

In [7]:
#Loading the house prices dataset for the mini-challenge
df = pd.read_csv('../data/house_prices.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,SC60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,0,No,No,No,0,Feb,2008,WD,Normal,208500
1,1,SC20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,0,No,No,No,0,May,2007,WD,Normal,181500
2,2,SC60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,0,No,No,No,0,Sep,2008,WD,Normal,223500
3,3,SC70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,0,No,No,No,0,Feb,2006,WD,Abnorml,140000
4,4,SC60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,0,No,No,No,0,Dec,2008,WD,Normal,250000


<img src="../images/icon/ppt-icons.png" alt="Concept-Alert" style="width: 100px;float:left; margin-right:15px"/>
<br /> 

##  Mini-Challenge - 1
LabelEncode the categorical features and split the dataset into features and target variable('SalePrice') then split it into train and test.

In [8]:
categorical_features = list(set(df.columns)-set(df._get_numeric_data().columns))
label_enc = LabelEncoder()
for feature in categorical_features:
    df[feature] = label_enc.fit_transform(df[feature].astype(str))
categorical_features = list(set(df.columns)-set(df._get_numeric_data().columns))
X = df.drop('SalePrice',axis=1)
y = df['SalePrice']
X_train,X_test,y_train,y_test = train_test_split(X,y)

<img src="../images/icon/ppt-icons.png" alt="Concept-Alert" style="width: 100px;float:left; margin-right:15px"/>
<br /> 

##  Mini-Challenge - 2
Fit a GradientBoosting Regressor on the above model and predict on the test data and calculate the mean-squared-error.

In [9]:
gb_regressor = GradientBoostingRegressor()
gb_regressor.fit(X_train,y_train)
y_pred_gb = gb_regressor.predict(X_test)
mean_squared_error(y_test,y_pred_gb)

2071415212.3843338

<img src="../images/icon/ppt-icons.png" alt="Concept-Alert" style="width: 100px;float:left; margin-right:15px"/>
<br /> 

##  Mini-Challenge - 3
Fit a Bagging  model on the above data and predict on the test data and calculate the mean-squared-error.

In [10]:
bag_regressor = BaggingRegressor()
bag_regressor.fit(X_train,y_train)
y_pred_bag = bag_regressor.predict(X_test)
mean_squared_error(y_test,y_pred_bag)

1902049915.4229045

<img src="../images/icon/ppt-icons.png" alt="Concept-Alert" style="width: 100px;float:left; margin-right:15px"/>
<br /> 

##  Mini-Challenge - 4
Stack the above models and predict on the test data.

In [11]:
lin_reg = LinearRegression()
regression_models = [bag_regressor,gb_regressor]
stacking_regressor = StackingRegressor(regressors=regression_models,meta_regressor=lin_reg)
stacking_regressor.fit(X_train, y_train)
y_pred_stack = stacking_regressor.predict(X_test)
mean_squared_error(y_test,y_pred_stack)

1903716252.5203714