# MGTF 495A: Project 3
# Housing Sale Price Prediction

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

In [2]:
# Read data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
#check the numbers of samples and features
print("The train data size before dropping Id feature is : {} ".format(train.shape))
print("The test data size before dropping Id feature is : {} ".format(test.shape))

#Save the 'Id' column
train_ID = train['ID']
test_ID = test['ID']

#Now drop the  'Id' colum since it's unnecessary for  the prediction process.
train.drop("ID", axis = 1, inplace = True)
test.drop("ID", axis = 1, inplace = True)

#check again the data size after dropping the 'Id' variable
print("\nThe train data size after dropping Id feature is : {} ".format(train.shape)) 
print("The test data size after dropping Id feature is : {} ".format(test.shape))

The train data size before dropping Id feature is : (2051, 81) 
The test data size before dropping Id feature is : (879, 80) 

The train data size after dropping Id feature is : (2051, 80) 
The test data size after dropping Id feature is : (879, 79) 


## `SalePrice` Baseline 
### Predict the global average sale price.

In [4]:
price = train["SalePrice"]
del train["SalePrice"]

In [5]:
# get columne names for str columns and num columns

# ID not included
col = list(train)
str_col = []
num_col = []

for i in range(len(col)):
    for j in range(len(train)):
#         if train.iloc[j,i+1] is not np.nan:
        # contain letter
        if bool(re.search('[a-zA-Z]', str(train.iloc[j,i]))):
            str_col.append(col[i])
            break
        # contain digit, use int to transform float so we can use isdigit function
        if str(int(train.iloc[j,i])).isdigit():
            num_col.append(col[i])
            break 

In [6]:
from sklearn.preprocessing import StandardScaler
X_train_cat = train[str_col]
X_train_num = train[num_col]
X_test_cat = test[str_col]
X_test_num = test[num_col]

In [7]:
# get standardized numerice and character features

scaler_cat = make_pipeline(SimpleImputer(strategy='constant'), OneHotEncoder(handle_unknown='ignore'))
X_train_cat_enc = scaler_cat.fit_transform(X_train_cat)
X_test_cat_enc = scaler_cat.transform(X_test_cat)

scaler_num = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
X_train_num_scaled = scaler_num.fit_transform(X_train_num)
X_test_num_scaled = scaler_num.transform(X_test_num)

In [8]:
import numpy as np
from scipy import sparse

X_train_scaled = sparse.hstack((X_train_cat_enc,
                                sparse.csr_matrix(X_train_num_scaled))) # combine two types of features
X_test_scaled = sparse.hstack((X_test_cat_enc,
                               sparse.csr_matrix(X_test_num_scaled)))   # combine two types of features

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train_scaled, price, test_size=0.3, random_state=5)

In [10]:
def rmse1(y_true, y_pred):
    """Root Mean Squared Error"""
    from sklearn.metrics import mean_squared_error
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [52]:
# import models we are trying to use
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVC

models=[SVC(),RandomForestRegressor(),LinearRegression(),KNeighborsRegressor(),SVR(),Ridge(),Lasso(),MLPRegressor(alpha=20),DecisionTreeRegressor(),ExtraTreeRegressor(),RandomForestRegressor(),AdaBoostRegressor(),GradientBoostingRegressor(),BaggingRegressor()]


  and should_run_async(code)


In [82]:
# check performances of models with default parameterss
for model in models:
    model=model   
    model.fit(X_train, y_train)
    pred = model.predict(X_valid)
    s = rmse1(y_valid,pred)
    print(model,s)

  and should_run_async(code)


SVC() 68973.4127096444
RandomForestRegressor() 27522.183357087622
LinearRegression() 36057.028306169756
KNeighborsRegressor() 32187.31415866865
SVR() 90407.53851066259
Ridge() 30762.704716779754


  model = cd_fast.sparse_enet_coordinate_descent(


Lasso() 33234.03537995853




MLPRegressor(alpha=20) 215421.35205950215
DecisionTreeRegressor() 39315.24418624029
ExtraTreeRegressor() 42092.999353858075
RandomForestRegressor() 27426.05224232976
AdaBoostRegressor() 36890.72934654115
GradientBoostingRegressor() 24438.947811875038
BaggingRegressor() 28531.23080531694


In [119]:
# use GradientBoostingRegressor cause it has the lowest rmse
# find the best parameters
# parameters = { 'loss': ['ls', 'lad', 'huber', 'quantile'], 'n_estimators':[100,200,300], 'max_depth':[3,4,5], 'aplha':[0.8,0.9],}
a=pd.DataFrame(columns=['s', 'loss', 'n_estimators', 'max_depth','subsample','learning_rate'])
i=0
for loss in  ['ls', 'lad', 'huber', 'quantile']:
    for n in [100,200,300]:
        for max_d in [3,4,5]:
            for sub in [0.4, 0.7, 1]:
                for rate in [0.1, 0.2]:
                    clf = GradientBoostingRegressor(loss=loss, n_estimators=n, max_depth=max_d, subsample=sub, learning_rate=rate)
                    clf.fit(X_train, y_train)
                    pred = clf.predict(X_valid)
                    s = rmse1(y_valid,pred)
                    new = [s, loss, n, max_d,sub,rate]
                    a.loc[i]=new
                    i+=1

  and should_run_async(code)


In [120]:
a.sort_values(by="s", ascending=True).head()

  and should_run_async(code)


Unnamed: 0,s,loss,n_estimators,max_depth,subsample,alpha,learning_rate
104,22214.244084,ls,300,3,0.8,0.8,0.1
114,22521.415197,ls,300,4,0.4,0.9,0.1
106,22554.455101,ls,300,3,0.8,0.9,0.1
100,22647.450557,ls,300,3,0.6,0.8,0.1
120,22779.317273,ls,300,4,0.8,0.8,0.1


In [130]:
model= GradientBoostingRegressor(loss='ls', n_estimators=300,max_depth=3,subsample=0.4)
model.fit(X_train_scaled, price)
predicted= model.predict(X_test_scaled)

# Submission

In [131]:
# create submission file
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = predicted
sub.to_csv('baseline_submission.csv',index=False)

  and should_run_async(code)
