In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

import matplotlib.pyplot as plt

from scipy import stats
import math
import random

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

from xgboost import XGBRegressor

In [2]:
def giveMeWrangledData(fname):
    df = pd.read_csv(fname)
    
    df = df.drop(['Id', 'GarageYrBlt'],axis=1)
    
    df['LotFrontage'] =df.LotFrontage.fillna(df.LotFrontage.mode()[0])
    df['MasVnrArea']=df.MasVnrArea.fillna(0.0)
    
    #convert data type
    #we are being little lineant to give int64 for YearBuilt, YrSold but those guys are going to be box-coxed 
    #so let them at least enjoy the bigger size for now
    int64_variables = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', \
                     'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', \
                     'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',\
                     'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', \
                     'PoolArea', 'MiscVal', 'YrSold', 'SalePrice']
    
    for c in int64_variables:
        df[c] = df[c].astype(np.int64)
        
    int_to_categorical_variables = ['MSSubClass', 'OverallQual', 'OverallCond', 'FireplaceQu', 'MoSold']
    for c in int_to_categorical_variables:
        df[c] = df[c].astype(str)
        
    df = df.fillna('NotAvailable')
    return df
df = giveMeWrangledData('train.csv')
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65,8450,Pave,NotAvailable,Reg,Lvl,AllPub,Inside,...,0,NotAvailable,NotAvailable,NotAvailable,0,2,2008,WD,Normal,208500
1,20,RL,80,9600,Pave,NotAvailable,Reg,Lvl,AllPub,FR2,...,0,NotAvailable,NotAvailable,NotAvailable,0,5,2007,WD,Normal,181500
2,60,RL,68,11250,Pave,NotAvailable,IR1,Lvl,AllPub,Inside,...,0,NotAvailable,NotAvailable,NotAvailable,0,9,2008,WD,Normal,223500
3,70,RL,60,9550,Pave,NotAvailable,IR1,Lvl,AllPub,Corner,...,0,NotAvailable,NotAvailable,NotAvailable,0,2,2006,WD,Abnorml,140000
4,60,RL,84,14260,Pave,NotAvailable,IR1,Lvl,AllPub,FR2,...,0,NotAvailable,NotAvailable,NotAvailable,0,12,2008,WD,Normal,250000


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
MSSubClass       1460 non-null object
MSZoning         1460 non-null object
LotFrontage      1460 non-null int64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null object
OverallCond      1460 non-null object
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 n

In [9]:
def preProcessData(df):
    
    print("Shape of the data set before pre processing : ", df.shape )

    
    #get dummies
    print("Categorical columns : ", list(df.select_dtypes(exclude=np.number)))
    df = pd.get_dummies(df)
    #df = df.drop(categorical_columns, axis=1)
    
    print("\n\nShape of the data set after pre processing : ", df.shape )
    
    print("Columns in the data set are : ",list(df))

    return df
df_prep = preProcessData(df)
df_prep.info()

Shape of the data set before pre processing :  (1460, 79)
Categorical columns :  ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'MoSold', 'SaleType', 'SaleCondition']


Shape of the data set after pre processing :  (1460, 346)
Columns in the data set are :  ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',

In [10]:
def newBoxCoxTranformation(df,target):
    
    #assuming that only numerical features are presented
    print("Shape of the dataset before transformation : ", df.shape)

    y = np.array(df[target].apply( lambda x: math.log(x)))
    X= df.drop(target,axis = 1)
    #x_columns = list(X)
    X = preprocessing.MinMaxScaler(feature_range=(1, 2)).fit_transform(X)
    X = preprocessing.power_transform( X, method='box-cox')
    #X = pd.DataFrame(X,columns=x_columns)
    print("Shape of the dataset after transformation : ", X.shape, y.shape)

    return X,y
X,y = newBoxCoxTranformation(df_prep,'SalePrice')

Shape of the dataset before transformation :  (1460, 346)


  return self.partial_fit(X, y)
  llf -= N / 2.0 * np.log(np.sum((y - y_mean)**2. / N, axis=0))


Shape of the dataset after transformation :  (1460, 345) (1460,)


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.5, random_state=random.randint(1,500), stratify=df.BldgType)

In [23]:
reg = XGBRegressor()
reg.fit(X_train,y_train)
reg.score(X_test,y_test)

0.8642175829114891

In [24]:
reg

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [20]:
np.sqrt(mean_squared_log_error(y_test, reg.predict(X_test)))

0.01032466379192143

##### We need to have different pre-processing logic to test data. We will come back to it little later.

def checkTheTestFile():
    df = giveMeWrangledData('test.csv')
    print(df.info())
    df = preProcessData(df)
    print(df.info())
    X,y = newBoxCoxTranformation(df,'SalePrice')
    print(np.sqrt(mean_squared_log_error(y, reg.predict(X))))
checkTheTestFile()