In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import datetime

import matplotlib.pyplot as plt
from IPython import display

from scipy import stats
import math
import random

from sklearn import preprocessing
from sklearn.model_selection import ShuffleSplit, train_test_split, cross_val_score, StratifiedShuffleSplit
from sklearn.metrics import  mean_squared_log_error

from xgboost import XGBRegressor

In [None]:
train_File = '../input/train.csv'
test_File = '../input/test.csv'

In [None]:
dd = display.display

# 1. Gather

In [None]:
def loadData():
    df_train = pd.read_csv(train_File)
    df_test = pd.read_csv(test_File)
    
    df = pd.concat([df_train, df_test], axis=0,sort=True,ignore_index=True)
    
    return df

df_before_clean = loadData()
dd(df_before_clean)

# 2. Assess Data : Inspecting Data for Quality and Tidiness Issues
#### 2.1 Quality Issues : Issues with content - missing, duplicate or incorrect data. a.k.a Dirty data 
* 2.1.a Completeness : *"Are there any rows, columns or cells missing values?"*
  * 35 columns have the missing values: 
  
  ['Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF', 'Electrical', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Functional', 'GarageArea', 'GarageCars', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'KitchenQual', 'LotFrontage', 'MSZoning', 'MasVnrArea', 'MasVnrType', 'MiscFeature', 'PoolQC', 'SalePrice', 'SaleType', 'TotalBsmtSF', 'Utilities']
######   
* 2.1.b Validity : *"Does the data comply to the data schema like duplicate patient id or zip code being < 5 digits or float data type?"*
######   
   * Following are Categorical Variables but currently are being considered as integer/float:

        * MSSubClass
        * OverallQual
        * OverallCond
        * FireplaceQu
        * MoSold
######   
   * Following variables are supposed to be Integer type but Box-Cox or Scaling will anyway type cast them to float:

      * LotFrontage, LotArea, YearBuilt, YearRemodAdd, MasVnrArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, Fireplaces, GarageCars, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, MiscVal, YrSold, SalePrice
   * GarageYrBlt will be NA when garage is not available for the house and hence this variable needs to be dropped as it does not comply to the schema.
   
######      
  
* 2.1.c Accuracy : *"Wrong data that is valid. like hieght = 300 inches; it still complies to the standard i.e. inches but data is in accurate."*
######   
    * MSZoning has 4 missing entries. Also, we do not have data samples to Agricultre (A),  Industrial (I), Residential Low Density Park (RP). Therefore, there exist a probability that missing entries will be replaced with wrong data which are valid for the variables.
######   
    * Similarly, Utilites has 2 missing entries. Also, we do not have data samples for NoSewr ((Electricity, Gas, and Water (Septic Tank)) and ELO (Electricity only)
######   
    * Exterior1st, has no missing values in training data set. But has no samples for 'Other' and 'PreCast' Material. Testing data has one missing sample for the variable.
######   
    * Exterior2nd has no missing values in the training data set. But has no samples for "PreCast" material. Testing data has one missing sample for the variable.
######   
    * MasVnrType has 8 missing values in training data set. It has total of 5 valid values.But there are no samples for "CBlock" (Cylinder block). One in testing set also has a record with missing value for this variable.
######   
    * ExterQual has no samples for "Po" but fortunately there are no missing values for it in both training or testing data.
######   
    * BsmtQual has 37 missing entries. It has no samples for "Po" ((Poor (<70 inches)). Testing data set has 46 missing entries.
######   
    * BsmtCond has 37 missing entries. It has no samples for "Ex" (Excellent). There are 46 missing entries in the testing dataset.
######   
    * KitchenQual has no training samples on "Po" but testing sample has a missing entry.
    * Functional has no training samples for "Po" but testing sample has a missing entry.
    * PoolQC has no samples for "Typ" but testing sample has missing values.
    * SaleType has no samples for "VWD" but testing sample has a missing value record.
######   
######   
* 2.1.d Consistency : *"Both valid and accurate but inconsistent. state = california and CA"*
######   
  * BsmtExposure has training samples as NA for both No Basement and also for missing values. There are also 2 testing samples with missing values as NA.
######   
  * BsmtFinType2 has training samples as NA for both No Basement and also for missing values.
######   
  * TotalBsmtSF has both 0 and NA representing as missing basement.
######   
  * BsmtExposure has NA for both missing and no basement houses.
  * BsmtFinType2 has NA for both missing and no basement for a house.
  


#### 2.2 Tidiness Issues: Issues with structure - untidy or messy data
* 2.2.a Each observation is a row
  * No Issues: Each observation is a unique house (no duplicate records)
######   
* 2.2.b Each variable is a column
  * No Issues: There are no colummns with multi data or concatenated data.
######  
* 2.2.c Each observational unit is a table
  * No Issues: There are no cross referring keys present in the table. Bsmt* and Garage* variables do form a logical group but there is no unique identities to the group.
######   

##### Hypothesis 1: Bsmt__ variables are NA when TotalBsmtSF is 0
##### Proof:
* BsmtFinType1 is NA when TotalBsmtSF is 0
* BsmtUnfSF is 0 whenever TotalBsmtSF is 0; Even in testing set it is NA only when TotalBsmtSF is NA
* BsmtFullBath is 0 whenever TotalBsmtSF is 0; Even in testin set it is NA only when TotalBsmtSF is NA or 0.
* BsmtHalfBath is 0 whenever TotalBsmtSF is 0; Even in testin set it is NA only when TotalBsmtSF is NA or 0.

##### Key Observations:
* Dataset has House Prices which were sold in between 2006 - 2010.
* Surprised to see no bathroom and no bedroom but with kitchen Houses !!! Where do they sleep and shit after the heavy meal ?
* NA value in GarageType can be easily mis interpreted as missing value. However, it is not true. NA in GarageType clearly indicates no garage because in both train and testing dataset GarageArea = 0 in all those cases. Similarly, GarageYrBlt, GarageFinish, GarageQual, GarageCond are also NA when GarageArea = 0. And GarageCars,GarageArea = 0 ==> GarageArea = 0.
* How do we verify NA in Fence as missing entry or No Fence ??
* How do we verify NA in MiscFeatures as None or missing value ?? Note that MiscVal is zero for NA, Othr & Shed.

### 2.1.a Completeness : *"Are there any rows, columns or cells missing values?"*

In [None]:
def missingValueAssessment(df):
    nan_columns = df.columns[df.isna().any()].tolist()
    print('NaN columns :', nan_columns, "\n# :", len(nan_columns))
    
    print("Duplicated rows count: ", df[df.duplicated()].shape)
    df = df.fillna('NA')
    print("Duplicated rows count: ", df[df.duplicated()].shape)
    
missingValueAssessment(df_before_clean)

### 2.2.a Each observation is a row

In [None]:
def checkHouseIsRepeated(df):
    df_temp = df.groupby(['SalePrice','GrLivArea','YearBuilt','YearRemodAdd']).agg('count').reset_index()[['SalePrice','GrLivArea','YearBuilt','YearRemodAdd','Id']]
    dd("Samples with same 'SalePrice','GrLivArea','YearBuilt','YearRemodAdd' : ",df_temp[df_temp.Id > 1])
    
    
checkHouseIsRepeated(df_before_clean)

* As there are no time series data: as in there is no variable indicating the time of the reading carried out, It is safe to assume the reading was done at one shot and there would not be any duplicate entries of a house.
* With the above assumption, group by 'SalePrice','GrLivArea','YearBuilt','YearRemodAdd' count indicates that there are no duplicate records.

In [None]:
df_before_clean.info()

# 3.0 Cleaning Data

##### Let us first do the cleaning activities where we have high confidence of imputing the values as listed in the above Assessment summary.

In [None]:
def cleanStage1(df):
    
    #convert data type
    #we are being little lineant to give float64 for YearBuilt, YrSold but those guys are going to be box-coxed 
    #so let them at least enjoy the bigger size for now
    float64_variables = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', \
                     'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', \
                     'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',\
                     'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', \
                     'PoolArea', 'MiscVal', 'YrSold', 'SalePrice']
    
    for c in float64_variables:
        df[c] = df[c].astype(np.float64)
    
    int_to_categorical_variables = ['MSSubClass', 'OverallQual', 'OverallCond', 'FireplaceQu', 'MoSold']
    for c in int_to_categorical_variables:
        df[c] = df[c].astype(str)
        
    
    #Blunt Initialization
    #df = df.fillna("NotAvailable")
    
    #TotalBsmtSF == NotAvailable
    df['TotalBsmtSF'] = df.TotalBsmtSF.fillna("NotAvailable")
    df['TotalBsmtSF'] = df.TotalBsmtSF.apply(lambda x: 0 if "NotAvailable" == x else x)
    
    #BsmtQual	BsmtCond	BsmtExposure	BsmtFinType1	BsmtFinType2
    df['BsmtQual'] = df.apply(lambda x: "NA" if x['TotalBsmtSF'] == 0 else x['BsmtQual'], axis=1)
    df['BsmtCond'] = df.apply(lambda x: "NA" if x['TotalBsmtSF'] == 0 else x['BsmtCond'], axis=1)
    df['BsmtExposure'] = df.apply(lambda x: "NA" if x['TotalBsmtSF'] == 0 else x['BsmtExposure'], axis=1)
    df['BsmtFinType1'] = df.apply(lambda x: "NA" if x['TotalBsmtSF'] == 0 else x['BsmtFinType1'], axis=1)
    df['BsmtFinType2'] = df.apply(lambda x: "NA" if x['TotalBsmtSF'] == 0 else x['BsmtFinType2'], axis=1)
    #BsmtFullBath	BsmtHalfBath
    df['BsmtFullBath'] = df.apply(lambda x: 0 if x['TotalBsmtSF'] == 0 else x['BsmtFullBath'], axis=1)
    df['BsmtHalfBath'] = df.apply(lambda x: 0 if x['TotalBsmtSF'] == 0 else x['BsmtHalfBath'], axis=1)
    
    #BsmtFinSF1	BsmtFinSF2	BsmtUnfSF
    df['BsmtFinSF1'] = df.apply(lambda x: 0 if x['TotalBsmtSF'] == 0 else x['BsmtFinSF1'], axis=1)
    df['BsmtFinSF2'] = df.apply(lambda x: 0 if x['TotalBsmtSF'] == 0 else x['BsmtFinSF2'], axis=1)
    df['BsmtUnfSF'] = df.apply(lambda x: 0 if x['TotalBsmtSF'] == 0 else x['BsmtUnfSF'], axis=1)
    
    #GarageYrBlt	GarageFinish GarageQual	GarageCond GarageCars	GarageArea
    #df['GarageType'] = df.GarageType.apply(lambda x: "NA" if "NotAvailable" == x else x)
    df['GarageType'] = df.GarageType.fillna("NA")
    #df['GarageYrBlt'] = df.apply(lambda x: "NA" if x['GarageType'] == "NA" else x['GarageYrBlt'], axis=1)
    df['GarageFinish'] = df.apply(lambda x: "NA" if x['GarageType'] == "NA" else x['GarageFinish'], axis=1)
    
    df['GarageQual'] = df.apply(lambda x: "NA" if x['GarageType'] == "NA" else x['GarageQual'], axis=1)
    df['GarageCond'] = df.apply(lambda x: "NA" if x['GarageType'] == "NA" else x['GarageCond'], axis=1)
    
    df['GarageCars'] = df.apply(lambda x: 0 if x['GarageType'] == "NA" else x['GarageCars'], axis=1)
    df['GarageArea'] = df.apply(lambda x: 0 if x['GarageType'] == "NA" else x['GarageArea'], axis=1)
    
    #drop obsolete columns
    df = df.drop(['Id','GarageYrBlt'], axis=1)
    df['SalePrice'] = df.SalePrice.fillna(0)
    return df

df_stage_1 = cleanStage1(df_before_clean.copy())
df_stage_1.info()

##### We had map when TotalBsmtSF =0 , there are couple of entries when it is TotalBsmtSF != 0. We may have to other Bsmt- attributes to impute the values for it but we will do such analysis after the straight forward missing value imputation.

In [None]:
df_stage_1[df_stage_1.BsmtQual.isna()]

##### We have now 27 columns to look after for the first round of cleaning.

In [None]:
len(df_stage_1.columns[df_stage_1.isna().any()].tolist()), df_stage_1.columns[df_stage_1.isna().any()].tolist()

In [None]:
def plotNAs(df):
    nan_columns = df.columns[df.isna().any()].tolist()
    #nan_columns.remove('SalePrice')
    for c in df.fillna('NotAvailable')[nan_columns]:
        df[[c,'SalePrice']].fillna('NotAvailable').\
        groupby(by=c).agg('count').\
        plot.bar(legend=None, title="Frequency Plot for "+c)
        plt.xticks(rotation=45)
        plt.show()
plotNAs(df_stage_1.copy())

### 1. Alley

In [None]:
def outerLandScape(df_temp):
    df = df_temp.copy()
    print("Initial Shape : ", df.shape)
    
    beyond_house = ['Neighborhood','Street', 'PavedDrive', 'Alley']
    df_temp = df.groupby(beyond_house).agg('count').reset_index()[['Neighborhood','Street', 'PavedDrive', 'Alley',"SalePrice"]]
    dd()
    
    def getAlley(Street, Neighborhood, PavedDrive):
        try:
            alley = df_temp[
                (df_temp['Street'] == Street ) &
                (df_temp['Neighborhood'] == Neighborhood ) &
                (df_temp['PavedDrive'] == PavedDrive) 
            ]['Alley'].tolist()[0]
        except:
            alley = 'NA'
        
        return alley
        
    
    df['Alley'] = df.Alley.fillna("NA")
    df[['Alley','SalePrice']].fillna(0).\
        groupby(by='Alley').agg('count').\
        plot.bar(legend=None, title="Frequency Plot for "+'Alley')
    plt.xticks(rotation=45)
    plt.show()
    
    na_alley_count = df[df.Alley == "NA"].shape[0]
    gr_alley_count = df[df.Alley == "Grvl"].shape[0]
    pa_alley_count = df[df.Alley == "Pave"].shape[0]
    
        
    df['Alley'] = df.apply( lambda x: getAlley (x['Street']
                                                ,x['Neighborhood']
                                                ,x['PavedDrive']
                                               ) if x['Alley'] == "NA" else x['Alley']
        ,axis=1)
    
    
    df[['Alley','SalePrice']].fillna(0).\
        groupby(by='Alley').agg('count').\
        plot.bar(legend=None, title="Frequency Plot for "+'Alley')
    plt.xticks(rotation=45)
    plt.show()
    
    
    print("Alley Snapshot Before : NA -", na_alley_count,"Grvl - ", gr_alley_count, "Pave - ", pa_alley_count)
    print("Alley Snapshot After  : NA -", df[df.Alley == "NA"].shape[0],
          "Grvl - ", df[df.Alley == "Grvl"].shape[0], "Pave - ", df[df.Alley == "Pave"].shape[0]
         )
    
    return df

df_alley = outerLandScape(df_stage_1)

In [None]:
def outerLandScape2():
    df_train = pd.read_csv(train_File)
    df_test = pd.read_csv(test_File)
    
    df = pd.concat([df_train, df_test], axis=0,sort=True,ignore_index=True)
    print("Initial Shape : ", df.shape)
    
    '''beyond_house = [ 'MSZoning','Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood',
                      'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'PavedDrive','SaleCondition','Fence', 'Alley']
    df_temp = df[beyond_house].copy()
    for c in beyond_house:
        df_temp = df_temp[df_temp[c].notna()]#.reset_index()
        print(c, df_temp.shape)'''
    #display.display(df_temp)
    
    #beyond_house = ['Neighborhood','Street', 'PavedDrive', 'Alley']
    #df = df.fillna('NNN')
    beyond_house = ['Neighborhood','Street','Alley']
    #beyond_house = ['Street', 'Alley']
    
    #display.display(df.groupby(beyond_house).agg('count').reset_index()[['Neighborhood','Street', 'PavedDrive', 'Alley','Id']])
    display.display(df.groupby(beyond_house).agg('count').reset_index()[['Neighborhood','Street', 'Alley','Id']])
    
    
outerLandScape2()

##### We can have the above table mapping missing Alley variable values. I am here assuming that Street and Alley would intersect. Therefore, there is a pattern with respect to location regarding the type of material used.

* However, the variable itself might not have such significance to target variable and hence can we can drop the imputation to it. The model which uses this variable would give lower significance level during training.

### 2. Electrical

In [None]:
def electricalWrangling(df_temp):
    df = df_temp.copy()
    df1 = df_temp.copy()
    print(df[df.Electrical.isna()].shape)
    df['Electrical'] = df.Electrical.fillna('NA')
    display.display(df[['Neighborhood','Electrical', 'SalePrice']].groupby(['Neighborhood','Electrical']).agg('count'))
    
    df1['Electrical'] = df1.Electrical.fillna('SBrkr')
    display.display(df1[['Neighborhood','Electrical', 'SalePrice']].groupby(['Neighborhood','Electrical']).agg('count'))
    return df1
    
df_electrical = electricalWrangling(df_stage_1.drop('Alley',axis=1))

##### At Timber, most of them have SBrkr as electrical system. So, it is a safe bet to have the missing entry replaced with 'SBrkr'

### 3. Exterior1st & Exterior2nd

In [None]:
def exteriorWrangle(df_temp):
    df = df_temp.copy()
    dd(df.shape)
    dd(df[df.Exterior1st == df.Exterior2nd].shape)
    dd(df[df.Exterior1st.isna()][['Exterior1st','Exterior2nd', 'Neighborhood','ExterQual', 'ExterCond', 'MSSubClass']])
    dd(df[df.Exterior2nd.isna()][['Exterior1st','Exterior2nd', 'Neighborhood','ExterQual', 'ExterCond','MSSubClass']])
    dt = df.\
       groupby(['Exterior1st','Exterior2nd', 'Neighborhood','ExterQual', 'ExterCond', 'MSSubClass']).\
       agg('count').reset_index().\
       sort_values(by=['MSSubClass','SalePrice'],ascending=False)[['Exterior1st','Exterior2nd', 'Neighborhood','ExterQual', 'ExterCond','MSSubClass','SalePrice']]
    #dd(dt)
    dd(dt[(dt.MSSubClass == '30') & (dt.Neighborhood == 'Edwards')].head())
    
    def bestExt1(Neighborhood,ExterQual, ExterCond, MSSubClass):
        return dt[(dt.Neighborhood == Neighborhood)&
                  (dt.ExterQual == ExterQual)&
                  (dt.ExterCond ==  ExterCond)&
                  (dt.MSSubClass == MSSubClass)]['Exterior1st'].tolist()[0]
    
    def bestExt2(Neighborhood,ExterQual, ExterCond, MSSubClass):
        return dt[(dt.Neighborhood == Neighborhood)&
                  (dt.ExterQual == ExterQual)&
                  (dt.ExterCond ==  ExterCond)&
                  (dt.MSSubClass == MSSubClass)]['Exterior2nd'].tolist()[0]
        
    df['Exterior1st'] = df.Exterior1st.fillna('NA')
    df['Exterior2nd'] = df.Exterior2nd.fillna('NA')
    
    df['Exterior1st'] = df.apply(lambda x: bestExt1(
                                    x['Neighborhood'], x['ExterQual'], x['ExterCond'], x['MSSubClass']
                                    ) if x['Exterior1st'] =="NA" else x['Exterior1st'], axis=1)
    df['Exterior2nd'] = df.apply(lambda x: bestExt2(
                                    x['Neighborhood'], x['ExterQual'], x['ExterCond'], x['MSSubClass']
                                    ) if x['Exterior2nd'] =="NA" else x['Exterior2nd'], axis=1)
    
    dd(df[df.index == 2151][['Exterior1st','Exterior2nd', 'Neighborhood','ExterQual', 'ExterCond', 'MSSubClass']])
    return df
    

    
df_ext = exteriorWrangle(df_electrical)

##### it is observed that many of the houses have Exterior1st and Exterior2nd same values per neighborhood. Therefore, we will create a matrix of neighborhood and Exterior1st. We will first populate Exterior1st from neighborhood value and then we will populate Exterior2nd from Exterior1st.

### 4. Fence

In [None]:
def fenceWrangling(df_temp):
    df = df_temp.copy()

    print("Count of missing Fence : ",df[df.Fence.isna()].shape)
    
    df['Fence'] = df.Fence.fillna('NA')
    
    #dd(df.groupby(['MSSubClass', 'Neighborhood','Fence', ]).agg('count').reset_index()[['MSSubClass', 'Neighborhood','Fence', 'SalePrice']])
    dd(df.groupby(['Neighborhood','Fence', ]).agg('count').reset_index()[['Neighborhood','Fence', 'SalePrice']])
    return df
    
df_fence = fenceWrangling(df_ext)

##### It is too tedious to decide if missing value indicates "NA" - No Fence or the entry was missing. What does actually Fence depend on ?
* For now i m going to rely on NA for the missing entry. Safest assumption.

##### It is too risky to map from othe rvariables. Though it seems like it depends on LotArea or LotFrontage. It is not very clear if it depends solely on one of the variable or sort of combination of others. Let us keep it NA for missing values, so that it would mean no fence available.

### 5.0 Functional

In [None]:
def functionalWrangle(df_temp):
    df = df_temp.copy()
   
    display.display(df[df.Functional.isna()].shape)
    
    co_qu_columns = [c for c in list(df) if ("Co" in c) or ("Qu" in c)]
    co_qu_columns.append('SalePrice')
    co_qu_columns.append('Functional')
    print(co_qu_columns)
    
    display.display(df[df.Functional.isna()])
    display.display(df[df.Functional.isna()][co_qu_columns])
    
    dd(df[(df.OverallCond == 5) & (df.OverallQual == 1)][co_qu_columns])
    dd(df[(df.OverallCond == 1) & (df.OverallQual == 4)][co_qu_columns])
    
    df['Functional'] = df.Functional.fillna('Typ')
    
    return df
    
df_funct = functionalWrangle(df_fence)


##### the rule itself says, assume typical unless deductions are warranted. However, there is no entry of salvage in our data set. Though it is not mandatory to have all categorical values has to be there in the dataset but it always raises the question why not that variable ? Can *Cond and *Qu variable give us hint of not 'Sal' ?

* I am actually tempted to put 'Sal' but due to lack of samples for Sal, I will be putting it as 'Typ'

### 6. LotFrontage

In [None]:
def LotFrontagecheck(df_temp):
    df = df_temp.copy()
    #df = pd.read_csv('train.csv')
    print(df[df.LotFrontage.isna()].shape)
    df_LotFrontage = df[['Neighborhood','LotFrontage']].groupby('Neighborhood').agg(lambda x:x.value_counts().index[0]).reset_index()
    df_dict = dict([tuple(x) for x in df_LotFrontage.values])
    print(df_dict)
    
    df['LotFrontage'] =df.LotFrontage.fillna(-1)
    df['LotFrontage'] =df.apply(lambda x: df_dict[x['Neighborhood']] if x['LotFrontage'] == -1 else x['LotFrontage'],axis=1)
    print(df[df.LotFrontage.isna()].shape)
    
    '''df.plot.scatter('LotFrontage', 'SalePrice')
    df.plot.scatter('LotArea', 'SalePrice')
    df.plot.scatter('LotArea', 'LotFrontage')
    
    display.display(df[df.LotFrontage > 175])'''
    return df
    
df_lot = LotFrontagecheck(df_funct) 

##### LotFrontage: taking neighborhood as reference most occuring distance is used for filling missing values. Inspiration: neighboring house have same distance to road /gate.

* Lot area > 10000 & LotFrontage > 200  seems like outliers

### 7. MSZoning

In [None]:
def msZoningWrangle(df_temp):
    df = df_temp.copy()
    
    zone_related = ['LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
                    'Condition2', 'BldgType', 'OverallQual', 'OverallCond', 'MSZoning'
                   ]
    
    temp  = df.groupby(['Neighborhood','MSSubClass','MSZoning']).\
    agg('count').reset_index().\
    sort_values(by=['SalePrice'],ascending=False)[['Neighborhood','MSSubClass','MSZoning','SalePrice']]
    dd(temp[(temp.Neighborhood == 'IDOTRR')  |(temp.Neighborhood == 'Mitchel')])
    
    def returnmsZone(Neighborhood,MSSubClass):
        return temp[(temp.Neighborhood == Neighborhood) &
                   (temp.MSSubClass == MSSubClass)]['MSZoning'].tolist()[0]
    
    dd(df[df.MSZoning.isna()][['Neighborhood','MSSubClass','MSZoning']])
    df['MSZoning'] = df.MSZoning.fillna("NA")
    df['MSZoning'] = df.apply(lambda x: returnmsZone(x['Neighborhood'], x['MSSubClass']) 
                              if x['MSZoning'] == "NA" else x['MSZoning'], axis=1)
    
    dd(df[df.index.isin([1915,2216,2250,2904])][['Neighborhood','MSSubClass','MSZoning']])

    return df

df_ms = msZoningWrangle(df_lot)    

##### MSZoning is general zoning classification. Therefore,  it must be specific to an area and hence 'Neighborhood' is the variable to our rescue. MSZoning = RL  when neighbor is 'Mitchel' and  RM when neighbor is  IDOTRR and they are is missing.

### 8.0 MasVnrType & MasVnrArea

In [None]:
def MasVnrTypeCheck(df_temp):
    #df = pd.read_csv('train.csv')
    df=df_temp.copy()
    print(df[df.MasVnrType.isna()].shape, )
    #display.display(df[['Neighborhood','MasVnrType','Id']].groupby(['Neighborhood','MasVnrType']).agg('count').reset_index())
    
    dd(df[['MasVnrType','MasVnrArea']][df.MasVnrType.isna()].head())
    dd(df[['MasVnrType','MasVnrArea']][df.MasVnrArea.isna()].head())
    dd(df[['MasVnrType','MasVnrArea']][df.MasVnrArea == 0].head())
    dd(df[['MasVnrType','MasVnrArea']][df.MasVnrType == 'None'].head())

    df['MasVnrType'] = df.MasVnrType.fillna("None")
    df['MasVnrArea'] = df.MasVnrArea.fillna(0)

    df['MasVnrArea'] = df['MasVnrArea'].astype(np.float64)

    return df
df_mas = MasVnrTypeCheck(df_ms)

* MasVnrArea nan count is equivalent to MasVnrType count.
* MasVnrArea == 0 is already present 
* whenever MasVnrArea == 0 MasVnrType is also None 
* Therefore, MasVnrArea will be mapped to zero when MasVnrType = None

##### Outlier: area > 1400 is only one sample which has low sale price. Its overall condition and quality is moderate and there are enough sample for those bands.

### 9. MiscFeature

In [None]:
def miscFeatureWrangle(df_temp):
    df = df_temp.copy()
    df_df = df.fillna('NA').groupby(['MiscFeature','MiscVal']).agg('count').reset_index()[['MiscFeature','MiscVal','SalePrice']]
    
    dd("Before :",df_df[df_df.MiscVal == 0])
    dd(df_df[df_df.MiscFeature == "NA"])
    #dd(df_df[df_df.MiscVal == "NA"])
    
    df['MiscFeature'] = df.apply(lambda x: "NA" if x['MiscVal'] == 0 else x['MiscFeature'],axis=1)
    df['MiscFeature'] = df.apply(lambda x: "Gar2" if x['MiscVal'] == 17000.0 else x['MiscFeature'],axis=1)
    
    df_df = df.fillna('NA').groupby(['MiscFeature','MiscVal']).agg('count').reset_index()[['MiscFeature','MiscVal','SalePrice']]
    
    dd("After :",df_df[df_df.MiscVal == 0])
    dd(df_df[df_df.MiscFeature == "NA"])
    return df
    
df_misc = miscFeatureWrangle(df_mas)

* When MiscVal == 0 ; MiscFeature is mostly NA (None). Note that it can be Shed or Other too. Will park it for next level fine tuning.
* High Values are dedicated to 'Gar2'. Therefore, testing set missing value is gar2 for sure.

### 10. PoolQC

In [None]:
def poolWrangling(df_temp):
    df=df_temp.copy()
    #df = pd.read_csv('train.csv')
    print("Count of missing PoolQC : ",df[df.PoolQC.isna()].shape)
    
    dd(df[df.PoolQC.isna()][['PoolArea', 'PoolQC','OverallCond','OverallQual']].head())
    #dd(df[df.PoolQC.isna()][['PoolArea', 'PoolQC','OverallCond','OverallQual']])
    dd(df[df.PoolArea == 0][['PoolArea', 'PoolQC','OverallCond','OverallQual']]['PoolQC'].unique())
    dd(df[df.PoolArea.isna()])
    
    dd(df.groupby(['OverallCond','OverallQual','PoolQC']).agg('count').reset_index()[['OverallCond','OverallQual','PoolQC','SalePrice']])
    df['PoolQC'] = df.PoolQC.fillna("NA")
    df['PoolQC'] = df.apply(lambda x: "Fa" if (x['PoolArea'] > 0) & (x['PoolQC'] == "NA") else x['PoolQC'],axis=1)
    #dd(df.fillna("NA").groupby(['PoolArea','PoolQC']).agg('count').reset_index()[['PoolArea','PoolQC','SalePrice']])
    dd(df[df.PoolQC.isna()][['PoolArea', 'PoolQC','OverallCond','OverallQual']])
    dd(Counter(df.PoolQC))
    
    return df
    
df_pool = poolWrangling(df_misc)

##### When PoolArea is 0 PoolQC will be NA (no pool). When PoolArea > 0 it appears to be good candidates for "Fa".

* when PoolQC should be "NA" when PoolArea = 0 [Thumb rule / Common sense]
* Missing values have characterstics matching with that of "Fair" condition. It may be "TA" but we dont have enough evidence or rather no evidence for that.

### 11. SaleType

In [None]:
def saleTypeWrangling(df):

    dd(df[df.SaleType.isna()][['MSSubClass', 'MSZoning', 'SaleCondition','SaleType']])
    df1 = df.groupby(['MSSubClass', 'MSZoning', 'SaleCondition','SaleType']).\
    agg('count').reset_index().sort_values('SalePrice',ascending=False)[['MSSubClass', 'MSZoning', 'SaleCondition','SaleType', 'SalePrice']]
    dd(df1[df1.MSSubClass == '20'])
    
    def popSaleType(MSSubClass, MSZoning, SaleCondition):
        return df1[(df1.MSSubClass == MSSubClass) &
                   (df1.MSZoning == MSZoning) &
                   (df1.SaleCondition == SaleCondition)
                  ]['SaleType'].tolist()[0]
    
    df['SaleType'] = df.SaleType.fillna("NA")
    df['SaleType'] = df.apply(lambda x: popSaleType(x['MSSubClass'], x['MSZoning'], x['SaleCondition']) 
                              if x['SaleType'] == "NA" else x['SaleType']
                              ,
                             axis = 1)
    dd(df[df.index == 2489][['MSSubClass', 'MSZoning', 'SaleCondition','SaleType']])
    
    return df
    
df_sale = saleTypeWrangling(df_pool.copy())    

### 12. Utilities

In [None]:
def utilWrangling(df):
    dd(df[df.Utilities.isna()][['Neighborhood','Utilities']])
    
    df1 = df.groupby(['Neighborhood','Utilities']).agg('count').reset_index().\
    sort_values('SalePrice',ascending=False)[['Neighborhood','Utilities','SalePrice']]
    dd(df1)
    
    def returnUtil(Neighborhood):
        return df1[(df1.Neighborhood == Neighborhood)]["Utilities"].tolist()[0]
    
    df['Utilities'] = df.Utilities.fillna("NA")
    df['Utilities'] = df.apply(lambda x: returnUtil(x['Neighborhood']) if x['Utilities'] == "NA" else x["Utilities"],axis=1)
    
    dd(df[df.index.isin([1915,1945])][['Neighborhood','Utilities']])
    return df
        
df_util = utilWrangling(df_sale.copy())

### 13. KitchenQual

In [None]:
def kitchenQual(df):
    
    dd(df[df.KitchenQual.isna()][['OverallCond', 'OverallQual', 'KitchenAbvGr', 'KitchenQual']])
    
    df1 = df.groupby(['OverallCond', 'OverallQual', 'KitchenAbvGr', 'KitchenQual']).agg('count').\
       reset_index()[['OverallCond', 'OverallQual', 'KitchenAbvGr', 'KitchenQual']]
    
    def returnkqual(OverallCond, OverallQual, KitchenAbvGr):
        return df1[(df1.OverallCond == OverallCond)&
                  (df1.OverallQual == OverallQual) &
                   (df1.KitchenAbvGr == KitchenAbvGr)
                  ]['KitchenQual'].tolist()[0]
    
    df['KitchenQual'] = df.KitchenQual.fillna("NA")
    df['KitchenQual'] = df.apply(lambda x: returnkqual(x['OverallCond'], x['OverallQual'],x['KitchenAbvGr'])
                                 if x['KitchenQual'] == "NA" else x['KitchenQual']
                                ,axis=1)
    dd(df[df.index == 1555][['OverallCond', 'OverallQual', 'KitchenAbvGr', 'KitchenQual']])
    return df

df_kitchen = kitchenQual(df_util.copy())

### Checkpoint - 1

In [None]:
plotNAs(df_kitchen.copy())

In [None]:
def bsmtInterpolate(df):
    bsmt_column = ['BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual']
    print(bsmt_column)
    
    bsmt_column2 = [c for c in list(df) if 'Bsmt' in c]
    
    dd(df[df.BsmtCond.isna()][bsmt_column])
    bsmt_index = []
    bsmt_index += list(df[df.BsmtCond.isna()].index)
    
    dd(df[df.BsmtExposure.isna()][bsmt_column])
    bsmt_index += list(df[df.BsmtExposure.isna()].index)
    
    dd(df[df.BsmtFinType2.isna()][bsmt_column])
    bsmt_index += list(df[df.BsmtFinType2.isna()].index)
    
    dd(df[df.BsmtQual.isna()][bsmt_column])
    bsmt_index += list(df[df.BsmtQual.isna()].index)
    
    df1 = df.groupby(bsmt_column).agg('count').reset_index()[(bsmt_column + ['SalePrice'])]
    df1 = df1.sort_values('SalePrice',ascending=False)
    #dd(df1)
    
    def getBsmtCond(BsmtExposure, BsmtFinType1, BsmtFinType2, BsmtQual):
        return df1[(df1.BsmtExposure == BsmtExposure)&
                   (df1.BsmtFinType1 == BsmtFinType1)&
                   (df1.BsmtFinType2 == BsmtFinType2)&
                   (df1.BsmtQual == BsmtQual)                   
                  ]['BsmtCond'].tolist()[0]
    
    def getBsmtExposure(BsmtCond, BsmtFinType1, BsmtFinType2, BsmtQual):
        return df1[(df1.BsmtCond == BsmtCond)&
                   (df1.BsmtFinType1 == BsmtFinType1)&
                   (df1.BsmtFinType2 == BsmtFinType2)&
                   (df1.BsmtQual == BsmtQual)                   
                  ]['BsmtExposure'].tolist()[0]
    
    def getBsmtFinType1(BsmtExposure, BsmtCond, BsmtFinType2, BsmtQual):
        return df1[(df1.BsmtExposure == BsmtExposure)&
                   (df1.BsmtCond == BsmtCond)&
                   (df1.BsmtFinType2 == BsmtFinType2)&
                   (df1.BsmtQual == BsmtQual)                   
                  ]['BsmtFinType1'].tolist()[0]
    
    def getBsmtFinType2(BsmtExposure, BsmtFinType1, BsmtCond, BsmtQual):
        return df1[(df1.BsmtExposure == BsmtExposure)&
                   (df1.BsmtFinType1 == BsmtFinType1)&
                   (df1.BsmtCond == BsmtCond)&
                   (df1.BsmtQual == BsmtQual)                   
                  ]['BsmtFinType2'].tolist()[0]
    
    def getBsmtQual(BsmtExposure, BsmtFinType1, BsmtFinType2, BsmtCond):
        return df1[(df1.BsmtExposure == BsmtExposure)&
                   (df1.BsmtFinType1 == BsmtFinType1)&
                   (df1.BsmtFinType2 == BsmtFinType2)&
                   (df1.BsmtCond == BsmtCond)                   
                  ]['BsmtQual'].tolist()[0]
    
    #['BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual']
    df['BsmtCond'] = df.BsmtCond.fillna("NA")
    df['BsmtCond'] = df.apply(lambda x: getBsmtCond(x['BsmtExposure'], x['BsmtFinType1'], x['BsmtFinType2'], x['BsmtQual'])
                              if x['BsmtCond'] == "NA" else x['BsmtCond']
                             , axis = 1)
    
    df['BsmtExposure'] = df.BsmtExposure.fillna("NA")
    df['BsmtExposure'] = df.apply(lambda x: getBsmtExposure(x['BsmtCond'], x['BsmtFinType1'], x['BsmtFinType2'], x['BsmtQual'])
                              if x['BsmtExposure'] == "NA" else x['BsmtExposure']
                             , axis = 1)
    
    df['BsmtFinType1'] = df.BsmtFinType1.fillna("NA")
    df['BsmtFinType1'] = df.apply(lambda x: getBsmtFinType1(x['BsmtExposure'], x['BsmtCond'], x['BsmtFinType2'], x['BsmtQual'])
                              if x['BsmtFinType1'] == "NA" else x['BsmtFinType1']
                             , axis = 1)

    df['BsmtFinType2'] = df.BsmtFinType2.fillna("NA")
    df['BsmtFinType2'] = df.apply(lambda x: getBsmtFinType2(x['BsmtExposure'], x['BsmtFinType1'], x['BsmtCond'], x['BsmtQual'])
                              if x['BsmtFinType2'] == "NA" else x['BsmtFinType2']
                             , axis = 1)
    
    df['BsmtQual'] = df.BsmtQual.fillna("NA")
    df['BsmtQual'] = df.apply(lambda x: getBsmtQual(x['BsmtExposure'], x['BsmtFinType1'], x['BsmtFinType2'], x['BsmtCond'])
                              if x['BsmtQual'] == "NA" else x['BsmtQual']
                             , axis = 1)
    
    dd(df[df.index.isin(bsmt_index)][bsmt_column])
    
    return df
df_bsmt_final = bsmtInterpolate(df_kitchen.copy())


### Garage*

In [None]:
def garageWrangling(df):
    garage_columns = [c for c in df.select_dtypes(exclude=np.number) if "Garage" in c]
    print(garage_columns)
    garage_col2 = [c for c in list(df) if "Garage" in c]
    grp_by = ['GarageType','MiscFeature','OverallQual','OverallCond','GarageFinish','GarageQual','GarageCond',\
              'GarageArea','GarageCars']
    
    new_col = list(set(garage_col2 + grp_by + ['MiscFeature']))
    
    miss_index = []
    for c in garage_col2:
        df_temp = df[df[c].isna()][new_col]
        dd(c,df_temp)
        miss_index += list(df_temp.index)
    
    
    df1 = df.groupby(grp_by).agg('count').reset_index().sort_values('SalePrice',ascending=False)[grp_by + ['SalePrice']]
    dd(df1[df1.GarageType == 'Detchd'])
    
    def getGarageArea( GarageType, MiscFeature, OverallQual, OverallCond):
        return df1[
                   (df1.GarageType == GarageType) &
                   (df1.MiscFeature == MiscFeature) &
                   (df1.OverallQual == OverallQual) &
                   (df1.OverallCond == OverallCond)
                  ]['GarageArea'].tolist()[0]
    
    def getGarageCars( GarageType, MiscFeature, OverallQual, OverallCond):
        return df1[
                   (df1.GarageType == GarageType) &
                   (df1.MiscFeature == MiscFeature) &
                   (df1.OverallQual == OverallQual) &
                   (df1.OverallCond == OverallCond)
                  ]['GarageCars'].tolist()[0]
    
    def getGarageCond(GarageType, MiscFeature, OverallQual, OverallCond):
        return df1[
                   (df1.GarageType == GarageType) &
                   (df1.MiscFeature == MiscFeature) &
                   (df1.OverallQual == OverallQual) &
                   (df1.OverallCond == OverallCond)
                  ]['GarageCond'].tolist()[0]
    
    def getGarageFinish(GarageType, MiscFeature, OverallQual, OverallCond):
        return df1[
                   (df1.GarageType == GarageType) &
                   (df1.MiscFeature == MiscFeature) &
                   (df1.OverallQual == OverallQual) &
                   (df1.OverallCond == OverallCond)
                  ]['GarageFinish'].tolist()[0]
    
    def getGarageQual(GarageType, MiscFeature, OverallQual, OverallCond):
        return df1[
                   (df1.GarageType == GarageType) &
                   (df1.MiscFeature == MiscFeature) &
                   (df1.OverallQual == OverallQual) &
                   (df1.OverallCond == OverallCond)
                  ]['GarageQual'].tolist()[0]
    
    df['GarageArea'] = df['GarageArea'].fillna("NA")
    df['GarageArea'] = df.apply(lambda x: getGarageArea( x['GarageType'], x['MiscFeature'], x['OverallQual'], x['OverallCond'])
                                if x['GarageArea'] =="NA" else x['GarageArea']
                                ,axis =1)
    
    df['GarageCars'] = df['GarageCars'].fillna("NA")
    df['GarageCars'] = df.apply(lambda x: getGarageCars( x['GarageType'], x['MiscFeature'], x['OverallQual'], x['OverallCond'])
                                if x['GarageCars'] =="NA" else x['GarageCars']
                                ,axis =1)
    
    df['GarageCond'] = df['GarageCond'].fillna("NA")
    df['GarageCond'] = df.apply(lambda x: getGarageCond( x['GarageType'], x['MiscFeature'], x['OverallQual'], x['OverallCond'])
                                if x['GarageCond'] =="NA" else x['GarageCond']
                                ,axis =1)
    
    df['GarageFinish'] = df['GarageFinish'].fillna("NA")
    df['GarageFinish'] = df.apply(lambda x: getGarageFinish( x['GarageType'], x['MiscFeature'], x['OverallQual'], x['OverallCond'])
                                if x['GarageFinish'] =="NA" else x['GarageFinish']
                                ,axis =1)
    
    df['GarageQual'] = df['GarageQual'].fillna("NA")
    df['GarageQual'] = df.apply(lambda x: getGarageQual( x['GarageType'], x['MiscFeature'], x['OverallQual'], x['OverallCond'])
                                if x['GarageQual'] =="NA" else x['GarageQual']
                                ,axis =1)
    
    dd(df[df.index.isin(miss_index)][new_col])
    return df
df_garage_final = garageWrangling(df_bsmt_final.copy())


### Checkpoint 2

In [None]:
plotNAs(df_garage_final.copy())

In [None]:
did_we_miss_them = ['MSSubClass', 'OverallQual', 'OverallCond', 'FireplaceQu', 'MoSold']
for c in did_we_miss_them:
    dd(c, df_garage_final[c].unique())

In [None]:
def FireplaceWrangling(df_temp):
    df = pd.read_csv(train_File)
    print("Count of missing FireplaceQu : ",df[df.FireplaceQu.isna()].shape[0])
    dd("Count of Fireplaces == 0",df[['Fireplaces', 'FireplaceQu']][df.Fireplaces == 0].shape)
    dd(df[['Fireplaces', 'FireplaceQu']][df.FireplaceQu.isna()].head())
    dd("When FireplaceQu == NA, Fireplaces:", df[['Fireplaces', 'FireplaceQu']][df.FireplaceQu.isna()]['Fireplaces'].unique())
    dd("When Fireplaces == 0, FireplaceQu:",df[['Fireplaces', 'FireplaceQu']][df.Fireplaces == 0]['FireplaceQu'].unique())
    
    #now we have NA as nan in our wrangled dataset; ideally this should not be problem but let us be consistent.
    df = df_temp.copy()
    
    df['FireplaceQu'] = df.apply(lambda x: "NA" if x['Fireplaces'] == 0 else x['FireplaceQu'], axis=1 )
    
    dd("Post imputing :",df.FireplaceQu.unique())
    
    return df
    
df_fire = FireplaceWrangling(df_garage_final.copy())

##### FireplaceQu will be mapped to NA (no fireplace) whenever Fireplaces = 0

##### TotalBsmtSF = 0 indicates there is no basement. 
* Therefore, BsmtQual = BsmtCond = BsmtExposure = BsmtFinType1 = BsmtFinType2 = "NA"; when TotalBsmtSF = 0 

* Outlier: (df.BsmtFinSF1 > 2000) & (df.SalePrice < 200000) (2 of them) are outlier because it not only brings the co relation down but also there are enough samples for outlier's overall condition and quality samples.

##### Let us check the datatype if it is corrupted due to our imputate operation

In [None]:
post_imputing_cols = list(df_fire.select_dtypes(include=np.object))
cols_need_change = [c for c in list(df_before_clean.select_dtypes(include=np.number)) if c in post_imputing_cols]

#dd(df_before_clean[df_before_clean.MasVnrArea == 'BrkFace'])

'''for c in post_imputing_cols:
    if c in cols_need_change:
        print(c)
        df_fire[c] = df_fire[c].astype(np.float64)'''
print()

In [None]:
post_imputing_cols = list(df_fire.select_dtypes(include=np.number))
[c for c in list(df_before_clean.select_dtypes(include=np.number)) if c not in post_imputing_cols]

In [None]:
post_imputing_cols = list(df_fire.select_dtypes(include=np.object))
[c for c in list(df_before_clean.select_dtypes(include=np.object)) if c not in post_imputing_cols]

In [None]:
post_imputing_cols = list(df_fire.select_dtypes(include=np.number))

#for c in post_imputing_cols:
_ = [dd(c,df_fire[c].unique()) for c in list(df_fire.select_dtypes(exclude=np.number))]
    


### Outliers

In [None]:
print("Shape of the imputed dataset : ", df_fire.shape)
df_out = df_fire[df_fire.SalePrice > 0].copy()
print("Shape of the Outlier Analysis dataset : ", df_out.shape)

In [None]:
def computeCorrCols(df):
    df_corr = df.corr()
    
    df1 = df_corr.stack().reset_index().rename(columns={'level_0': "C1", "level_1": "C2", 0 : "Corr_val"})
    
    df1['Corr_val']= df1['Corr_val'].abs()
    df1 = df1[df1['Corr_val'] < 1].sort_values('Corr_val',ascending=False)
    df1 = df1.drop_duplicates('Corr_val').reset_index(drop=True)
    dd(df1)
    
    return df1
df_corr = computeCorrCols(df_out.copy())


def checkCorr(df_old):
    df = pd.get_dummies(df_old)
    corr_target = []
    df_corr = df.corr()
    for c in list(df_corr):
        corr_target.append((c, np.abs(df_corr.loc[c,'SalePrice'])))

    dd("Top 15 Numerical Variables with High Corr value for SalePrice :",
        [x for x in sorted(corr_target,key=lambda x: x[1], reverse=True) if '_' not in x[0]][1:15])
    
    dd("Top 15 Variables (All Types) with High Corr value for SalePrice :",
        [x for x in sorted(corr_target,key=lambda x: x[1], reverse=True)][1:15])

checkCorr(df_out.copy())

##### There may be so many outliers but let us target the ones which are not only numerical but also impact the target variables. Let us target top 5 variables for outlier removal. i.e. GrLivArea, GarageCars, GarageArea, TotalBsmtSF & 1stFlrSF

In [None]:
def topCorrWithOthers(df):
    top_5_corr = ['GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF']
    print("Co relation of the top 5 columns with others : ")
    for c in top_5_corr:
        dd(df_corr[(df_corr.C1 == c) | (df_corr.C2 == c)].head())
        
topCorrWithOthers(df_fire.copy())

In [None]:
df_out['SalePrice'] = df_out['SalePrice'] / df_out.GrLivArea

### 1. GrLivArea

In [None]:
def grLivAreaOutlier(df):
    df.plot.scatter("GrLivArea", "SalePrice", title="Before Removal of Outlier")
    df_corr = df.corr()
    print("Co relation before removing the outlier : ",df_corr.loc['GrLivArea','SalePrice'])
    df1 = df[(df.GrLivArea < 4000) | (df.SalePrice >250000 )]
    print("Outlier Count: ", df.shape[0]-df1.shape[0])
    df1.plot.scatter("GrLivArea", "SalePrice", title="After Removal of Outlier")
    
    #dd(corr(df1.GrLivArea, df1.SalePrice))
    
    df_corr = df1.corr()
    print("Co relation After removing the outlier : ",df_corr.loc['GrLivArea','SalePrice'])
    
    return df1
df_liv = grLivAreaOutlier(df_out.copy())

### 2. GarageCars

In [None]:
def garageCarsOutlier(df):
    df.plot.scatter("GarageCars", "SalePrice", title="Before Removal of Outlier")
    df_corr = df.corr()
    print("Co relation before removing the outlier : ",df_corr.loc['GarageCars','SalePrice'])
    
    df1 = df[(df.GarageCars < 4)]
    
    print("Outlier Count : ", df[(df.GarageCars == 4)].shape[0])
    df_corr = df1.corr()
    print("Co relation After removing the outlier : ",df_corr.loc['GarageCars','SalePrice'])
    df1.plot.scatter("GarageCars", "SalePrice", title="Before Removal of Outlier")
    return df1
    
df_gar_car = garageCarsOutlier(df_liv.copy())

### 3.GarageArea

In [None]:
def garageAreaOutlier(df):
    df.plot.scatter("GarageArea", "SalePrice", title="Before Removal of Outlier")
    df_corr = df.corr()
    print("Co relation before removing the outlier : ",df_corr.loc['GarageArea','SalePrice'])
    
    df1 = df[(df.GarageArea < 1200) | (df.SalePrice > 300000)]
    
    print("Outlier Count : ", df[(df.GarageArea > 1200) & (df.SalePrice < 300000)].shape[0])
    df_corr = df1.corr()
    print("Co relation After removing the outlier : ",df_corr.loc['GarageArea','SalePrice'])
    df1.plot.scatter("GarageArea", "SalePrice", title="Before Removal of Outlier")
    
    return df1

df_gar_area = garageAreaOutlier(df_gar_car.copy())

### 4. TotalBsmtSF

In [None]:
def totalBsmtSFOutlier(df):
    df.plot.scatter("TotalBsmtSF", "SalePrice", title="Before Removal of Outlier")
    df_corr = df.corr()
    print("Co relation before removing the outlier : ",df_corr.loc['TotalBsmtSF','SalePrice'])
    df[df.TotalBsmtSF >3000].plot.scatter("TotalBsmtSF", "SalePrice", title="Zoomed look on Outlier")
    df[df.TotalBsmtSF >3100].plot.scatter("TotalBsmtSF", "SalePrice", title="Further Zoomed look on Outlier")
    
    df1 = df[(df.TotalBsmtSF < 3000) | (df.SalePrice > 300000)]
    
    print("Outlier Count : ", df[(df.TotalBsmtSF > 3000) & (df.SalePrice < 300000)].shape[0])
    df_corr = df1.corr()
    print("Co relation After removing the outlier : ",df_corr.loc['TotalBsmtSF','SalePrice'])
    df1.plot.scatter("TotalBsmtSF", "SalePrice", title="Before Removal of Outlier")
    
    return df1

df_tot = totalBsmtSFOutlier(df_gar_area.copy())

### 5. 1stFlrSF

In [None]:
def firststFlrSFOutlierCheck(df):
    df.plot.scatter("1stFlrSF", "SalePrice", title="Before Removal of Outlier")
    df_corr = df.corr()
    print("Co relation before removing the outlier : ",df_corr.loc['1stFlrSF','SalePrice'])
    df[df['1stFlrSF'] >2500].plot.scatter("1stFlrSF", "SalePrice", title="Zoomed look on Outlier")
    #df[df.TotalBsmtSF >3100].plot.scatter("1stFlrSF", "SalePrice", title="Further Zoomed look on Outlier")
    
    df1 = df[(df['1stFlrSF'] < 3000) ]
    
    print("Outlier Count : ", df[(df['1stFlrSF'] > 3000)].shape[0])
    df_corr = df1.corr()
    print("Co relation After removing the outlier : ",df_corr.loc['1stFlrSF','SalePrice'])
    df1.plot.scatter("1stFlrSF", "SalePrice", title="Before Removal of Outlier")
    
    return df1
    
df_1stFlrSF = firststFlrSFOutlierCheck(df_tot.copy())

In [None]:
#checkCorr(df_tot.copy())

In [None]:
#checkCorr(df_1stFlrSF.copy())

#### On comparing the results from the co relation from the above 2 cells, it is clear that we [*should not*] go for the 1stFlrSF outlier removal.

#### 2.2 Wrangling
* should we try to see all the outlier or only the ones which have high co relation with target variable ?
* have both test and train data for data missing and imputation activities and NOT for outlier removal.
* is there any way we can identify the outlier in categorical values ?

def interpolateAtLast(df):
   
    df[['Alley','SalePrice']].fillna('NotAvailable').\
    groupby(by='Alley').agg('count').\
    plot.bar(legend=None, title="Frequency Plot for Alley")
    plt.xticks(rotation=45)
    plt.show()
    df = df.interpolate()
    
    df[['Alley','SalePrice']].fillna('NotAvailable').\
    groupby(by='Alley').agg('count').\
    plot.bar(legend=None, title="Frequency Plot for Alley")
    plt.xticks(rotation=45)
    plt.show()
    
interpolateAtLast(df_fire.copy())

In [None]:
def interpolateCheck():
    df_train = pd.read_csv(train_File)
    df_test = pd.read_csv(test_File)
    
    df = pd.concat([df_train, df_test], axis=0,sort=True,ignore_index=True)
    print("Initial Shape : ", df.shape)
    
    nan_columns_before = df.columns[df.isna().any()].tolist()
    
    bsmt_columns = [c for c in list(df) if 'Bsmt' in c]
    gar_columns = [c for c in list(df) if 'Garage' in c]
    
    df_inter = df.interpolate()
    nan_columns = df_inter.columns[df_inter.isna().any()].tolist()
    
    interpolated_columns = [c for c in nan_columns_before if c not in nan_columns]
    
    display_columns = list(set(bsmt_columns + interpolated_columns + gar_columns))
    
    print("These are interpolated : ", interpolated_columns)
    
    for c in interpolated_columns:
        na_indices = df[df[c].isna()].index.tolist()
        print(c,na_indices)
        
        if 'Bsmt' in c:
            display.display(df[df.index.isin(na_indices)][bsmt_columns])
            display.display(df_inter[df_inter.index.isin(na_indices)][bsmt_columns])
        else: 
            display.display(df[df.index.isin(na_indices)][gar_columns])
            display.display(df_inter[df_inter.index.isin(na_indices)][gar_columns])
        #break
    
    '''for c in df.fillna('NotAvailable')[nan_columns]:
        df[[c,'SalePrice']].fillna('NotAvailable').\
        groupby(by=c).agg('count').\
        plot.bar(legend=None, title="Frequency Plot for "+c)
        plt.xticks(rotation=45)
        plt.show()'''
interpolateCheck()

##### We should not blindly depend on the pandas interpolate outcome. I guess we should use the method parameters that interpolate uses and then use it with a understanding. e.g.: ‘krogh’, ‘piecewise_polynomial’, ‘spline’, ‘pchip’ and ‘akima’ methods  in scipy [and thus in panda].

* TotalBsmtSF being null indicate no basement and should not impute values based on neighbor (i guess)
* basement full and half bathroom being .5 makes no sense.
* garage year built in decimals does not makes sence.

##### Let us check with XGBoost for the score.

In [None]:
def preProcessData(df, log=False):
    
    print("Shape of the data set before pre processing : ", df.shape )

    #get dummies
    if log:
        print("Categorical columns : ", list(df.select_dtypes(exclude=np.number)))
    df = pd.get_dummies(df, dtype=np.float64)
    
    print("\n\nShape of the data set after pre processing : ", df.shape )
    
    if log:
        print("Columns in the data set are : ",list(df))

    return df

df_prep = preProcessData(df_fire.copy())
df_prep.info()

In [None]:
def getOutlierIndices(df):
    
    print("Shape before removing outlier : ", df.shape)
    '''df = df[(df.GrLivArea < 4000) | (df.SalePrice >250000) | (df.SalePrice == 0) ]
    df = df[(df.GarageCars < 4) | (df.SalePrice == 0)]
    df = df[((df.GarageArea < 1200) | (df.SalePrice > 300000)) | (df.SalePrice ==0)]
    df = df[((df.TotalBsmtSF < 3000) | (df.SalePrice > 300000)) | (df.SalePrice ==0)]'''
    df = df[(df.GrLivArea < 3000)| (df.SalePrice ==0)]
    df = df[(df.SalePrice < 200)| (df.SalePrice ==0)]
    print("Shape after removing outlier : ", df.shape)
    
    return df

df_out_removed = getOutlierIndices(df_fire.copy())

In [None]:
def newBoxCoxTranformation1(df,target,testFile=False, log=False):
    
    #assuming that only numerical features are presented
    print("Shape of the dataset initial : ", df.shape)
    
    if not testFile:
        #outlier_indices = getOutlierIndices(df)
        df =df[df.SalePrice >0]
        
        #let us remove outliers
        '''df = grLivAreaOutlier(df)
        df = garageCarsOutlier(df)
        df = garageAreaOutlier(df)
        df = totalBsmtSFOutlier(df)'''
        
        #df = preProcessData(df.copy())

        
        print("Shape of the dataset before transformation : ", df.shape)
        y = np.array(df[target].apply( lambda x: math.log(x)))
        X= df.drop(target,axis = 1)
        x_columns = list(X)
        X = preprocessing.MinMaxScaler(feature_range=(1, 2)).fit_transform(X)
        X = pd.DataFrame(X, columns=x_columns)
        
        for c in list(X):
            if len(X[c].unique()) in  [1,2]:
                if log:
                    print("Skipping Transformation for ", c, "because unique values are :",X[c].unique())
            else:
                if log:
                    print("Boxcoxing : ", c)
                X[c] = stats.boxcox(X[c])[0]
        #X = preprocessing.MinMaxScaler(feature_range=(1, 2)).fit_transform(X)
        X = preprocessing.StandardScaler().fit_transform(X)
        #X = X.values
        print("Shape of the dataset after transformation : ", X.shape, y.shape)
        return X,y
    else:
        df = df[df.SalePrice == 0.0]
        #df = preProcessData(df.copy())
        print("Shape of the dataset before transformation : ", df.shape)
        X=df.drop(target,axis = 1)
        x_columns = list(X)
        X = preprocessing.MinMaxScaler(feature_range=(1, 2)).fit_transform(X)
        
        X = pd.DataFrame(X, columns=x_columns)
        for c in list(X):
            if len(X[c].unique()) in  [1,2]:
                if log:
                    print("Skipping Transformation for ", c, "because unique values are :",X[c].unique())
            else:
                if log:
                    print("Boxcoxing : ", c)
                X[c] = stats.boxcox(X[c])[0]
        
        
        #X = preprocessing.power_transform( X, method='box-cox')
        #X = preprocessing.MinMaxScaler(feature_range=(1, 2)).fit_transform(X)
        X = preprocessing.StandardScaler().fit_transform(X)
        #X = X.values
        print("Shape of the dataset after transformation : ", X.shape)
        return X

df_fire_b = df_fire.copy()
df_fire_b['SalePrice'] = df_fire_b['SalePrice'] / df_fire_b.GrLivArea
df_out_removed = getOutlierIndices(df_fire_b.copy())
df_prep = preProcessData(df_out_removed.copy())
X = newBoxCoxTranformation1(df_prep.copy(),'SalePrice',True,False)  
X,y = newBoxCoxTranformation1(df_prep.copy(),'SalePrice',False,False)

In [124]:
from IPython import display
def transformTarget(df_temp, revert=False ):
    '''df = df_temp.copy()
    #df['new_variable'] = df.BedroomAbvGr * .1 + df.FullBath *.25 + df.HalfBath * .5 + df.BsmtFullBath *.75 + df.BsmtHalfBath * 1
    #df['new_variable'] = df.BedroomAbvGr * .1 + df.FullBath *.25 + df.HalfBath * .5 + df.BsmtFullBath *.75 + df.BsmtHalfBath * .1
    df['new_variable'] = df.BedroomAbvGr.apply(lambda x: x if x > 0 else 1)
    #print("new_variable calculated")
    #display.display(df[df.new_variable.isna()])
    
    if not revert:
        df['sales_per_new'] = df['SalePrice']/ df.new_variable
        #display.display(df[df.sales_per_new.isna()])
        y = np.array(df['sales_per_new'].apply( lambda x: math.log(x)))
        return y
    
    df['sales_per_new'] = df['SalePrice']* df.new_variable
    #print("sales_per_new calculated")
    #display.display(df[df.sales_per_new.isna()])'''
    #return np.log( df_temp.SalePrice / df_temp.GrLivArea)
    #return np.log(df_temp['SalePrice'])
    return (df_temp['SalePrice'])
    

In [125]:
def newBoxCoxTranformation(df_temp,target,testFile=False, log=False):
    df = df_temp.copy()
    #assuming that only numerical features are presented
    if log:
        print("Shape of the dataset initial : ", df.shape)
    
    if not testFile:
        df =df[df.SalePrice >0]
        if log:
            print("Shape of the dataset before transformation : ", df.shape)
        
        #display.display(df[df['SalePrice'].isna()])
        y = transformTarget(df)
        X= df.drop([target],axis = 1)
        #X=df.drop(target, axis=1)
        x_columns = list(X)
        #print(x_columns)
        X = preprocessing.MinMaxScaler(feature_range=(1, 2)).fit_transform(X)
        #X_testx = preprocessing.MinMaxScaler(feature_range=(1, 2)).fit_transform(X,y)
        #X_testxx = preprocessing.MinMaxScaler(feature_range=(1, 2)).fit_transform(X,df.SalePrice)
        #print(np.unique(X == X_testx))
        #print(np.unique(X == X_testxx))
        #print(np.unique(X_testx == X_testxx))
        X = pd.DataFrame(X, columns=x_columns)
        
        for c in list(X):
            if len(X[c].unique()) in  [1,2]:
                if log:
                    print("Skipping Transformation for ", c, "because unique values are :",X[c].unique())
            else:
                if log:
                    print("Boxcoxing : ", c)
                X[c] = stats.boxcox(X[c])[0]
        
        X = preprocessing.MinMaxScaler(feature_range=(1, 2)).fit_transform(X)
        #X = X.values
        if log:
            print("Shape of the dataset after transformation : ", X.shape, y.shape)
        return X,y
    else:
        df = df[df.SalePrice == 0.0]
        if log:
            print("Shape of the dataset before transformation : ", df.shape)
        X=df.drop(target,axis = 1)
        x_columns = list(X)
        #print(x_columns)
        X = preprocessing.MinMaxScaler(feature_range=(1, 2)).fit_transform(X)
        
        X = pd.DataFrame(X, columns=x_columns)
        for c in list(X):
            if len(X[c].unique()) in  [1,2]:
                if log:
                    print("Skipping Transformation for ", c, "because unique values are :",X[c].unique())
            else:
                if log:
                    print("Boxcoxing : ", c)
                X[c] = stats.boxcox(X[c])[0]
        
        
        #X = preprocessing.power_transform( X, method='box-cox')
        X = preprocessing.MinMaxScaler(feature_range=(1, 2)).fit_transform(X)
        #X = X.values
        if log:
            print("Shape of the dataset after transformation : ", X.shape)
        return X
        
df_fire_b = df_fire.copy()
df_fire_b['SalePrice'] = df_fire_b['SalePrice'] / df_fire_b.GrLivArea
df_out_removed = getOutlierIndices(df_fire_b.copy())
df_prep = preProcessData(df_out_removed.copy())
X = newBoxCoxTranformation(df_prep.copy(),'SalePrice',True,False)  
X,y = newBoxCoxTranformation(df_prep.copy(),'SalePrice',False,False)

Shape before removing outlier :  (2919, 78)
Shape after removing outlier :  (2882, 78)
Shape of the data set before pre processing :  (2882, 78)


Shape of the data set after pre processing :  (2882, 341)


In [126]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.5, random_state=random.randint(1,500))

In [127]:
#reg = XGBRegressor(max_depth=4, n_estimators=200)
'''reg = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.25,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=2, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0.0, reg_lambda=0.0, scale_pos_weight=1, seed=None,
       silent=True, subsample=1) #0.13073788936978095'''

reg = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
       colsample_bytree=0.75, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=800,
       n_jobs=2, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0.5, reg_lambda=0.5, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

reg.fit(X_train,y_train)
reg.score(X_test,y_test)

0.7900031561961954

In [129]:
np.sqrt(mean_squared_log_error((y_test), (reg.predict(X_test))))
#np.sqrt(mean_squared_log_error(np.exp(y_test), np.exp(reg.predict(X_test))))

0.12427355827503385

In [130]:
reg

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
       colsample_bytree=0.75, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=800,
       n_jobs=2, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0.5, reg_lambda=0.5, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [132]:
def dummyCrossValidation(loop_count):
    for i in range(loop_count):
        X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.5, random_state=random.randint(1,500))
        reg = XGBRegressor(max_depth=3, n_estimators=100, reg_alpha=.5, reg_lambda=.5)
        reg.fit(X_train,y_train)
        
        print("R2 Score: ", reg.score(X_test,y_test))
        print("RMSLE Score : ", np.sqrt(mean_squared_log_error((y_test), (reg.predict(X_test)))))
        #print("RMSLE Score : ", np.sqrt(mean_squared_log_error(np.exp(y_test), np.exp(reg.predict(X_test)))))
dummyCrossValidation(10)

R2 Score:  0.7411838063406728
RMSLE Score :  0.13914474577656455
R2 Score:  0.7705217854286269
RMSLE Score :  0.128712985235048
R2 Score:  0.7659611731347808
RMSLE Score :  0.12708034580087846
R2 Score:  0.7399736620743083
RMSLE Score :  0.13700615329255592
R2 Score:  0.7469803417123881
RMSLE Score :  0.12838130514005583
R2 Score:  0.7296315714648065
RMSLE Score :  0.1374840966074223
R2 Score:  0.7597053047495568
RMSLE Score :  0.13633871273268358
R2 Score:  0.7755820591234065
RMSLE Score :  0.12845408728968877
R2 Score:  0.7802512465245441
RMSLE Score :  0.12594878460041573
R2 Score:  0.7487091405924081
RMSLE Score :  0.13357726690260732


In [133]:
reg.fit(X,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
       colsample_bytree=0.75, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=800,
       n_jobs=2, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0.5, reg_lambda=0.5, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [141]:
def checkTheTrainFile(reg,df_prep):
    df_train = pd.read_csv(train_File)
    #df_tra['SalePrice'] = 0.0
    
    #df_train =  pd.read_csv(train_File)
    #df_concat = pd.concat([df_train,df_test])

    #print(df_test[df_test.TotalBsmtSF.isna()])
    #return
    #df = giveMeWrangledData(df_concat,True)
    
    #print(df.info())
    #df = preProcessData(df)
    #print(df.info())
    #dt = df_prep.copy()
    
    #df_prep = preProcessData(df_fire.copy())
    #dt = dt[dt.SalePrice > 0]
    df_prep1 = df_prep.copy()
    df_prep1 = df_prep1[df_prep1['SalePrice'] > 0]
    df_prep1['New_SalePrice'] = 0.0
    
    df_prep = df_prep[df_prep['SalePrice'] > 0]
    df_prep['SalePrice'] = 0
    
    X = newBoxCoxTranformation(df_prep.copy(),'SalePrice',True)
    #print(np.sqrt(mean_squared_log_error(y, reg.predict(X))))
    
    #df_prep1['New_SalePrice'] = list(np.exp(reg.predict(X)))
    df_prep1['New_SalePrice'] = reg.predict(X)
    df_prep1['New_SalePrice'] = df_prep1['New_SalePrice'] * df_prep1.GrLivArea
    df_prep1['SalePrice'] = df_prep1['SalePrice'] * df_prep1.GrLivArea
    
    
    df_train_score = df_prep1[df_prep1.SalePrice > 0]
    
    print(np.sqrt(mean_squared_log_error(df_train_score['SalePrice'], df_train_score['New_SalePrice'])))
    
    #return df_test, X, reg.predict(X)
#df_test, X_dummy, y_dummy= checkTheTestFile(reg)
checkTheTrainFile(reg,df_prep.copy())

0.031629483392693634


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [136]:
def checkTheTestFile(reg):
    df_test = pd.read_csv(test_File)
    df_test['SalePrice'] = 0.0
    print(df_test.shape)
    
    #df_train =  pd.read_csv(train_File)
    #df_concat = pd.concat([df_train,df_test])

    #print(df_test[df_test.TotalBsmtSF.isna()])
    #return
    #df = giveMeWrangledData(df_concat,True)
    
    #print(df.info())
    #df = preProcessData(df)
    #print(df.info())
    X = newBoxCoxTranformation(df_prep.copy(),'SalePrice',True)
    #print(np.sqrt(mean_squared_log_error(y, reg.predict(X))))
    df_test['SalePrice'] = (reg.predict(X))
    #df_test['SalePrice'] = np.exp(reg.predict(X))
    df_test['SalePrice'] = df_test['SalePrice'] * df_test.GrLivArea
    
    return df_test, X, reg.predict(X)
df_test, X_dummy, y_dummy= checkTheTestFile(reg)

(1459, 81)


In [137]:
df_test[['Id','SalePrice']].to_csv('submission.csv',index=False)

In [138]:
df_test[['Id','SalePrice']]

Unnamed: 0,Id,SalePrice
0,1461,136423.464844
1,1462,174365.282639
2,1463,203167.291168
3,1464,200823.291565
4,1465,190085.859375
5,1466,188352.843857
6,1467,173680.381607
7,1468,179601.184998
8,1469,204438.965378
9,1470,132822.651215


In [139]:
def forCrossValidationStratifiedShuffleSplit(df):
    sss = StratifiedShuffleSplit(n_splits=10, test_size=.5, random_state=1986)
    #print("Number of Splits configured :", sss.get_n_splits(df, df.BldgType))
    
    for train_index, test_index in sss.split(df, df.BldgType):
        yield train_index, test_index
        
    for train_index, test_index in sss.split(df, df.OverallQual):
        yield train_index, test_index

In [140]:
df_train = pd.read_csv(train_File)
def crossValidationScoring(reg,X,y):
    return -np.sqrt(mean_squared_log_error(np.exp(y), 
                                          np.exp(reg.predict(X))
                                          ))
    #return np.sqrt(mean_squared_log_error(np.exp(y), 
    #                                      np.exp(reg.predict(X
    #                                                        ))
    #                                      ))

    #return np.sqrt(mean_squared_log_error((y), 
    #                                      (reg.predict(X))
    #                                      ))
mean_temp_rmsle = np.mean(cross_val_score(reg,X,y,cv= 5,scoring='neg_mean_squared_log_error'))
print("RMSE with without target variable transformation : ", np.sqrt(mean_temp_rmsle * -1))

mean_temp_rmsle = np.mean(cross_val_score(reg, X, y,
                                          cv= 5,
                                          scoring=crossValidationScoring))
print("RMSE with post target variable transformation : ", mean_temp_rmsle)

RMSE with without target variable transformation :  0.11843599777284641


  after removing the cwd from sys.path.


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

from sklearn.model_selection import GridSearchCV
def gridSearchCVImp():
    start_time =datetime.datetime.now()
    reg = XGBRegressor(n_jobs =2, reg_alpha=.5, reg_lambda=.5, subsample=1)
    
    parameters = {
        'max_depth':list(range(3,4)),
        'colsample_bylevel' : np.arange(0.0, 1.0, 0.25),
        'n_estimators' : list(range(500,1500,100)),
        'colsample_bytree' : np.arange(0.0, 1.0, 0.25),
        #'reg_alpha': np.arange(0.0, 1.0, 0.25),
        #'reg_lambda': np.arange(0.0, 1.0, 0.25)
        }
    cv= ShuffleSplit(n_splits=5, test_size=.5, random_state=1986)
    reg_grid = GridSearchCV(reg, parameters, 
                            #cv=forCrossValidationStratifiedShuffleSplit(df_train),
                            cv=cv,
                            n_jobs = 2,
                            #scoring = 'neg_median_absolute_error',
                            #scoring = 'neg_mean_absolute_error',
                            #scoring = 'neg_mean_squared_log_error',
                            #scoring = crossValidationScoring,
                            verbose=1,
                            #error_score ='raise'
                            error_score =5
                            #pre_dispatch = 2
                           )
    reg_grid.fit(X,y)
    
    print("Total time for the gridserach", datetime.datetime.now() - start_time)
    
    return reg_grid
reg_grid = gridSearchCVImp()
print(reg_grid.best_estimator_)
print(reg_grid.best_score_)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
       colsample_bytree=0.75, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=2, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0.5, reg_lambda=0.5, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)
0.8998707938479593