In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
trainset = pd.read_csv("train.csv")
testset = pd.read_csv("test.csv")


In [3]:
train_id = trainset.Id
test_id = testset.Id
train_salesprice = trainset.SalePrice

trainset.drop("Id", axis=1, inplace=True)
testset.drop("Id", axis=1, inplace=True)


In [4]:
data = pd.concat([trainset.drop(['SalePrice'], axis=1), testset])
data.head()


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [5]:
len(data)

2919

In [25]:
#Now we check for the null values
all_data_na = (data.isnull().sum() / len(data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio


In [9]:
data['PoolQC']=data['PoolQC'].fillna('None')

In [10]:
data['MiscFeature']=data['MiscFeature'].fillna('None')

In [11]:
data['Alley']=data['Alley'].fillna('None')

In [12]:
data['Fence']=data['Fence'].fillna('None')

In [13]:
data['FireplaceQu']=data['FireplaceQu'].fillna('None')


In [14]:
#Here if the lot frontage in NA doesnt mean it does not exist, so we group it by neighbourhood 
data.LotFrontage = data.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))

In [15]:
data['GarageFinish']=data['GarageFinish'].fillna('None')


In [16]:
for column in ('GarageType','GarageQual', 'GarageCond'):
    data[column] = data[column].fillna('None')

In [17]:
    
for column in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    data[column] = data[column].fillna(0)
#We will do the same for all the basement features since many houses do not have the features of a basement
for column in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    data[column] = data[column].fillna(0)
    
for column in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    data[column] = data[column].fillna('None')

In [18]:
data['MasVnrType']=data['MasVnrType'].fillna('None')
data['MasVnrArea']=data['MasVnrArea'].fillna(0)


In [19]:
data['MSZoning']= data['MSZoning'].fillna(data['MSZoning'].mode()[0])

In [21]:
data['Functional']=data['Functional'].fillna('Typ')


In [22]:
data['Utilities']= data['Utilities'].fillna(data['Utilities'].mode()[0])

In [23]:
data['SaleType']= data['SaleType'].fillna(data['SaleType'].mode()[0])

In [24]:
data['KitchenQual']= data['KitchenQual'].fillna(data['KitchenQual'].mode()[0])
data['Electrical']= data['Electrical'].fillna(data['Electrical'].mode()[0])
data['Exterior2nd']= data['Exterior2nd'].fillna(data['Exterior2nd'].mode()[0])
data['Exterior1st']= data['Exterior1st'].fillna(data['Exterior1st'].mode()[0])


In [26]:
#Now since we have handled all our missing values we will move on to Label Encoding our Categorical data

from sklearn.preprocessing import LabelEncoder

#Storing our categorical columns
categorical_columns = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street'
, 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')


#Process columns, apply LabelEncoder to categorical features
for column in categorical_columns:
    lbl = LabelEncoder() 
    lbl.fit(list(data[column].values)) 
    data[column] = lbl.transform(list(data[column].values))

# shape of our dataset
print('Shape all_data: {}'.format(data.shape))

Shape all_data: (2919, 79)


In [27]:
#There are many numerical variables, which are actually categorical
#To handle this anomally we will convert these numerical data to strings

#MSSubClass=The building class
data['MSSubClass'] = data['MSSubClass'].apply(str)


#Changing OverallCond into a categorical variable
data['OverallCond'] = data['OverallCond'].astype(str)


#Year and month sold are transformed into categorical features.
data['YrSold'] = data['YrSold'].astype(str)
data['MoSold'] = data['MoSold'].astype(str)

In [28]:
data= pd.get_dummies(data)
data

Unnamed: 0,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,1,1,3,0,7,2003,2003,196.0,...,0,0,0,1,0,0,0,0,1,0
1,80.0,9600,1,1,3,0,6,1976,1976,0.0,...,0,0,0,1,0,0,0,0,1,0
2,68.0,11250,1,1,0,0,7,2001,2002,162.0,...,0,0,0,1,0,0,0,0,1,0
3,60.0,9550,1,1,0,0,7,1915,1970,0.0,...,0,0,0,1,1,0,0,0,0,0
4,84.0,14260,1,1,0,0,8,2000,2000,350.0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,21.0,1936,1,1,3,0,4,1970,1970,0.0,...,0,0,0,1,0,0,0,0,1,0
1455,21.0,1894,1,1,3,0,4,1970,1970,0.0,...,0,0,0,1,1,0,0,0,0,0
1456,160.0,20000,1,1,3,0,5,1960,1996,0.0,...,0,0,0,1,1,0,0,0,0,0
1457,62.0,10441,1,1,3,0,5,1992,1992,0.0,...,0,0,0,1,0,0,0,0,1,0


In [29]:
x= data[:1460]
test= data[1460:]
y=train_salesprice

In [30]:
#Splitting data into training and test sets


from sklearn.model_selection import train_test_split 

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [32]:
from xgboost import XGBRegressor 
model_xgb = XGBRegressor()
model_xgb.fit(x_train, y_train)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [37]:
y_pred= model_xgb.predict(test)
testset['SalePrice']=y_pred
testset['Id']=test_id
testset[['Id', 'SalePrice']].to_csv('xgb_submission.csv', index=False)