In [292]:
import pandas as pd
import matplotlib.pyplot as plt

In [293]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [375]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [378]:
train_features = train.drop(['SalePrice','Id'],axis=1)
test = test.drop(['Id'],axis=1)

In [379]:
train_features.shape

(1460, 79)

In [380]:
test.shape

(1459, 79)

# Deal with missing values

In [381]:
def DropMissingValue(data:pd,threshold = 0.3) -> pd: # drop missing value that exceed threshold of number of observations
    df_null = data.isnull().sum()
    null_index = df_null[df_null >= threshold*data.shape[0]].index
    data = data.drop(null_index,axis=1)
    return data

In [382]:
train_new = DropMissingValue(train_features)
test_new = DropMissingValue(test)

In [383]:
train_new.columns == test_new.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

# Fill nan with mean for float numbers except for year,fill nan with mode for categorical values

In [384]:
train_null = train_new.isnull().sum().sort_values(ascending=False) != 0
train_null_idx = train_null[train_null].index

In [385]:
test_null = test_new.isnull().sum().sort_values(ascending=False) != 0
test_null_idx = test_null[test_null].index

In [386]:
train_new[train_null_idx].dtypes

LotFrontage     float64
GarageFinish     object
GarageType       object
GarageCond       object
GarageQual       object
GarageYrBlt     float64
BsmtExposure     object
BsmtFinType2     object
BsmtFinType1     object
BsmtCond         object
BsmtQual         object
MasVnrArea      float64
MasVnrType       object
Electrical       object
dtype: object

In [387]:
train_null_idx

Index(['LotFrontage', 'GarageFinish', 'GarageType', 'GarageCond', 'GarageQual',
       'GarageYrBlt', 'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1',
       'BsmtCond', 'BsmtQual', 'MasVnrArea', 'MasVnrType', 'Electrical'],
      dtype='object')

In [388]:
train_new[['LotFrontage','MasVnrArea']] = train_new[['LotFrontage','MasVnrArea']].fillna(train_new.mean().iloc[0])

In [389]:
train_new[[ 'GarageCond', 'GarageType', 'GarageYrBlt','GarageFinish', 'GarageQual', 'BsmtFinType2', 'BsmtExposure',
           'BsmtFinType1', 'BsmtCond', 'BsmtQual', 'MasVnrType','Electrical']] = train_new[[ 'GarageCond', 'GarageType', 'GarageYrBlt','GarageFinish', 'GarageQual', 'BsmtFinType2', 'BsmtExposure',
           'BsmtFinType1', 'BsmtCond', 'BsmtQual', 'MasVnrType','Electrical']].fillna(train_new.mode().iloc[0])

In [390]:
test_null_idx

Index(['LotFrontage', 'GarageFinish', 'GarageCond', 'GarageQual',
       'GarageYrBlt', 'GarageType', 'BsmtCond', 'BsmtQual', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'MasVnrArea', 'MSZoning',
       'Functional', 'BsmtHalfBath', 'BsmtFullBath', 'Utilities',
       'Exterior2nd', 'Exterior1st', 'KitchenQual', 'TotalBsmtSF',
       'GarageCars', 'SaleType', 'BsmtUnfSF', 'GarageArea', 'BsmtFinSF2',
       'BsmtFinSF1'],
      dtype='object')

In [391]:
test_new[['LotFrontage','MasVnrArea','BsmtHalfBath','BsmtFullBath','BsmtUnfSF','BsmtFinSF2','BsmtFinSF1','TotalBsmtSF',
          'GarageArea']] = test_new[['LotFrontage','MasVnrArea','BsmtHalfBath','BsmtFullBath','BsmtUnfSF','BsmtFinSF2','BsmtFinSF1','TotalBsmtSF',
          'GarageArea']].fillna(train_new.mean().iloc[0])

In [392]:
test_new[[ 'GarageCond', 'GarageQual', 'GarageYrBlt',
       'GarageFinish', 'GarageType', 'BsmtCond', 'BsmtQual', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'MSZoning', 'Utilities', 'Functional', 'BsmtUnfSF',
       'SaleType', 'Exterior2nd', 'Exterior1st', 'GarageCars', 'KitchenQual']] = test_new[[ 'GarageCond', 'GarageQual', 'GarageYrBlt',
       'GarageFinish', 'GarageType', 'BsmtCond', 'BsmtQual', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'MSZoning', 'Utilities', 'Functional', 'BsmtUnfSF',
       'SaleType', 'Exterior2nd', 'Exterior1st', 'GarageCars', 'KitchenQual']].fillna(train_new.mode().iloc[0])

# Categorical features

In [393]:
df = pd.concat([train_new,test_new])  #Combine train and test data

In [394]:
def Dummies(data:pd) -> pd:  # dummy all categorical features
    object_features = data.dtypes[df.dtypes == object].index
    dummies = pd.get_dummies(data[object_features])
    df_new = pd.concat([data,dummies],axis=1)
    df_new.drop(columns=object_features,inplace=True)
    df_new = df_new.loc[:,~df_new.columns.duplicated()]
    return df_new

In [395]:
df_new = Dummies(df)

In [396]:
df_new.shape

(2919, 270)

# Split the train and test back

In [397]:
train_new = df_new.iloc[:1460,:]
test_new = df_new.iloc[1460:,:]

In [398]:
X = train_new
y = train['SalePrice']

# Build model

In [399]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [400]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [401]:
import xgboost as xgb

In [402]:
xgb_model = xgb.XGBRegressor(learning_rate=0.1,max_depth=3)

In [403]:
xgb_model.fit(X_train,y_train)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [404]:
preds = xgb_model.predict(X_test)

In [405]:
RMSE = sqrt(mean_squared_error(y_test,preds))

In [406]:
print(RMSE)

27470.507458542867


# Predict Test dataset

In [407]:
sub = pd.read_csv('sample_submission.csv')

In [408]:
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [409]:
saleprice = xgb_model.predict(test_new )

In [410]:
sub['SalePrice'] = saleprice

In [411]:
sub.to_csv('submission.csv',index=False)

In [412]:
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,122242.242188
1,1462,154532.75
2,1463,179697.90625
3,1464,186300.578125
4,1465,200985.0625
