In [292]:
import pandas as pd
import matplotlib.pyplot as plt

In [293]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [294]:
train_features = train.drop(['SalePrice'],axis=1)

In [295]:
train_features.shape

(1460, 80)

In [191]:
test.shape

(1459, 80)

# Deal with missing values

In [192]:
def DropMissingValue(data:pd,threshold = 0.3) -> pd: # drop missing value that exceed threshold of number of observations
    df_null = data.isnull().sum()
    null_index = df_null[df_null >= threshold*data.shape[0]].index
    data = data.drop(null_index,axis=1)
    return data

In [193]:
train_new = DropMissingValue(train_features)
test_new = DropMissingValue(test)

In [194]:
train_new.columns == test_new.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

# Fill nan with mean for float numbers except for year,fill nan with mode for categorical values

In [195]:
train_null = train_new.isnull().sum().sort_values(ascending=False) != 0
train_null_idx = train_null[train_null].index

In [196]:
test_null = test_new.isnull().sum().sort_values(ascending=False) != 0
test_null_idx = test_null[test_null].index

In [197]:
train_new[train_null_idx].dtypes

LotFrontage     float64
GarageCond       object
GarageType       object
GarageYrBlt     float64
GarageFinish     object
GarageQual       object
BsmtFinType2     object
BsmtExposure     object
BsmtFinType1     object
BsmtCond         object
BsmtQual         object
MasVnrType       object
MasVnrArea      float64
Electrical       object
dtype: object

In [198]:
train_null_idx

Index(['LotFrontage', 'GarageCond', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'BsmtFinType2', 'BsmtExposure',
       'BsmtFinType1', 'BsmtCond', 'BsmtQual', 'MasVnrType', 'MasVnrArea',
       'Electrical'],
      dtype='object')

In [199]:
train_new[['LotFrontage','MasVnrArea']] = train_new[['LotFrontage','MasVnrArea']].fillna(train_new.mean().iloc[0])

In [200]:
train_new[[ 'GarageCond', 'GarageType', 'GarageYrBlt','GarageFinish', 'GarageQual', 'BsmtFinType2', 'BsmtExposure',
           'BsmtFinType1', 'BsmtCond', 'BsmtQual', 'MasVnrType','Electrical']] = train_new[[ 'GarageCond', 'GarageType', 'GarageYrBlt','GarageFinish', 'GarageQual', 'BsmtFinType2', 'BsmtExposure',
           'BsmtFinType1', 'BsmtCond', 'BsmtQual', 'MasVnrType','Electrical']].fillna(train_new.mode().iloc[0])

In [201]:
test_null_idx

Index(['LotFrontage', 'GarageCond', 'GarageQual', 'GarageYrBlt',
       'GarageFinish', 'GarageType', 'BsmtCond', 'BsmtQual', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'MasVnrArea', 'MSZoning',
       'BsmtHalfBath', 'Utilities', 'Functional', 'BsmtFullBath', 'BsmtUnfSF',
       'SaleType', 'BsmtFinSF2', 'BsmtFinSF1', 'Exterior2nd', 'Exterior1st',
       'TotalBsmtSF', 'GarageCars', 'KitchenQual', 'GarageArea'],
      dtype='object')

In [202]:
test_new[['LotFrontage','MasVnrArea','BsmtHalfBath','BsmtFullBath','BsmtUnfSF','BsmtFinSF2','BsmtFinSF1','TotalBsmtSF',
          'GarageArea']] = test_new[['LotFrontage','MasVnrArea','BsmtHalfBath','BsmtFullBath','BsmtUnfSF','BsmtFinSF2','BsmtFinSF1','TotalBsmtSF',
          'GarageArea']].fillna(train_new.mean().iloc[0])

In [203]:
test_new[[ 'GarageCond', 'GarageQual', 'GarageYrBlt',
       'GarageFinish', 'GarageType', 'BsmtCond', 'BsmtQual', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'MSZoning', 'Utilities', 'Functional', 'BsmtUnfSF',
       'SaleType', 'Exterior2nd', 'Exterior1st', 'GarageCars', 'KitchenQual']] = test_new[[ 'GarageCond', 'GarageQual', 'GarageYrBlt',
       'GarageFinish', 'GarageType', 'BsmtCond', 'BsmtQual', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'MSZoning', 'Utilities', 'Functional', 'BsmtUnfSF',
       'SaleType', 'Exterior2nd', 'Exterior1st', 'GarageCars', 'KitchenQual']].fillna(train_new.mode().iloc[0])

# Categorical features

In [235]:
df = pd.concat([train_new,test_new])  #Combine train and test data

In [252]:
def Dummies(data:pd) -> pd:  # dummy all categorical features
    object_features = data.dtypes[df.dtypes == object].index
    dummies = pd.get_dummies(data[object_features])
    df_new = pd.concat([data,dummies],axis=1)
    df_new.drop(columns=object_features,inplace=True)
    df_new = df_new.loc[:,~df_new.columns.duplicated()]
    return df_new

In [253]:
df_new = Dummies(df)

In [254]:
df_new.shape

(2919, 271)

# Split the train and test back

In [307]:
train_new = df_new.iloc[:1460,:]
test_new = df_new.iloc[1460:,:]

In [309]:
X = train_new
y = train['SalePrice']

# Build model

In [315]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [311]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [288]:
import xgboost as xgb

In [352]:
xgb_model = xgb.XGBRegressor(learning_rate=0.1,max_depth=3)

In [353]:
xgb_model.fit(X_train,y_train)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [354]:
preds = xgb_model.predict(X_test)

In [355]:
RMSE = sqrt(mean_squared_error(y_test,preds))

In [356]:
print(RMSE)

32994.42865833424


# Predict Test dataset

In [366]:
sub = pd.read_csv('sample_submission.csv')

In [367]:
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [368]:
saleprice = xgb_model.predict(test_new )

In [369]:
sub['SalePrice'] = saleprice

In [373]:
sub.to_csv('submission.csv',index=False)

In [374]:
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,122315.054688
1,1462,170378.546875
2,1463,182998.390625
3,1464,182657.703125
4,1465,188864.359375
