In [215]:
from sklearn.model_selection import train_test_split
from pandas import DataFrame, read_csv
from sklearn import metrics
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.preprocessing import normalize, StandardScaler

import xgboost
from xgboost import plot_importance, XGBRegressor

import numpy as np
from math import sqrt

%matplotlib inline

In [216]:
project = 'house-prices'
version='v0.1'

In [217]:
fulltrain=read_csv('./train.csv',index_col=0)
test=read_csv('./test.csv',index_col=0)
fixed_seed=1234578416
train80, valid20 = train_test_split(fulltrain, test_size=0.2, random_state=fixed_seed)

In [218]:
train80_qual = train80.select_dtypes(exclude=np.number)
valid20_qual = valid20.select_dtypes(exclude=np.number)
test_qual = test.select_dtypes(exclude=np.number)
train80 = train80.select_dtypes(include=np.number)
valid20 = valid20.select_dtypes(include=np.number)
test = test.select_dtypes(include=np.number)

useless_fields=['MoSold', 'YrSold']

train80.drop(useless_fields, axis=1, inplace=True)
valid20.drop(useless_fields, axis=1, inplace=True)
test.drop(useless_fields, axis=1, inplace=True)

In [219]:
train80.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'SalePrice'],
      dtype='object')

In [220]:
train80.loc[:,'SalePrice']=np.log(train80.loc[:,'SalePrice'])
valid20.loc[:,'SalePrice']=np.log(valid20.loc[:,'SalePrice'])

In [221]:
mean_price_neigh={}
for name in list(train80_qual.Neighborhood.unique()):
    mean_price_neigh[name]=train80.loc[train80_qual['Neighborhood']==name,'SalePrice'].mean()
    
for name in list(train80_qual.Neighborhood.unique()):
    train80.loc[train80_qual['Neighborhood']==name,'mean_price']=mean_price_neigh[name]
    valid20.loc[valid20_qual['Neighborhood']==name,'mean_price']=mean_price_neigh[name]
    test.loc[test_qual['Neighborhood']==name,'mean_price']=mean_price_neigh[name]

# train80_qual['Neighborhood']
#mean_price_neigh

In [222]:
mean = {}
mode = {}
median ={}
for column in list(train80.columns):
    mean[column]=train80[column].mean()
    mode[column]=train80[column].mode()
    median[column]=train80[column].median()

In [223]:
target_column='SalePrice'
X_train = train80.drop(target_column, axis=1)
y_train = train80[target_column]
X_val = valid20.drop(target_column, axis=1)
y_val = valid20[target_column]

Surface Area

In [224]:
# normalization
scaler = StandardScaler()
scaler.fit(X_train)

    
for df in [X_train, X_val, test]:
    #for column in list(train80.columns):
    #    df.fillna(0, inplace=True)        
    scaler.transform(df, copy=False)

In [225]:
for df in [X_train, X_val, test]:
    df['Surface1'] =  df['2ndFlrSF'] * 0.67918 + df['1stFlrSF'] * 0.15835 + df['TotalBsmtSF'] * 0.56624
    df['Surface2'] =  df['1stFlrSF'] * 0.51 + df['TotalBsmtSF'] * 0.35
#    df.drop('2ndFlrSF', axis=1, inplace=True)
#    df.drop('1stFlrSF', axis=1, inplace=True)
#    df.drop('TotalBsmtSF', axis=1, inplace=True)
    df.drop('GrLivArea', axis=1, inplace=True)

In [226]:
model = XGBRegressor(booster="gbtree")

In [227]:
model.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [228]:
y_pred = model.predict(X_val)

In [229]:
RMSLE = sqrt(mean_squared_error(y_val,y_pred))
RMSLE

0.13413262961828934

In [230]:
test_pred = model.predict(test)

In [231]:
submission = DataFrame({"Id":test.index, "SalePrice": np.exp (test_pred)})