# Linear Regression on House Prices with XGBoost

In [1]:
keras_actif = False # Active Keras ou XGBoost
if keras_actif:
    xgb_actif = False
else:
    xgb_actif = True

## Preprocess data

In [2]:
import pandas as pd
import numpy as np
from numpy import transpose
from numpy import append
from numpy import reshape
import matplotlib.pyplot as plt
# %matplotlib notebook
import seaborn as sns

np.random.seed(42)
# tf.random.set_seed(42)
pd.set_option('display.max_columns', 100)

In [3]:
data = pd.read_csv('sources/train.csv')
data.drop('Id',axis = 1, inplace = True)
data.fillna(0, inplace=True)
data_num = data.select_dtypes(exclude=['object'])
data_cat = data.select_dtypes(include=['object'])

data_test = pd.read_csv('sources/test.csv')
data_test_id = data_test['Id']
data_test.drop('Id',axis = 1, inplace = True)
data_test.fillna(0, inplace=True)
data_test_num = data_test.select_dtypes(exclude=['object'])
data_test_cat = data_test.select_dtypes(include=['object'])

# columns where NaN values have meaning e.g. no pool etc.
cols_fillna = ['PoolQC','MiscFeature','Alley','Fence','MasVnrType','FireplaceQu',
               'GarageQual','GarageCond','GarageFinish','GarageType', 'Electrical',
               'KitchenQual', 'SaleType', 'Functional', 'Exterior2nd', 'Exterior1st',
               'BsmtExposure','BsmtCond','BsmtQual','BsmtFinType1','BsmtFinType2',
               'MSZoning', 'Utilities']

# replace 'NaN' with 'None' in these columns
for col in cols_fillna:
    data[col].fillna('None',inplace=True)
    data_test[col].fillna('None',inplace=True)

# Convert categorical variable into dummy/indicator variables.
data_cat = pd.get_dummies(data_cat)
data_test_cat = pd.get_dummies(data_test_cat)

print(f"Shape data : {data.shape}")
print(f"Shape data_num : {data_num.shape}")
print(f"Shape data_cat après get_dummies : {data_cat.shape}")
# print('')
# print(f"Features numériques : {list(data_num.columns)}")
# print('')
# print(f"Features catégorielles : {list(data_cat.columns)}")
print('')
print(f"Shape data_test : {data_test.shape}")
print(f"Shape data_test_num : {data_test_num.shape}")
print(f"Shape data_test_cat après get_dummies : {data_test_cat.shape}")
# print('')
# print(f"Features numériques : {list(data_test_num.columns)}")
# print('')
# print(f"Features catégorielles : {list(data_test_cat.columns)}")
# print('')

Shape data : (1460, 80)
Shape data_num : (1460, 37)
Shape data_cat après get_dummies : (1460, 268)

Shape data_test : (1459, 79)
Shape data_test_num : (1459, 36)
Shape data_test_cat après get_dummies : (1459, 256)


### Delete outliers

In [4]:
# Suppression des outliers
remove_outliers = True
if remove_outliers:
    from sklearn.ensemble import IsolationForest
    
    anomalies_ratio = 0.01
    clf = IsolationForest(contamination = anomalies_ratio,
                          behaviour= " new",
                          random_state = 42)

    clf.fit(data_num)
    y_noano = clf.predict(data_num)
    
    y_noano = pd.DataFrame(y_noano, columns = ['Top'])
    y_noano[y_noano['Top'] == 1].index.values

    data_num = data_num.iloc[y_noano[y_noano['Top'] == 1].index.values]
    data_num.reset_index(drop = True, inplace = True)
    
    data_cat = data_cat.iloc[y_noano[y_noano['Top'] == 1].index.values]
    data_cat.reset_index(drop = True, inplace = True)
    print("Number of Outliers:", y_noano[y_noano['Top'] == -1].shape[0])
    print("Number of rows without outliers:", data_num.shape[0])

Number of Outliers: 15
Number of rows without outliers: 1445


In [5]:
data[data['SaleCondition'] == 'Partial'].shape[0]

125

### Merge Num & Cat Dataset

In [6]:
# data = pd.concat([data_num, data_cat], ignore_index=False, axis=1)
data = pd.concat([data_num, data_cat], ignore_index=False, axis=1)
data_test = pd.concat([data_test_num, data_test_cat], ignore_index=False, axis=1)
print(f"Shape data : {data.shape}")
print(f"Shape data_test : {data_test.shape}")

Shape data : (1445, 305)
Shape data_test : (1459, 292)


### Add missing columns to data_test

In [7]:
# Reshape colonne similaire
missing_col = list(set(data.columns) - set(data_test.columns))
print(missing_col)
print('')

for col in missing_col:
    data_test[col] = 0
data_test = data_test[data.columns]

print(f"Shape data : {data.shape}")
print(f"Shape data_test : {data_test.shape}")
print('')
missing_col = list(set(data.columns) - set(data_test.columns))
print(missing_col)

['RoofMatl_Metal', 'Exterior1st_Stone', 'Condition2_RRAe', 'Exterior1st_ImStucc', 'Utilities_NoSeWa', 'SalePrice', 'PoolQC_Fa', 'HouseStyle_2.5Fin', 'Condition2_RRNn', 'Heating_Floor', 'Electrical_0', 'RoofMatl_Membran', 'Exterior2nd_Other', 'MiscFeature_TenC', 'RoofMatl_ClyTile', 'Condition2_RRAn', 'RoofMatl_Roll', 'GarageQual_Ex', 'Electrical_Mix', 'Heating_OthW']

Shape data : (1445, 305)
Shape data_test : (1459, 305)

[]


### Extract SalePrice from data

In [8]:
if 'SalePrice' in data.columns:
    Y_train = np.log(data['SalePrice']).to_numpy()
    Y_train = Y_train.reshape(Y_train.shape[0],1)
    del data['SalePrice']
    del data_test['SalePrice']

X_train = data.to_numpy()
X_test = data_test.to_numpy()

print(f"X train {X_train.shape}")
print(f"Y train {Y_train.shape}")
print(f"X test {X_test.shape}")

X train (1445, 304)
Y train (1445, 1)
X test (1459, 304)


### Scale features

"Normalizing" the data should help prevent values from "exploding":

In [9]:
scale_data = True
if scale_data:
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    scaler_Y = scaler.fit(Y_train)
    Y_train = scaler_Y.transform(Y_train)
    
print(f"X train {X_train.shape}")
print(f"Y train {Y_train.shape}")
print(f"X test {X_test.shape}")

X train (1445, 304)
Y train (1445, 1)
X test (1459, 304)


## Modelling

### Keras

In [10]:
if keras_actif:
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Input
    from tensorflow.keras.layers import Dense
    model = Sequential()
    model = Sequential([
        Input(shape=X_train.shape[1:]),
        Dense(28, activation='sigmoid'),
        Dense(14, activation='sigmoid'),
        Dense(1)
    ])
    print(model.summary())
    
    from tensorflow.keras.optimizers import Adam
    model.compile(
        loss='mean_squared_error',
        optimizer=Adam(learning_rate=0.00014, beta_1=0.9, beta_2=0.999, amsgrad=False)
    )

    BATCH_SIZE = 13 #
    EPOCHS = 300 # how many iterations over the whole dataset
    history = model.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)

    hist = pd.DataFrame(history.history)
    sns.relplot(x=hist.index, y='loss', kind='line', data=hist)
    
# BigML
# "network1": {
#     "learn_residuals": false,
#     "dropout_rate": 0,
#     "activation_functions": [
#         "sigmoid",
#         "sigmoid"
#     ],
#     "layer_sizes": [
#         28,
#         14
#     ],
#     "descent_algorithm": "adam",
#     "learning_rate": 0.00014,
#     "batch_size": 13,
#     "seed": "0-1-2-3-4-5-6-7-8-9-10-cand-1990",
#     "beta2": 0.999,
#     "beta1": 0.9,
#     "epsilon": 0,
#     "tree_embedding": false,
#     "max_training_time": 38964.64948,
#     "batch_normalization": false,
#     "outputs": 1
# },

### XGBRegressor

In [15]:
if xgb_actif:
    import xgboost as xgb
    
    optimisation_gridsearchCV = False
    if optimisation_gridsearchCV:
        from sklearn.model_selection import GridSearchCV
        gb_model = xgb.XGBRegressor(objective='reg:squarederror')
        params = {
            'kernel':('linear', 'poly', 'poly', 'rbf', 'rbf'),
            'min_child_weight':[4,5],
            'gamma':[i/10.0 for i in range(3,6)],
            'subsample':[i/10.0 for i in range(6,11)],
            'colsample_bytree':[i/10.0 for i in range(6,11)],
            'max_depth': [2,3,4]
        }

        xgb_model = GridSearchCV(xgb_model, params)
        xgb_model.fit(X_train, Y_train)

        xgb_model.best_estimator_
    else:
        xgb_model = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0.3,
             importance_type='gain', kernel='linear', learning_rate=0.1,
             max_delta_step=0, max_depth=3, min_child_weight=4, missing=None,
             n_estimators=100, n_jobs=1, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
             subsample=0.8, verbosity=1)
        
        xgb_model.fit(X_train, Y_train)
    
    optimisation_flow = True
    if optimisation_flow:
        from sklearn import metrics
        from sklearn.model_selection import train_test_split
        X_split_train,X_split_test, y_split_train, y_split_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)
        xgb_split = xgb.XGBRegressor()
        XgBoost = xgb_split.fit(X_split_train, y_split_train)
        Y_train_pred_80 = xgb_split.predict(X_split_train) # Prediction sur train (80%)
        Y_train_pred_20 = xgb_split.predict(X_split_test) # Prediction sur val (20%)

        print(' > Results on train :')
#         print(' >> RMSLE : ', np.sqrt(metrics.mean_squared_log_error(y_split_train, Y_train_pred_80)))
        print(' >> MAE : ', metrics.mean_absolute_error(y_split_train, Y_train_pred_80))
        print()

        print (' > Results on val :')
#         print(' >> RMSLE : ', np.sqrt(metrics.mean_squared_log_error(y_split_test, Y_train_pred_20)))
        print(' >> MAE : ', metrics.mean_absolute_error(y_split_test, Y_train_pred_20))
        print()    

 > Results on train :
 >> MAE :  0.14452049727628635

 > Results on val :
 >> MAE :  0.2131217520585995



## Prepare for Kaggle submisison

In [12]:
if keras_actif:
    Y_test = model.predict(x=X_test)
if xgb_actif:
    Y_test = xgb_model.predict(X_test)

Y_test = np.exp(scaler_Y.inverse_transform(Y_test))
Y_test = np.reshape(Y_test, Y_test.shape[0],)

In [13]:
data_test['Id'] = data_test_id
data_test['SalePrice'] = Y_test
data_test.drop(data_test.columns.difference(['Id','SalePrice']), 1, inplace=True)
data_test.to_csv('storage/allfeatures_.csv', index=False)