# The XGBoost 1 feature regression

## Imports

### Py Imports

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sbn
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
import pickle
import os.path as path

### Datasets import

In [2]:
# Read the train and test CSVs, using the first column of each file as the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./sources/clean_train.csv', './sources/clean_test.csv')
)

### Load variables

In [3]:
# The jar stores a pair that unpacks into num_col and cat_col.
# NOTE: pickle.load can execute arbitrary code; only open jars produced by this project.
with open('./jar/columns.pkl', 'rb') as columns_file:
    num_col, cat_col = pickle.load(columns_file)

## work on dataset

### Prepare and scale the data

In [4]:
# Separate the feature columns from the 'SalePrice' target.
X_train_full = train.drop(columns=['SalePrice'])
Y_train_full = train['SalePrice']

In [5]:
# Size of the full training frame, target column included.
train.shape

(1457, 221)

In [6]:
# Same rows as `train`, minus the dropped 'SalePrice' column.
X_train_full.shape

(1457, 220)

In [7]:
# The target is still a 1-D Series at this point.
Y_train_full.shape

(1457,)

In [8]:
# Eyeball the target values.
# NOTE(review): values around 12 suggest SalePrice was log-transformed upstream
# (np.exp is applied to it later in this notebook) — confirm against the cleaning step.
Y_train_full

Id
1       12.247694
2       12.109011
3       12.317167
4       11.849398
5       12.429216
          ...    
1456    12.072541
1457    12.254863
1458    12.493130
1459    11.864462
1460    11.901583
Name: SalePrice, Length: 1457, dtype: float64

In [9]:
def set_one_feature(dataframe):
    """Add a 'Surface' column equal to GrLivArea + TotalBsmtSF.

    The column is written onto the frame that was passed in (no copy is made),
    and that same frame is returned, so the function can be used either for
    its side effect or for its return value.
    """
    living_area = dataframe['GrLivArea']
    basement_area = dataframe['TotalBsmtSF']
    dataframe['Surface'] = living_area + basement_area
    return dataframe

In [10]:
# Add 'Surface' to the feature frame, then keep only that column as a float
# array with one row per sample and a single column.
X_train_full = (
    set_one_feature(X_train_full)['Surface']
    .values
    .astype(float)
    .reshape(-1, 1)
)

In [11]:
# set_one_feature writes the 'Surface' column onto `test` itself and returns
# that same frame, so X_test and test refer to one DataFrame here.
X_test = set_one_feature(test)
type(X_test)

pandas.core.frame.DataFrame

In [12]:
# Keep only the 'Surface' column of the test frame, as a float column vector.
X_test = X_test['Surface'].values.astype(float).reshape(-1, 1)

In [13]:
# Rebuild the target from `train` as a float column vector.
Y_train_full = train['SalePrice'].values.astype(float).reshape(-1, 1)

In [14]:
# The target is now a 2-D column vector.
Y_train_full.shape

(1457, 1)

In [15]:
# A single feature column, with the same number of rows as the target.
X_train_full.shape

(1457, 1)

Split into train and validation sets (normalization follows in the next cell)

In [16]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X_train_full,
    Y_train_full,
    test_size=0.2,
    random_state=42,
)

# WARNING: the split targets are in log space. Exponentiate them to obtain the
# values that serve as ground truth when the RMSLE is computed.
y_train, y_val = np.exp(y_train_log), np.exp(y_val_log)

# Persist the split for further use (the pickled targets are the exponentiated ones).
with open('./jar/train-test.pkl', 'wb') as split_file:
    pickle.dump((X_train, X_val, y_train, y_val), split_file)

In [19]:
# Standardize the feature using statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# NOTE(review): the visible cells fit and predict on the unscaled
# X_train / X_val / X_test — confirm whether the scaled features are meant to be used.

# Standardize the log target; scaler_Y is needed later to undo this on predictions.
scaler_Y = StandardScaler()
y_train_log_scaled = scaler_Y.fit_transform(y_train_log)

## Model building

Load the cached best model if one exists; otherwise run the grid search and cache the result

In [20]:
# Reuse the cached grid-search result when there is one; otherwise run the
# search and cache it.

if path.isfile('./jar/best-model.pkl'):
    # NOTE: pickle.load can execute arbitrary code; only load a cache written by
    # this notebook. Delete the file to force a new search after the data or
    # the parameter grid changes.
    with open('./jar/best-model.pkl', 'rb') as best_model:
        model = pickle.load(best_model)
else:
    # Exhaustive 5-fold search. Only parameters XGBRegressor understands belong
    # in the grid: an SVM-style 'kernel' entry is not an XGBoost parameter and
    # would merely be ignored with a warning on every fit.
    model = GridSearchCV(
        xgb.XGBRegressor(objective='reg:squarederror'),
        cv=5,
        param_grid={
            'min_child_weight': [4, 5],
            'gamma': [0.3, 0.4, 0.5],
            'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
            'max_depth': [2, 3, 4],
        },
    )

    # The features go in unscaled; the target is the standardized log target.
    model.fit(X_train, y_train_log_scaled)

    # Cache the fitted search object so later runs skip the search.
    with open('./jar/best-model.pkl', 'wb') as best:
        pickle.dump(model, best)

Train and evaluate the model

In [21]:
# The model was fitted on the standardized log target, so these predictions
# are on that scale too and still have to be transformed back.
y_pred_log_scaled = model.predict(X_val)
y_train_pred_log_scaled = model.predict(X_train)

In [22]:
# StandardScaler expects 2-D (n_samples, n_features) input and current
# scikit-learn rejects 1-D arrays, so route the predictions through a column
# vector and flatten the result back to 1-D.
y_pred_log = scaler_Y.inverse_transform(
    np.asarray(y_pred_log_scaled).reshape(-1, 1)
).ravel()
y_train_pred_log = scaler_Y.inverse_transform(
    np.asarray(y_train_pred_log_scaled).reshape(-1, 1)
).ravel()

# Undo the log so the predictions are comparable with y_train / y_val.
y_pred = np.exp(y_pred_log)
y_train_pred = np.exp(y_train_pred_log)

In [23]:
# RMSLE between the exponentiated targets and the exponentiated predictions,
# for the rows the model was fitted on and for the held-out validation rows.
train_rmsle = np.sqrt(mean_squared_log_error(y_train, y_train_pred))
val_rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred))

print("Train RMSLE: %f" % (train_rmsle))
print("Val RMSLE: %f" % (val_rmsle))

Train RMSLE: 0.218308
Val RMSLE: 0.239291


Make predictions

In [None]:
# Predict for the test rows from their single 'Surface' feature; the output is
# on the model's training-target scale and is converted back in the next cell.
y_test_pred = model.predict(X_test)

In [None]:
# Undo the target standardization, then the log. StandardScaler expects 2-D
# input (current scikit-learn rejects 1-D arrays), hence the reshape to a
# column vector and the flatten afterwards.
y_test = np.exp(
    scaler_Y.inverse_transform(np.asarray(y_test_pred).reshape(-1, 1))
).ravel()
y_test