# XGBoost regression on a single feature

## Imports

### Py Imports

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sbn
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import pickle
import os.path as path

### Datasets import

In [2]:
# Read the cleaned train/test tables; the first CSV column is used as the row index.
train, test = [
    pd.read_csv(csv_file, index_col=0)
    for csv_file in ('./sources/clean_train.csv', './sources/clean_test.csv')
]

### Load variables

In [3]:
# Restore the two objects stored in columns.pkl.
# NOTE(review): presumably the numeric and categorical column-name lists — confirm
# against the notebook that wrote the pickle.
with open('./jar/columns.pkl', 'rb') as columns_file:
    num_col, cat_col = pickle.load(columns_file)

## Work on the dataset

### Prepare and scale the data

In [4]:
# Separate the predictors from the 'SalePrice' target column.
X_train_full = train.drop(columns='SalePrice')
Y_train_full = train.loc[:, 'SalePrice']

In [5]:
# Shape of the full training table (the 'SalePrice' target column is still included).
train.shape

(1457, 221)

In [6]:
# Shape of the predictor table: one column fewer than `train`, since 'SalePrice' was dropped.
X_train_full.shape

(1457, 220)

In [7]:
# The target is still a 1-D Series at this point.
Y_train_full.shape

(1457,)

In [8]:
# Helper that derives the single model feature, 'Surface'.

def set_one_feature(dataframe):
    """Add a 'Surface' column holding the sum of 'GrLivArea' and 'TotalBsmtSF'.

    The column is written onto the frame that is passed in, so the caller's
    object is modified; the same frame is returned for convenience.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the 'GrLivArea' and 'TotalBsmtSF' columns.

    Returns
    -------
    pandas.DataFrame
        The very same object, now carrying the extra 'Surface' column.
    """
    gr_liv = dataframe['GrLivArea']
    bsmt = dataframe['TotalBsmtSF']
    dataframe['Surface'] = gr_liv.add(bsmt)
    return dataframe

In [9]:
# Add 'Surface' to the predictor frame, then keep only that column as a float
# column vector of shape (n_rows, 1).
# NOTE: X_train_full changes from DataFrame to ndarray here, so this cell can
# only be run once per kernel session.
surface_train = set_one_feature(X_train_full)['Surface']
X_train_full = surface_train.values.astype(float).reshape(-1, 1)

In [10]:
# set_one_feature returns the frame it was given, so X_test is the same object
# as `test`, which now also carries the 'Surface' column.
X_test = set_one_feature(test)
type(X_test)

pandas.core.frame.DataFrame

In [11]:
# Keep only 'Surface' from the test frame, as a float column vector of shape (n_rows, 1).
X_test = X_test['Surface'].values.astype(float).reshape(-1, 1)

In [12]:
# Re-derive the target from `train` as a float column vector of shape (n_rows, 1).
Y_train_full = train['SalePrice'].values.astype(float).reshape(-1, 1)

In [13]:
# After the reshape the target is a 2-D column vector.
Y_train_full.shape

(1457, 1)

In [14]:
# The predictor matrix now holds a single column ('Surface').
X_train_full.shape

(1457, 1)

In [15]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    Y_train_full,
    test_size=0.2,
    random_state=42,
)

# Persist the four split arrays with pickle for later reuse.
split_arrays = (X_train, X_val, y_train, y_val)
with open('./jar/train-test.pkl', 'wb') as split_file:
    pickle.dump(split_arrays, split_file)

Normalize Data

In [None]:
# Standardise the single feature. The statistics come from the training split
# only and are then applied unchanged to the validation and test matrices, so
# every input the model sees shares one scale.
# NOTE: this cell overwrites X_train / X_val / X_test, so it must run exactly
# once per kernel session; re-running it would standardise the data twice.
# NOTE(review): a './jar/best-model.pkl' cached earlier may have been fitted on
# unscaled inputs -- confirm, and delete the cache if so.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# A separate scaler is required for the target: `fit` returns the estimator
# itself, so re-fitting `scaler` here would replace the feature statistics.
# The scaled target is stored under its own name (`Y_train`); `y_train` stays
# on its original scale, which is what the model-fitting and RMSE cells use.
scaler_Y = StandardScaler().fit(y_train)
Y_train = scaler_Y.transform(y_train)

## Model building

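Load the cached best model if one exists; otherwise grid-search the XGBoost hyperparameters with 5-fold cross-validation on the training split and cache the result.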

In [17]:
# Obtain the tuned model: reuse the pickled search result when one exists,
# otherwise run a cross-validated grid search and cache it.

best_model_path = './jar/best-model.pkl'

if path.isfile(best_model_path):
    # SECURITY: pickle.load can execute arbitrary code from the file; only load
    # a cache that this notebook itself produced.
    with open(best_model_path, 'rb') as best_model:
        model = pickle.load(best_model)
else:
    # Exhaustive 5-fold search over tree-booster hyperparameters.
    # The grid lists only parameters XGBoost defines; 'kernel' is an option of
    # kernel methods such as SVR and has no meaning for XGBRegressor.
    model = GridSearchCV(
        xgb.XGBRegressor(objective='reg:squarederror'),
        cv=5,
        param_grid={
            'min_child_weight': [4, 5],
            'gamma': [i / 10.0 for i in range(3, 6)],
            'subsample': [i / 10.0 for i in range(6, 11)],
            'colsample_bytree': [i / 10.0 for i in range(6, 11)],
            'max_depth': [2, 3, 4],
        },
    )

    # With GridSearchCV's default refit behaviour the best configuration is
    # re-trained on all of X_train, so `model.predict` uses that estimator.
    model.fit(X_train, y_train)

    # Cache the fitted search object so later runs skip the search.
    with open(best_model_path, 'wb') as best:
        pickle.dump(model, best)

Train and evaluate the model

In [18]:
# Predict on both splits so training and validation error can be compared.
y_train_pred = model.predict(X_train)
y_preds = model.predict(X_val)

In [19]:
# Root-mean-squared error on each split, on the scale the target was trained on.
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_preds)
train_rmse = np.sqrt(train_mse)
val_rmse = np.sqrt(val_mse)

print("Train RMSE: %f" % train_rmse)
print("Val RMSE: %f" % val_rmse)

Train RMSE: 0.221002
Val RMSE: 0.239423


Make predictions

In [20]:
# Predict for the test set from its single 'Surface' feature column.
y_test_pred = model.predict(X_test)

In [22]:
# Back-transform the predictions with exp.
# NOTE(review): presumably 'SalePrice' in the cleaned training CSV is stored as
# a natural log -- confirm against the cleaning step that produced it.
y_test = np.exp(y_test_pred)
y_test

array([120013.125, 178645.   , 170637.31 , ..., 170637.31 , 128423.195,
       182173.67 ], dtype=float32)