# Gradient Boosting Model

In [22]:
import numpy as np
import pandas as pd

In [23]:
pd.options.display.max_rows = None
pd.options.display.max_colwidth = -1
pd.options.display.width = None
pd.options.display.max_columns = None

In [24]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:                
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)        

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [25]:
df= pd.read_csv('../files/Kaggle Competition/House Prediction/final_data_cleaned_dummies.csv',encoding = 'iso-8859-1')

In [26]:
df = reduce_mem_usage(df)

Memory usage of dataframe is 90.01 MB --> 19.16 MB (Decreased by 78.7%)


In [27]:
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [28]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 0)

In [29]:
from xgboost import XGBRegressor

In [30]:
%%time
xg = XGBRegressor(n_estimators=200, n_jobs=-1, objective='reg:squarederror', max_depth=10)
xg.fit(x_train,y_train)

Wall time: 2min 1s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=None, n_estimators=200,
             n_jobs=-1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [31]:
xg.score(x_test,y_test)

0.9337570505549619

In [33]:
# k-fold Cross validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = xg, X = x_train, y = y_train, cv = 10, n_jobs=-1)
accuracies.mean()

0.9273423361227081

In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [15]:
%%time
# Create a pipeline
pipe = make_pipeline((XGBRegressor()))
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"xgbregressor": [XGBRegressor()],
                 "xgbregressor__n_estimators": [100,200],
                 "xgbregressor__max_depth":[10,15,20]                 
                 }]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=10, verbose=2,n_jobs=7) # Fit grid search
best_model = gridsearch.fit(x_train,y_train)
print(best_model.best_score_)
print(best_model.best_params_)
print(best_model.best_estimator_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  30 out of  30 | elapsed: 25.7min finished


0.9243505651953151
{'xgbregressor': XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=None, n_estimators=200,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1), 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 200}
Pipeline(memory=None,
         steps=[('xgbregressor',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0,
                              importance_type='gain', learning_rate=0.1,
                              max_delta_step=0, max_depth=10,
                              

# Finalized parameters

n_estimators = 200, max_depth = 10

In [20]:
import pickle

In [21]:
pickle.dump(xg, open('xgboost_model.pkl', 'wb'))