# Analysis: Extreme Gradient Boost

Workflow: 9 

Goal: Load features and target, and run classification.

Result:

In [1]:
import os, sys, time
import pandas as PD
import numpy as NP

import sklearn.impute as IM
import sklearn.preprocessing as PP
import sklearn.decomposition as DC
import imblearn.pipeline as PL

import xgboost as XG
import sklearn.utils as SU
import sklearn.model_selection as MS
import plotly.graph_objects as GO

In [2]:
import sklearn
print(f'scikit-learn version: {sklearn.__version__}')

scikit-learn version: 0.21.3


In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [4]:
os.chdir(os.getenv('PWD'))
sys.path.insert(0, os.getenv('PWD'))
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings.py')
import django
django.setup()

'app_proj.settings'

In [5]:
! pwd

/Users/Phil/Documents/Websites/Movies_Proj/Filmophile/backend


In [6]:
import app_proj.settings as ST
import movies.models.models as MD
import movies.models.analysis as NL

## Target & Features

In [7]:
FEATURE_FILE = os.path.join(ST.BASE_DIR, NL.FEATURE_PATH)
feature_full_df = PD.read_csv(FEATURE_FILE)
feature_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Columns: 594 entries, Movie_ID to Western
dtypes: float64(5), int64(588), object(1)
memory usage: 80.1+ MB


In [8]:
target_ls = MD.UserVotes.objects.values()
target_full_df = PD.DataFrame(target_ls).drop(columns=['id'])
target_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 591 entries, 0 to 590
Data columns (total 3 columns):
Movie_ID    591 non-null int64
User        591 non-null object
Vote        591 non-null int64
dtypes: int64(2), object(1)
memory usage: 14.0+ KB


In [9]:
full_df = PD.merge(target_full_df, feature_full_df, how='left', left_on='Movie_ID', right_on='Movie_ID')
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 591 entries, 0 to 590
Columns: 596 entries, Movie_ID to Western
dtypes: float64(5), int64(589), object(2)
memory usage: 2.7+ MB


In [10]:
# keep only features for movies that have been voted on

feature_df = full_df.drop(columns=['Movie_ID', 'Title', 'User', 'Vote'])
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 591 entries, 0 to 590
Columns: 592 entries, Year to Western
dtypes: float64(5), int64(587)
memory usage: 2.7 MB


In [11]:
target_df = target_full_df.drop(columns=['Movie_ID', 'User'])
target_df.info()
target_df['Vote'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 591 entries, 0 to 590
Data columns (total 1 columns):
Vote    591 non-null int64
dtypes: int64(1)
memory usage: 4.7 KB


1    382
2    154
3     55
Name: Vote, dtype: int64

In [24]:
382 / 591 * 100

64.63620981387479

## Run Baseline XGBoost

In [12]:
X_np = NP.array(feature_df)
y_np = NP.array(target_df['Vote'])

X_np.shape
y_np.shape

(591, 592)

(591,)

In [13]:
# run xgboost with default arguments as the baseline
# first get split-test-train 

y_train, y_test, X_train, X_test = MS.train_test_split(y_np, X_np, stratify=target_df['Vote'], test_size=0.2)
X_train.shape
y_train.shape

(472, 592)

(472,)

In [14]:
# impute values for PCA ?

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
imputer.fit(X_train)
X_train_impute = imputer.transform(X_train)
X_train_impute.shape

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

(472, 592)

In [15]:
# scale features

scaler = PP.StandardScaler()
scaler.fit(X_train_impute)
X_train_scale = scaler.transform(X_train_impute)
X_train_scale.shape

StandardScaler(copy=True, with_mean=True, with_std=True)

(472, 592)

In [16]:
#x_pca = DC.PCA(n_components=)

In [17]:
xgb = XG.XGBClassifier(booster='gbtree', objective='multi:softmax', num_class=3);
t0 = time.time()

xgb.fit(X_train_scale, y_train);
y_predict = xgb.predict(X_train_scale);
y_predict[:5]

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, num_class=3, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

array([1, 1, 1, 1, 1])

time: 0.07 mins


In [18]:
# get the training score

train_score = xgb.score(X_train_scale, y_train) *100
print(f'train score: {train_score:.1f}')

train score: 78.8


In [19]:
# get the test score

X_test_impute = imputer.transform(X_test)
X_test_scale = scaler.transform(X_test_impute)
test_score = xgb.score(X_test_scale, y_test) *100
print(f'test score: {test_score:.1f}')

test score: 63.9


## Run Grid Search XGBoost

In [20]:
# run each step of the grid search

InteractiveShell.ast_node_interactivity = 'last'

In [25]:
# 0] learning rate

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
#pca = DC.PCA(n_components=)
xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3, booster='gblinear')
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', xgb)])
param_dx = {'estimator__learning_rate': [0.001, 0.01, 0.05, 0.1],}

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5)
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

time: 0.55 mins


In [26]:
param_name = 'learning_rate'
step0_dx = {
    'param_name': param_name,
    'param_values': param_dx[f'estimator__{param_name}'],
    'test_scores': [round(s*100,2) for s in list(searcher.cv_results_['mean_test_score'])],
    'best_value': searcher.best_params_[f'estimator__{param_name}'],
}
step0_dx

{'param_name': 'learning_rate',
 'param_values': [0.001, 0.01, 0.05, 0.1],
 'test_scores': [62.77, 63.28, 63.62, 63.62],
 'best_value': 0.05}

In [22]:
st op = 

SyntaxError: invalid syntax (<ipython-input-22-f3d9a8454a59>, line 1)

In [None]:
# 1] booster

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
#pca = DC.PCA(n_components=)
xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3)
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', xgb)])
param_dx = {'estimator__booster': ['gbtree', 'gblinear', 'dart'],}

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5)
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

In [None]:
param_name = 'booster'
step1_dx = {
    'param_name': param_name,
    'param_values': param_dx[f'estimator__{param_name}'],
    'test_scores': [round(s*100,2) for s in list(searcher.cv_results_['mean_test_score'])],
    'best_value': searcher.best_params_[f'estimator__{param_name}'],
}
step1_dx

In [None]:
# get weight class for step 2

weights_raw = list(SU.class_weight.compute_class_weight('balanced', NP.unique(y_np), y_np))
weights_dx = {1: weights_raw[0], 2: weights_raw[1], 3: weights_raw[2]}
weights_dx

In [None]:
# booster with weighted classes

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
#pca = DC.PCA(n_components=)
xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3, sample_weight2=weights_dx)
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', xgb)])
param_dx = {'estimator__booster': ['gbtree', 'gblinear'],}

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5, )
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

In [None]:
# param_name = 'booster-weight'
# step2_dx = {
#     'param_name': param_name,
#     'param_values': param_dx[f'estimator__{param_name}'],
#     'test_scores': list(searcher.cv_results_['mean_test_score']),
#     'best_value': searcher.best_params_[f'estimator__{param_name}'],
# }
# step2_dx

In [None]:
# 3] number of estimators

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
#pca = DC.PCA(n_components=)
xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3, 
                       booster=step1_dx['best_value'])
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', xgb)])
param_dx = {'estimator__n_estimators': [25, 30, 35, 40, 45, 50, 55, 100],}

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5, )
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

In [None]:
param_name = 'n_estimators'
step3_dx = {
    'param_name': param_name,
    'param_values': param_dx[f'estimator__{param_name}'],
    'test_scores': [round(s*100,2) for s in list(searcher.cv_results_['mean_test_score'])],
    'best_value': searcher.best_params_[f'estimator__{param_name}'],
}
step3_dx

In [None]:
# 4] maximum depth

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
#pca = DC.PCA(n_components=)
xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3, 
                       booster=step1_dx['best_value'], n_estimators=step3_dx['best_value'])
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', xgb)])
param_dx = {'estimator__max_depth': [1, 2, 3, 4, 5],}

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5, )
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

In [None]:
param_name = 'max_depth'
step4_dx = {
    'param_name': param_name,
    'param_values': param_dx[f'estimator__{param_name}'],
    'test_scores': [round(s*100,2) for s in list(searcher.cv_results_['mean_test_score'])],
    'best_value': searcher.best_params_[f'estimator__{param_name}'],
}
step4_dx

In [None]:
# 5] minimum child weight

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
#pca = DC.PCA(n_components=)
xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3, 
                       booster=step1_dx['best_value'], n_estimators=step3_dx['best_value'],
                       max_depth=step4_dx['best_value'])
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', xgb)])
param_dx = {'estimator__min_child_weight': [1, 2, 3, 4],}

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5, )
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

In [None]:
param_name = 'min_child_weight'
step5_dx = {
    'param_name': param_name,
    'param_values': param_dx[f'estimator__{param_name}'],
    'test_scores': [round(s*100,2) for s in list(searcher.cv_results_['mean_test_score'])],
    'best_value': searcher.best_params_[f'estimator__{param_name}'],
}
step5_dx

In [None]:
# 6] gamma

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
#pca = DC.PCA(n_components=)
xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3, 
                       booster=step1_dx['best_value'], n_estimators=step3_dx['best_value'],
                       max_depth=step4_dx['best_value'], min_child_weight=step5_dx['best_value'],)
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', xgb)])
param_dx = {'estimator__gamma': [0, 0.05, 0.1, 0.2, 0.3],}

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5, )
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

In [None]:
param_name = 'gamma'
step6_dx = {
    'param_name': param_name,
    'param_values': param_dx[f'estimator__{param_name}'],
    'test_scores': [round(s*100,2) for s in list(searcher.cv_results_['mean_test_score'])],
    'best_value': searcher.best_params_[f'estimator__{param_name}'],
}
step6_dx

In [None]:
# 7] L1 regularization

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
#pca = DC.PCA(n_components=)
xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3, 
                       booster=step1_dx['best_value'], 
                       n_estimators=step3_dx['best_value'], max_depth=step4_dx['best_value'], 
                       min_child_weight=step5_dx['best_value'], gamma=step6_dx['best_value'],)
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', xgb)])
param_dx = {'estimator__reg_alpha': [1e-3, 5e-2, 1e-2, 5e-1, 1e-1, 0, 1e0],}

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5, )
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

In [None]:
param_name = 'reg_alpha'
step7_dx = {
    'param_name': param_name,
    'param_values': param_dx[f'estimator__{param_name}'],
    'test_scores': [round(s*100,2) for s in list(searcher.cv_results_['mean_test_score'])],
    'best_value': searcher.best_params_[f'estimator__{param_name}'],
}
step7_dx

In [None]:
# 8] L2 regularization

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
#pca = DC.PCA(n_components=)
xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3, 
                       booster=step1_dx['best_value'], 
                       n_estimators=step3_dx['best_value'], max_depth=step4_dx['best_value'], 
                       min_child_weight=step5_dx['best_value'], gamma=step6_dx['best_value'],
                       reg_alpha=step7_dx['best_value'],)
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', xgb)])
param_dx = {'estimator__reg_lambda': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0],}

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5, )
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

In [None]:
param_name = 'reg_lambda'
step8_dx = {
    'param_name': param_name,
    'param_values': param_dx[f'estimator__{param_name}'],
    'test_scores': [round(s*100,2) for s in list(searcher.cv_results_['mean_test_score'])],
    'best_value': searcher.best_params_[f'estimator__{param_name}'],
}
step8_dx

In [None]:
# all the best

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
#pca = DC.PCA(n_components=)
xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3, 
                        
                       n_estimators=step3_dx['best_value'], max_depth=step4_dx['best_value'], 
                       min_child_weight=step5_dx['best_value'], gamma=step6_dx['best_value'],
                       reg_alpha=step7_dx['best_value'], reg_lambda=step8_dx['best_value'],)
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', xgb)])
param_dx = {'estimator__booster': ['gbtree', 'gblinear', 'dart'],}

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5, )
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

In [None]:
param_name = 'booster'
stepF_dx = {
    'param_name': param_name,
    'param_values': param_dx[f'estimator__{param_name}'],
    'test_scores': [round(s*100,2) for s in list(searcher.cv_results_['mean_test_score'])],
    'best_value': searcher.best_params_[f'estimator__{param_name}'],
}
stepF_dx

In [None]:
fig = GO.Figure()
colors = ['orange', 'red', 'darkcyan', 'green']

for idx, alg in enumerate(algorithms):
    fig.add_trace(
        GO.Scatter(x = ALPHA, y = alg['test_scores'],
            name=alg['algorithm'], marker={'color': colors[idx]}, mode='lines+markers'))

fig.update_layout(
    title="Logistic Regression CV-Test Scores",
    xaxis_title="lambda",
    yaxis_title="Accuracy",
    width=600,
    height=400,
    margin=GO.layout.Margin(t=50, r=10, b=50, l=70, pad=0),
    paper_bgcolor="LightSteelBlue",
)
fig.update_xaxes(tickvals=ALPHA, type="log")
fig.update_yaxes(range=[0.52, 0.66])