# Analysis: Extreme Gradient Boost

Workflow: 9 

Goal: Load features and target, and run classification.

Result:

In [1]:
import os, sys, time
import pandas as PD
import numpy as NP

import sklearn.impute as IM
import sklearn.preprocessing as PP
import sklearn.decomposition as DC
import imblearn.pipeline as PL

import xgboost as XG
import sklearn.model_selection as MS
import plotly.graph_objects as GO

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
os.chdir(os.getenv('PWD'))
sys.path.insert(0, os.getenv('PWD'))
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings.py')
import django
django.setup()

'app_proj.settings'

In [4]:
! pwd

/Users/Phil/Documents/Websites/Movies_Proj/Filmophile/backend


In [5]:
import app_proj.settings as ST
import movies.models.models as MD
import movies.models.analysis as NL

## Target & Features

In [6]:
FEATURE_FILE = os.path.join(ST.BASE_DIR, NL.FEATURE_PATH)
feature_full_df = PD.read_csv(FEATURE_FILE)
feature_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Columns: 625 entries, Movie_ID to Western
dtypes: float64(5), int64(619), object(1)
memory usage: 84.2+ MB


In [7]:
target_ls = MD.UserVotes.objects.values()
target_full_df = PD.DataFrame(target_ls).drop(columns=['id'])
target_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 3 columns):
Movie_ID    600 non-null int64
User        600 non-null object
Vote        600 non-null int64
dtypes: int64(2), object(1)
memory usage: 14.2+ KB


In [8]:
full_df = PD.merge(target_full_df, feature_full_df, how='left', left_on='Movie_ID', right_on='Movie_ID')
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 0 to 599
Columns: 627 entries, Movie_ID to Western
dtypes: float64(5), int64(620), object(2)
memory usage: 2.9+ MB


In [9]:
# keep only features for movies that have been voted on

feature_df = full_df.drop(columns=['Movie_ID', 'Title', 'User', 'Vote'])
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 0 to 599
Columns: 623 entries, Year to Western
dtypes: float64(5), int64(618)
memory usage: 2.9 MB


In [10]:
target_df = target_full_df.drop(columns=['Movie_ID', 'User'])
target_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 1 columns):
Vote    600 non-null int64
dtypes: int64(1)
memory usage: 4.8 KB


## Run XGBoost

In [11]:
X_np = NP.array(feature_df)
y_np = NP.array(target_df['Vote'])

X_np.shape
y_np.shape

(600, 623)

(600,)

In [12]:
# run xgboost with default arguments as the baseline
# first get split-test-train 

y_train, y_test, X_train, X_test = MS.train_test_split(y_np, X_np, stratify=target_df['Vote'], test_size=0.2)
X_train.shape
y_train.shape

(480, 623)

(480,)

In [13]:
# impute values for PCA ?

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
imputer.fit(X_train)
X_train_impute = imputer.transform(X_train)
X_train_impute.shape

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

(480, 623)

In [14]:
# scale features

scaler = PP.StandardScaler()
scaler.fit(X_train_impute)
X_train_scale = scaler.transform(X_train_impute)
X_train_scale.shape

StandardScaler(copy=True, with_mean=True, with_std=True)

(480, 623)

In [15]:
#x_pca = DC.PCA(n_components=)

In [16]:
xgb = XG.XGBClassifier(booster='gbtree', objective='multi:softmax', num_class=3);
t0 = time.time()

xgb.fit(X_train_scale, y_train);
y_predict = xgb.predict(X_train_scale);
y_predict[:5]

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, num_class=3, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

array([1, 2, 1, 1, 1])

time: 0.07 mins


In [17]:
# get the training score

train_score = xgb.score(X_train_scale, y_train) *100
print(f'train score: {train_score:.1f}')

train score: 79.6


In [18]:
# get the test score

X_test_impute = imputer.transform(X_test)
X_test_scale = scaler.transform(X_test_impute)
test_score = xgb.score(X_test_scale, y_test) *100
print(f'test score: {test_score:.1f}')

test score: 56.7


In [20]:
# run each step of the grid search

InteractiveShell.ast_node_interactivity = 'last'

In [21]:
# booster

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
#pca = DC.PCA(n_components=)
xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3)
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', xgb)])
param_dx = {'estimator__booster': ['gbtree', 'gblinear'],}

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5)
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')


The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.



time: 0.49 mins


In [25]:
step1_dx = {
    'param_name': 'xgb-booster',
    'param_values': param_dx['estimator__booster'],
    'test_scores': list(searcher.cv_results_['mean_test_score']),
    'best_value': searcher.best_params_['estimator__booster'],
}
step1_dx

{'param_name': 'xgb-booster',
 'param_values': ['gbtree', 'gblinear'],
 'test_scores': [0.6133333333333333, 0.6283333333333333],
 'best_value': 'gblinear'}

In [23]:
fig = GO.Figure()
colors = ['orange', 'red', 'darkcyan', 'green']

for idx, alg in enumerate(algorithms):
    fig.add_trace(
        GO.Scatter(x = ALPHA, y = alg['test_scores'],
            name=alg['algorithm'], marker={'color': colors[idx]}, mode='lines+markers'))

fig.update_layout(
    title="Logistic Regression CV-Test Scores",
    xaxis_title="lambda",
    yaxis_title="Accuracy",
    width=600,
    height=400,
    margin=GO.layout.Margin(t=50, r=10, b=50, l=70, pad=0),
    paper_bgcolor="LightSteelBlue",
)
fig.update_xaxes(tickvals=ALPHA, type="log")
fig.update_yaxes(range=[0.52, 0.66])

NameError: name 'ALPHA' is not defined