# Analysis: XGBoost for Development

Workflow: 9C

Goal: Create classification code that can run on server, and store results to database.

In [1]:
import os, sys, time
import pandas as PD
import numpy as NP

import sklearn.impute as IM
import sklearn.preprocessing as PP
import sklearn.decomposition as DC
import xgboost as XG
import sklearn.utils as SU

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
os.chdir(os.getenv('PWD'))
sys.path.insert(0, os.getenv('PWD'))
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings.py')
import django
django.setup()

'app_proj.settings'

In [4]:
import app_proj.settings as ST
import movies.models.models as MD
import movies.models.analysis as NL

## Target & Features

In [5]:
FEATURE_FILE = os.path.join(ST.BASE_DIR, NL.FEATURE_PATH)
feature_all_df = PD.read_csv(FEATURE_FILE)
feature_all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Columns: 229 entries, Movie_ID to Western
dtypes: float64(5), int64(223), object(1)
memory usage: 30.9+ MB


In [6]:
target_ls = MD.UserVotes.objects.values()
target_df = PD.DataFrame(target_ls).drop(columns=['id'])
target_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 3 columns):
Movie_ID    881 non-null int64
User        881 non-null object
Vote        881 non-null int64
dtypes: int64(2), object(1)
memory usage: 20.8+ KB


In [7]:
target_df.head()

Unnamed: 0,Movie_ID,User,Vote
0,37062,main,1
1,207686,main,1
2,430035,main,1
3,403789,main,1
4,334748,main,1


In [8]:
target_df['Vote'].value_counts()

1    574
2    231
3     76
Name: Vote, dtype: int64

In [9]:
feature_wtarget_df = feature_all_df.loc[feature_all_df['Movie_ID'].isin(target_df['Movie_ID'])==True]
feature_wtarget_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 881 entries, 27 to 17612
Columns: 229 entries, Movie_ID to Western
dtypes: float64(5), int64(223), object(1)
memory usage: 1.5+ MB


In [10]:
feature_topred_df = feature_all_df.loc[feature_all_df['Movie_ID'].isin(target_df['Movie_ID'])==False]
feature_topred_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16787 entries, 0 to 17667
Columns: 229 entries, Movie_ID to Western
dtypes: float64(5), int64(223), object(1)
memory usage: 29.5+ MB


## Train Algorithm

In [11]:
X_np = NP.array(feature_wtarget_df.drop(columns=['Movie_ID', 'Title']))
y_np = NP.array(target_df['Vote'])

X_np.shape
y_np.shape

(881, 227)

(881,)

In [12]:
imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
X_impute = imputer.fit_transform(X_np)
X_impute.shape

(881, 227)

In [13]:
scaler = PP.StandardScaler()
X_scale = scaler.fit_transform(X_impute)
X_scale.shape

(881, 227)

In [14]:
MAX_ITER = 2000
LAMBDA = 1e3

xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3, 
                       booster='gblinear', n_estimators=50, reg_alpha=0.05);
xgb.fit(X_scale, y_np)

XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=50, n_jobs=1,
              nthread=None, num_class=3, objective='multi:softprob',
              random_state=0, reg_alpha=0.05, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

## Predict Unwatched Movies

In [15]:
X_pred = NP.array(feature_topred_df.drop(columns=['Movie_ID', 'Title']))
X_pred.shape

(16787, 227)

In [16]:
X_pred_impute = imputer.transform(X_pred)
X_pred_scale = scaler.transform(X_pred_impute)
X_pred_scale.shape

(16787, 227)

In [17]:
predict_np = xgb.predict(X_pred_scale)
predict_np.shape

numpy.ndarray

(16787,)

In [24]:
NP.average(predict_np)

1.0

In [18]:
# reset index of target-to-predict so iterrows matches output of predict
# pray that reset_index doesn't alter the order ...

feature_tpreset_df = feature_topred_df.reset_index()
feature_tpreset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16787 entries, 0 to 16786
Columns: 230 entries, index to Western
dtypes: float64(5), int64(224), object(1)
memory usage: 29.5+ MB


In [19]:
predict_ls = []
for idx, row in feature_tpreset_df.iterrows():
    movie_id = int(row['Movie_ID'])
    # get movie by Movie_ID, since features aren't part of db yet, though Movie_ID is unique
    movie_md = MD.MasterMovie.objects.get(Movie_ID=movie_id)
    new_dx = {
        'Movie_FK': movie_md,
        'User': 'main',
        'RecomLevel': predict_np[idx],
    }
    predict_ls.append(new_dx)

predict_ls[:2]

[{'Movie_FK': <MasterMovie: MasterMovie object (152035)>,
  'User': 'main',
  'RecomLevel': 1},
 {'Movie_FK': <MasterMovie: MasterMovie object (152036)>,
  'User': 'main',
  'RecomLevel': 1}]

In [20]:
predict_df = PD.DataFrame(predict_ls)
predict_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16787 entries, 0 to 16786
Data columns (total 3 columns):
Movie_FK      16787 non-null object
User          16787 non-null object
RecomLevel    16787 non-null int64
dtypes: int64(1), object(2)
memory usage: 393.6+ KB


In [21]:
predict_df['RecomLevel'].value_counts()

# this is a BIG problem

1    16787
Name: RecomLevel, dtype: int64

In [22]:
InteractiveShell.ast_node_interactivity = 'last'

In [23]:
# insert to db

MD.UserRecommendations.objects.all().delete()
data_obj_ls = [MD.UserRecommendations(**r) for r in predict_ls];
MD.UserRecommendations.objects.bulk_create(data_obj_ls);
MD.UserRecommendations.objects.count()

16787