# Analysis: Development

Workflow: 9B

Goal: Create classification code that can run on server, and store results to database.

In [1]:
import os, sys, time
import pandas as PD
import numpy as NP

import sklearn.impute as IM
import sklearn.preprocessing as PP
import sklearn.decomposition as DC
import sklearn.linear_model as LM

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
os.chdir(os.getenv('PWD'))
sys.path.insert(0, os.getenv('PWD'))
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings.py')
import django
django.setup()

'app_proj.settings'

In [4]:
import app_proj.settings as ST
import movies.models.models as MD
import movies.models.analysis as NL

## Target & Features

In [5]:
FEATURE_FILE = os.path.join(ST.BASE_DIR, NL.FEATURE_PATH)
feature_all_df = PD.read_csv(FEATURE_FILE)
feature_all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Columns: 605 entries, Movie_ID to Western
dtypes: float64(5), int64(599), object(1)
memory usage: 81.6+ MB


In [6]:
target_ls = MD.UserVotes.objects.values()
target_df = PD.DataFrame(target_ls).drop(columns=['id'])
target_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 591 entries, 0 to 590
Data columns (total 3 columns):
Movie_ID    591 non-null int64
User        591 non-null object
Vote        591 non-null int64
dtypes: int64(2), object(1)
memory usage: 14.0+ KB


In [7]:
feature_wtarget_df = feature_all_df.loc[feature_all_df['Movie_ID'].isin(target_df['Movie_ID'])==True]
feature_wtarget_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 591 entries, 27 to 17513
Columns: 605 entries, Movie_ID to Western
dtypes: float64(5), int64(599), object(1)
memory usage: 2.7+ MB


In [8]:
feature_topred_df = feature_all_df.loc[feature_all_df['Movie_ID'].isin(target_df['Movie_ID'])==False]
feature_topred_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17077 entries, 0 to 17667
Columns: 605 entries, Movie_ID to Western
dtypes: float64(5), int64(599), object(1)
memory usage: 79.0+ MB


## Train Algorithm

In [9]:
X_np = NP.array(feature_wtarget_df.drop(columns=['Movie_ID', 'Title']))
y_np = NP.array(target_df['Vote'])

X_np.shape
y_np.shape

(591, 603)

(591,)

In [10]:
imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
X_impute = imputer.fit_transform(X_np)
X_impute.shape

(591, 603)

In [11]:
scaler = PP.StandardScaler()
X_scale = scaler.fit_transform(X_impute)
X_scale.shape

(591, 603)

In [12]:
MAX_ITER = 2000
LAMBDA = 1e3

logreg = LM.LogisticRegression(solver='liblinear', penalty='l1', C=(1/LAMBDA), multi_class='auto', max_iter=MAX_ITER)
logreg.fit(X_scale, y_np)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

## Predict Unwatched Movies

In [13]:
X_pred = NP.array(feature_topred_df.drop(columns=['Movie_ID', 'Title']))
X_pred.shape

(17077, 603)

In [14]:
X_pred_impute = imputer.transform(X_pred)
X_pred_scale = scaler.transform(X_pred_impute)
X_pred_scale.shape

(17077, 603)

In [15]:
predict_np = logreg.predict(X_pred_scale)
type(predict_np)
predict_np.shape

numpy.ndarray

(17077,)

In [16]:
# reset index of target-to-predict so iterrows matches output of predict
# pray that reset_index doesn't alter the order ...

feature_tpreset_df = feature_topred_df.reset_index()
feature_tpreset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17077 entries, 0 to 17076
Columns: 606 entries, index to Western
dtypes: float64(5), int64(600), object(1)
memory usage: 79.0+ MB


In [17]:
predict_ls = []
for idx, row in feature_tpreset_df.iterrows():
    movie_id = int(row['Movie_ID'])
    # get movie by Movie_ID, since features aren't part of db yet, though Movie_ID is unique
    movie_md = MD.MasterMovie.objects.get(Movie_ID=movie_id)
    new_dx = {
        'Movie_FK': movie_md,
        'User': 'main',
        'RecomLevel': predict_np[idx],
    }
    predict_ls.append(new_dx)

predict_ls[:2]

[{'Movie_FK': <MasterMovie: MasterMovie object (152035)>,
  'User': 'main',
  'RecomLevel': 1},
 {'Movie_FK': <MasterMovie: MasterMovie object (152036)>,
  'User': 'main',
  'RecomLevel': 1}]

In [18]:
predict_df = PD.DataFrame(predict_ls)
predict_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17077 entries, 0 to 17076
Data columns (total 3 columns):
Movie_FK      17077 non-null object
User          17077 non-null object
RecomLevel    17077 non-null int64
dtypes: int64(1), object(2)
memory usage: 400.4+ KB


In [19]:
predict_df['RecomLevel'].value_counts()

1    17077
Name: RecomLevel, dtype: int64

In [20]:
InteractiveShell.ast_node_interactivity = 'last'

In [21]:
# insert to db

MD.UserRecommendations.objects.all().delete()
data_obj_ls = [MD.UserRecommendations(**r) for r in predict_ls];
MD.UserRecommendations.objects.bulk_create(data_obj_ls);
MD.UserRecommendations.objects.count()

17077