# Analysis: SVM for Development

Workflow: 9D

Goal: Create classification code that can run on the server and store its results in the database.

In [1]:
import os, sys, time
import pandas as PD
import numpy as NP

import sklearn.impute as IM
import sklearn.preprocessing as PP
import sklearn.decomposition as DC
import sklearn.linear_model as LM

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every bare expression in a cell, not only the final one;
# the multi-output cells below depend on this.
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# Work from the directory named by $PWD and put it first on the import path so
# the project packages imported below can be found.
# NOTE(review): assumes the notebook server was started from the project root
# and that PWD is set in its environment -- confirm.
project_dir = os.getenv('PWD')
os.chdir(project_dir)
sys.path.insert(0, project_dir)
# DJANGO_SETTINGS_MODULE must be a dotted module path, not a file name:
# 'settings.py' would be imported as module `py` inside a package `settings`.
# Left as a bare expression so the effective value is echoed by the notebook.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app_proj.settings')
import django
# Load settings and populate the app registry before any model is imported.
django.setup()

'app_proj.settings'

In [4]:
import app_proj.utility as UT
import movies.models.tables as MT
import recommend.models.tables as RT
import recommend.models.analysis as NL

## Server Version

In [5]:
# Ask the project's FeatureEngineer for three objects: the features of movies
# that have a target, the targets themselves, and the features of the movies
# still to be predicted (implementation lives in recommend.models.analysis).
feature_wtarget_df, target_df, feature_topred_df = NL.FeatureEngineer.GetTargetNFeatures()

# Sanity-check the sizes of what came back.
feature_wtarget_df.info()
target_df.shape
feature_topred_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 881 entries, 29 to 17612
Columns: 878 entries, Year to Western
dtypes: float64(5), int64(873)
memory usage: 5.9 MB


(881,)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16787 entries, 0 to 17667
Columns: 880 entries, Movie_ID to Western
dtypes: float64(5), int64(874), object(1)
memory usage: 112.8+ MB


In [6]:
# Train the classifier through the project wrapper, then echo the estimator so
# its hyper-parameters are visible in the notebook output.
svm = NL.SvmClassifier.TrainSvm(feature_wtarget_df, target_df)
svm

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.01, fit_intercept=True,
              l1_ratio=0.15, learning_rate='invscaling',
              loss='epsilon_insensitive', max_iter=200, n_iter_no_change=5,
              n_jobs=None, penalty='l2', power_t=0.5, random_state=None,
              shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
              warm_start=False)

In [7]:
# Predict for the movies without a target. The training features are passed
# in as well; presumably the wrapper uses them to fit its preprocessing --
# TODO confirm against NL.SvmClassifier.PredictUnwatched.
predict_ls = NL.SvmClassifier.PredictUnwatched(svm, feature_wtarget_df, feature_topred_df)
# Show how many predictions were produced and peek at the first three.
len(predict_ls)
predict_ls[:3]

16787

NameError: name 'predic_ls' is not defined

In [8]:
# Put it all together: run the whole pipeline through the single project entry
# point, then count the rows in the recommendations table as a check.

NL.SvmClassifier.RunRecommendations()
RT.UserRecommendations.objects.count()

16787

## Target & Features

In [None]:
# Load the full feature table from the CSV located under the project base
# directory. (The identifier previously contained a space, which is a syntax
# error and also left FEATURE_FILE undefined for the read_csv call.)
FEATURE_FILE = os.path.join(UT.BASE_DIR, NL.FEATURE_PATH)
feature_all_df = PD.read_csv(FEATURE_FILE)
feature_all_df.info()

In [None]:
# Fetch every stored user score as dict-like rows and tabulate them; the
# database primary key 'id' is not needed for modelling, so it is dropped.
target_ls = RT.UserScores.objects.values()
target_df = PD.DataFrame(target_ls)
target_df = target_df.drop('id', axis=1)
target_df.info()

In [None]:
# Distribution of the target: how many movies received each score.
target_df['Score'].value_counts()

In [None]:
# Training rows: movies whose Movie_ID appears among the user scores.
# isin() already yields a boolean mask, so it is used directly rather than
# being compared against True.
has_score = feature_all_df['Movie_ID'].isin(target_df['Movie_ID'])
feature_wtarget_df = feature_all_df.loc[has_score]
feature_wtarget_df.info()

In [None]:
# Rows to predict: movies whose Movie_ID does NOT appear among the user
# scores. The boolean mask is inverted with ~ rather than compared to False.
has_score = feature_all_df['Movie_ID'].isin(target_df['Movie_ID'])
feature_topred_df = feature_all_df.loc[~has_score]
feature_topred_df.info()

## Train Algorithm

In [None]:
# Feature matrix: every column except the identifier and the title.
X_np = NP.array(feature_wtarget_df.drop(columns=['Movie_ID', 'Title']))

# Target vector, looked up by Movie_ID so that y_np[i] is the score of the
# movie in row i of X_np. feature_wtarget_df keeps the feature file's row
# order while target_df keeps the query's order, and nothing guarantees the
# two agree; taking target_df['Score'] positionally could pair a movie with
# another movie's score. A Movie_ID that occurs more than once in target_df
# makes the lookup raise instead of silently misaligning.
score_by_movie = target_df.set_index('Movie_ID')['Score']
y_np = NP.array(feature_wtarget_df['Movie_ID'].map(score_by_movie))

X_np.shape
y_np.shape

In [None]:
# Learn per-column means on the training features, then fill every NaN with
# the mean of its column. The fitted imputer is kept for the prediction set.
imputer = IM.SimpleImputer(strategy='mean', missing_values=NP.nan).fit(X_np)
X_impute = imputer.transform(X_np)
X_impute.shape

In [None]:
# Standardise the imputed training features. The scaler is fitted here and
# kept so the prediction set can be transformed with the same statistics.
scaler = PP.StandardScaler().fit(X_impute)
X_scale = scaler.transform(X_impute)
X_scale.shape

In [None]:
# Hyper-parameters that are actually handed to the classifier.
LEARNING, ETA0, MAX_ITER = 'invscaling', 0.01, 200
LOSS = 'epsilon_insensitive'
# Placeholders: defined here but NOT passed to SGDClassifier below, so the
# estimator uses its own defaults for these settings.
POWERT = PENALTY = CLASS = None

sgd_options = {
    'learning_rate': LEARNING,
    'eta0': ETA0,
    'max_iter': MAX_ITER,
    'loss': LOSS,
}
svm = LM.SGDClassifier(**sgd_options)
# Fit on the scaled training data; the fitted estimator is echoed by the cell.
svm.fit(X_scale, y_np)

## Predict Unwatched Movies

In [None]:
# Feature matrix for the movies to predict: same column selection as for
# training (identifier and title removed), converted to a NumPy array.
X_pred = feature_topred_df.drop(columns=['Movie_ID', 'Title'])
X_pred = NP.array(X_pred)
X_pred.shape

In [None]:
# Apply the imputer and scaler that were fitted on the training features.
# Only transform() is called here, so nothing is re-fitted on the prediction set.
X_pred_impute = imputer.transform(X_pred)
X_pred_scale = scaler.transform(X_pred_impute)
X_pred_scale.shape

In [None]:
# One prediction per row of the scaled prediction matrix.
predict_np = svm.predict(X_pred_scale)
predict_np.shape

In [None]:
# Unweighted mean of the predictions, as a quick sanity check.
NP.average(predict_np)

In [None]:
# Give the rows to predict a fresh 0..n-1 index so that a row's label equals
# its position in the output of predict().
# reset_index() keeps the rows in their existing order; it only replaces the
# labels. drop=True discards the old labels instead of inserting them into
# the frame as an extra 'index' column.

feature_tpreset_df = feature_topred_df.reset_index(drop=True)
feature_tpreset_df.info()

In [None]:
# Build one recommendation record per movie to predict.
# Movie ids and predictions are paired by position: predict_np[i] was computed
# from row i of the prediction features. Iterating the Movie_ID column directly
# avoids materialising a full-width row object for every movie just to read one
# field, and does not depend on the index labels matching positions.
movie_ids = feature_tpreset_df['Movie_ID']
if len(movie_ids) != len(predict_np):
    # zip() would silently truncate; a size mismatch means the pairing is wrong.
    raise ValueError(
        f"{len(movie_ids)} movies to predict but {len(predict_np)} predictions"
    )

predict_ls = []
for movie_id, recom_level in zip(movie_ids, predict_np):
    # get movie by Movie_ID, since features aren't part of db yet, though Movie_ID is unique
    movie_md = MT.MasterMovie.objects.get(Movie_ID=int(movie_id))
    predict_ls.append({
        'Movie_FK': movie_md,
        'User': 'main',
        'RecomLevel': recom_level,
    })

predict_ls[:2]

In [None]:
# Tabulate the recommendation records for inspection only; the database
# insert further down works from predict_ls, not from this frame.
predict_df = PD.DataFrame(predict_ls)
predict_df.info()

In [None]:
# Distribution of the predicted recommendation levels.
predict_df['RecomLevel'].value_counts()

In [None]:
# Switch from 'all' to 'last' so that only the final node of a cell is echoed.
InteractiveShell.ast_node_interactivity = 'last'

In [None]:
# insert to db
from django.db import transaction

# Build the unsaved model instances first, so a bad record fails before the
# table is touched.
data_obj_ls = [RT.UserRecommendations(**r) for r in predict_ls]

# Replace the table contents as one unit: if the insert fails, the delete is
# rolled back and the previous recommendations are kept instead of leaving
# the table empty.
with transaction.atomic():
    RT.UserRecommendations.objects.all().delete()
    RT.UserRecommendations.objects.bulk_create(data_obj_ls)

# Row count after the reload, echoed as the cell's result.
RT.UserRecommendations.objects.count()