# Analysis: SVM for Development

Workflow: 9D

Goal: Create classification code that can run on server, and store results to database.

In [1]:
import os, sys, time
import pandas as PD
import numpy as NP

import sklearn.impute as IM
import sklearn.preprocessing as PP
import sklearn.decomposition as DC
import sklearn.linear_model as LM

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
os.chdir(os.getenv('PWD'))
sys.path.insert(0, os.getenv('PWD'))
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings.py')
import django
django.setup()

'app_proj.settings'

In [4]:
import app_proj.utility as UT
import movies.models.tables as MT
import recommend.models.tables as RT
import recommend.models.analysis as NL

## Target & Features

In [5]:
FEATURE_FILE = os.path.join(UT.BASE_DIR, NL.FEATURE_PATH)
feature_all_df = PD.read_csv(FEATURE_FILE)
feature_all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Columns: 879 entries, Movie_ID to Western
dtypes: float64(5), int64(873), object(1)
memory usage: 118.5+ MB


In [6]:
target_ls = RT.UserVotes.objects.values()
target_df = PD.DataFrame(target_ls).drop(columns=['id'])
target_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 3 columns):
Movie_ID    881 non-null int64
User        881 non-null object
Vote        881 non-null int64
dtypes: int64(2), object(1)
memory usage: 20.8+ KB


In [7]:
target_df['Vote'].value_counts()

1    574
2    231
3     76
Name: Vote, dtype: int64

In [8]:
feature_wtarget_df = feature_all_df.loc[feature_all_df['Movie_ID'].isin(target_df['Movie_ID'])==True]
feature_wtarget_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 881 entries, 27 to 17612
Columns: 879 entries, Movie_ID to Western
dtypes: float64(5), int64(873), object(1)
memory usage: 5.9+ MB


In [9]:
feature_topred_df = feature_all_df.loc[feature_all_df['Movie_ID'].isin(target_df['Movie_ID'])==False]
feature_topred_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16787 entries, 0 to 17667
Columns: 879 entries, Movie_ID to Western
dtypes: float64(5), int64(873), object(1)
memory usage: 112.7+ MB


## Train Algorithm

In [10]:
X_np = NP.array(feature_wtarget_df.drop(columns=['Movie_ID', 'Title']))
y_np = NP.array(target_df['Vote'])

X_np.shape
y_np.shape

(881, 877)

(881,)

In [11]:
imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
X_impute = imputer.fit_transform(X_np)
X_impute.shape

(881, 877)

In [12]:
scaler = PP.StandardScaler()
X_scale = scaler.fit_transform(X_impute)
X_scale.shape

(881, 877)

In [13]:
svm = LM.SGDClassifier(class_weight='balanced');
svm.fit(X_scale, y_np)

SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

## Predict Unwatched Movies

In [14]:
X_pred = NP.array(feature_topred_df.drop(columns=['Movie_ID', 'Title']))
X_pred.shape

(16787, 877)

In [15]:
X_pred_impute = imputer.transform(X_pred)
X_pred_scale = scaler.transform(X_pred_impute)
X_pred_scale.shape

(16787, 877)

In [16]:
predict_np = svm.predict(X_pred_scale)
predict_np.shape

(16787,)

In [17]:
NP.average(predict_np)

1.5110502174301543

In [18]:
# reset index of target-to-predict so iterrows matches output of predict
# pray that reset_index doesn't alter the order ...

feature_tpreset_df = feature_topred_df.reset_index()
feature_tpreset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16787 entries, 0 to 16786
Columns: 880 entries, index to Western
dtypes: float64(5), int64(874), object(1)
memory usage: 112.7+ MB


In [20]:
predict_ls = []
for idx, row in feature_tpreset_df.iterrows():
    movie_id = int(row['Movie_ID'])
    # get movie by Movie_ID, since features aren't part of db yet, though Movie_ID is unique
    movie_md = MT.MasterMovie.objects.get(Movie_ID=movie_id)
    new_dx = {
        'Movie_FK': movie_md,
        'User': 'main',
        'RecomLevel': predict_np[idx],
    }
    predict_ls.append(new_dx)

predict_ls[:2]

[{'Movie_FK': <MasterMovie: MasterMovie object (152035)>,
  'User': 'main',
  'RecomLevel': 1},
 {'Movie_FK': <MasterMovie: MasterMovie object (152036)>,
  'User': 'main',
  'RecomLevel': 1}]

In [21]:
predict_df = PD.DataFrame(predict_ls)
predict_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16787 entries, 0 to 16786
Data columns (total 3 columns):
Movie_FK      16787 non-null object
User          16787 non-null object
RecomLevel    16787 non-null int64
dtypes: int64(1), object(2)
memory usage: 393.6+ KB


In [22]:
predict_df['RecomLevel'].value_counts()

1    10566
2     3863
3     2358
Name: RecomLevel, dtype: int64

In [23]:
InteractiveShell.ast_node_interactivity = 'last'

In [25]:
# insert to db

RT.UserRecommendations.objects.all().delete()
data_obj_ls = [RT.UserRecommendations(**r) for r in predict_ls];
RT.UserRecommendations.objects.bulk_create(data_obj_ls);
RT.UserRecommendations.objects.count()

16787