# Algorithm Tuning: SVM

Workflow: 2 

Goal: Load features and target, and run classification.

Result: For the tuned algorithm, the accuracy is ~6% higher than for the default config.

In [1]:
import os, sys, time
import pandas as PD
import numpy as NP

import sklearn.impute as IM
import sklearn.preprocessing as PP
import sklearn.decomposition as DC
import imblearn.pipeline as PL

import sklearn.linear_model as LM
import sklearn.model_selection as MS
import plotly.graph_objects as GO

In [2]:
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
os.chdir(os.getenv('PWD'))
sys.path.insert(0, os.getenv('PWD'))
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings.py')
import django
django.setup()

'app_proj.settings'

In [4]:
import app_proj.settings as ST
import movies.models.models as MD
import movies.models.analysis as NL

## Target & Features

In [5]:
FEATURE_FILE = os.path.join(ST.BASE_DIR, NL.FEATURE_PATH)
feature_full_df = PD.read_csv(FEATURE_FILE)
feature_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Columns: 879 entries, Movie_ID to Western
dtypes: float64(5), int64(873), object(1)
memory usage: 118.5+ MB


In [6]:
target_ls = MD.UserVotes.objects.values()
target_full_df = PD.DataFrame(target_ls).drop(columns=['id'])
target_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 3 columns):
Movie_ID    881 non-null int64
User        881 non-null object
Vote        881 non-null int64
dtypes: int64(2), object(1)
memory usage: 20.8+ KB


In [7]:
full_df = PD.merge(target_full_df, feature_full_df, how='left', left_on='Movie_ID', right_on='Movie_ID')
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 881 entries, 0 to 880
Columns: 881 entries, Movie_ID to Western
dtypes: float64(5), int64(874), object(2)
memory usage: 5.9+ MB


In [8]:
# keep only features for movies that have been voted on

feature_df = full_df.drop(columns=['Movie_ID', 'Title', 'User', 'Vote'])
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 881 entries, 0 to 880
Columns: 877 entries, Year to Western
dtypes: float64(5), int64(872)
memory usage: 5.9 MB


In [9]:
target_df = target_full_df.drop(columns=['Movie_ID', 'User'])
target_df.info()
target_df['Vote'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 1 columns):
Vote    881 non-null int64
dtypes: int64(1)
memory usage: 7.0 KB


1    574
2    231
3     76
Name: Vote, dtype: int64

In [10]:
382 / 591 * 100

64.63620981387479

## Run Baseline SVM

In [11]:
X_np = NP.array(feature_df)
y_np = NP.array(target_df['Vote'])

X_np.shape
y_np.shape

(881, 877)

(881,)

In [12]:
# run xgboost with default arguments as the baseline
# first get split-test-train 

y_train, y_test, X_train, X_test = MS.train_test_split(y_np, X_np, stratify=target_df['Vote'], test_size=0.2)
X_train.shape
y_train.shape

(704, 877)

(704,)

In [13]:
# impute values for PCA ?

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
imputer.fit(X_train)
X_train_impute = imputer.transform(X_train)
X_train_impute.shape

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

(704, 877)

In [14]:
# scale features

scaler = PP.StandardScaler()
scaler.fit(X_train_impute)
X_train_scale = scaler.transform(X_train_impute)
X_train_scale.shape

StandardScaler(copy=True, with_mean=True, with_std=True)

(704, 877)

In [15]:
#x_pca = DC.PCA(n_components=)

In [16]:
svm = LM.SGDClassifier();
t0 = time.time()

svm.fit(X_train_scale, y_train);
y_predict = svm.predict(X_train_scale);
PD.DataFrame(y_predict)[0].value_counts()

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

1    508
2    158
3     38
Name: 0, dtype: int64

time: 0.00 mins


In [17]:
# get the scores

train_score = svm.score(X_train_scale, y_train) *100
X_test_impute = imputer.transform(X_test)
X_test_scale = scaler.transform(X_test_impute)
test_score = svm.score(X_test_scale, y_test) *100

print(f'train score: {train_score:.1f}')
print(f'test score: {test_score:.1f}')

train score: 74.3
test score: 54.2


## Run Grid Search

steps: 
1. learning rate, eta0, power_t
2. number of iterations   
3. kernel optimization
4. type of regularization, and penalty 
5. loss type
6. class balance

In [18]:
LEARNING = 'invscaling'
ETA0 = 0.01
POWERT = None
MAX_ITER = 200
LOSS = 'epsilon_insensitive'
PENALTY= None
CLASS = None

#InteractiveShell.ast_node_interactivity = 'last'

In [19]:
# define the tuner function 

import copy 

def svm_tuner(trials, algorithm, param_name, param_values):

    param_dx = {f'estimator__{param_name}': param_values}
    best_ls = []
    t0 = time.time()

    score_mx = {}
    for pv in param_values:
        if not pv: pv = 'none'
        score_mx[pv] = []

    for t in range(0, trials):

        if t % 10 == 0: print(f'trial: {t}')

        imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
        scaler = PP.StandardScaler()
        estimator = copy.deepcopy(algorithm)
        pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', estimator)])
        searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5)
        searcher.fit(X_np, y_np);

        best_ls.append(searcher.best_params_[f'estimator__{param_name}'])
        score_per_value_ls = list(searcher.cv_results_['mean_test_score'])

        for idx, pv in enumerate(param_values):
            if not pv: pv = 'none'
            percent = score_per_value_ls[idx] *100
            rounded = float(f'{percent:.1f}')
            score_mx[pv].append(rounded)

    t1 = time.time()
    best_value = PD.DataFrame(best_ls)[0].value_counts().index[0]
    mean_trials_ls = [ float(f'{NP.array(score_mx[scr]).mean():.1f}') for scr in score_mx ]
    scores_dx = dict(zip(param_values, mean_trials_ls ))

    print('')
    print(f'best value: {best_value}')
    print(f'best score: {scores_dx[best_value]}')
    print('')
    print(f'time: {(t1-t0)/60:.2f} mins')
    print(f'scores: {scores_dx}')
    print('')
    print( PD.DataFrame(best_ls)[0].value_counts() )

## Run Tuner

In [20]:
# best parameters from tuning

svm = LM.SGDClassifier(learning_rate=LEARNING, eta0=ETA0, max_iter=MAX_ITER, loss=LOSS);
t0 = time.time()

svm.fit(X_train_scale, y_train);
y_predict = svm.predict(X_train_scale);
PD.DataFrame(y_predict)[0].value_counts()

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.01, fit_intercept=True,
              l1_ratio=0.15, learning_rate='invscaling',
              loss='epsilon_insensitive', max_iter=200, n_iter_no_change=5,
              n_jobs=None, penalty='l2', power_t=0.5, random_state=None,
              shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
              warm_start=False)

1    599
2     78
3     27
Name: 0, dtype: int64

time: 0.00 mins


In [21]:
# get the scores

train_score = svm.score(X_train_scale, y_train) *100
X_test_impute = imputer.transform(X_test)
X_test_scale = scaler.transform(X_test_impute)
test_score = svm.score(X_test_scale, y_test) *100

print(f'train score: {train_score:.1f}')
print(f'test score: {test_score:.1f}')

train score: 78.1
test score: 60.5


In [22]:
# 7 class balance 

tri als = 20
algorithm = LM.SGDClassifier(learning_rate=LEARNING, eta0=ETA0, max_iter=MAX_ITER,
                            loss=LOSS);
param_name = 'class_weight'
param_values = ['balanced'] #, {1: 1, 2: 3, 3: 6}] 

svm_tuner(trials, algorithm, param_name, param_values)

SyntaxError: invalid syntax (<ipython-input-22-4a2d087e35f5>, line 3)

In [None]:
# balancing the classes drops the accuracy by 5%

"""
best value: balanced
best score: 55.9

time: 0.37 mins
scores: {'balanced': 55.9}

balanced    20
"""

In [None]:
# 6 regularization 

trials = 50
algorithm = LM.SGDClassifier(learning_rate=LEARNING, eta0=ETA0, max_iter=MAX_ITER,
                            loss=LOSS);
param_name = 'penalty'
param_values = ['none', 'l2', 'l1'] 

svm_tuner(trials, algorithm, param_name, param_values)

In [None]:
# results are not better than default values
"""
best value: l2
best score: 60.4

time: 2.44 mins
scores: {'none': 60.4, 'l2': 60.4, 'l1': 60.5}

l2      18
none    18
l1      14
"""

In [None]:
# 5 loss type 

tri als = 50
algorithm = LM.SGDClassifier(learning_rate=LEARNING, eta0=ETA0, max_iter=MAX_ITER);
param_name = 'loss'
param_values = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 
                'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'] 

svm_tuner(trials, algorithm, param_name, param_values)

In [None]:
"""
best value: epsilon_insensitive
best score: 60.5

time: 5.32 mins
scores: {'hinge': 59.6, 'log': 59.2, 'modified_huber': 58.4, 'squared_hinge': 58.4, 'perceptron': 52.3, 'squared_loss': 59.6, 'huber': 57.3, 'epsilon_insensitive': 60.5, 'squared_epsilon_insensitive': 59.0}

epsilon_insensitive    48
squared_loss            2
"""

In [None]:
# 4 max iterations

tri als = 100
algorithm = LM.SGDClassifier(learning_rate=LEARNING, eta0=ETA0);
param_name = 'max_iter'
param_values = [100, 150, 200, 250, 300, 350] 

svm_tuner(trials, algorithm, param_name, param_values)

In [None]:
"""
best value: 100
best score: 59.6

time: 7.48 mins
scores: {100: 59.6, 150: 59.6, 200: 59.6, 250: 59.6, 300: 59.6, 350: 59.6}

100    26
250    19
200    18
150    17
300    11
350     9

200    19
100    17
300    14
400    14
600    13
500    11
700     7
800     5
"""

In [None]:
# 3 learning power-t

tri als = 50
algorithm = LM.SGDClassifier(learning_rate=LEARNING, eta0=ETA0);
param_name = 'power_t'
param_values = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]

svm_tuner(trials, algorithm, param_name, param_values)

In [None]:
"""
best value: 0.1
best score: 57.7

time: 2.62 mins
scores: {0.0001: 56.2, 0.001: 56.4, 0.01: 56.5, 0.1: 57.7, 1.0: 48.1, 10.0: 33.5, 100.0: 33.4}

0.1000    44
0.0010     3
0.0100     2
0.0001     1
"""

In [None]:
# 2 learning eta

trials = 50
algorithm = LM.SGDClassifier(learning_rate=LEARNING, eta0=0.1);
param_name = 'eta0'
param_values = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]

svm_tuner(trials, algorithm, param_name, param_values)

In [None]:
"""
best value: 0.01
best score: 59.5

time: 6.34 mins
scores: {0.0001: 57.1, 0.001: 57.8, 0.01: 59.5, 0.1: 58.4, 1.0: 56.7, 10.0: 54.4}

0.01    49
0.10     1
"""

In [None]:
# 1 learning rate

trials = 50
algorithm = LM.SGDClassifier(eta0=0.1);
param_name = 'learning_rate'
param_values = ['constant', 'optimal', 'invscaling', 'adaptive']

svm_tuner(trials, algorithm, param_name, param_values)

In [None]:
"""
best value: invscaling
best score: 58.5

time: 2.87 mins
scores: {'constant': 53.6, 'optimal': 53.5, 'invscaling': 58.5, 'adaptive': 56.0}

invscaling    50
"""