# SVM (Custom Embeddings)

### Libraries

In [1]:
import process_funcs as pf

import pandas as pd
import numpy as np
import gensim

import sklearn
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

### Parameters

In [2]:
# Search space handed to GridSearchCV(SVC(), grid_params).
#
# The grid is split per kernel so that no cross-validation fit is wasted:
#   * `gamma` is only a kernel coefficient for 'rbf'/'poly'/'sigmoid'; SVC
#     ignores it for the 'linear' kernel, so crossing it with 'linear' just
#     refits the same model three times.
#   * `decision_function_shape` is intentionally not searched: it only changes
#     the shape of `decision_function` output, not how the model is trained or
#     what it predicts, so 'ovr' and 'ovo' always score identically. The SVC
#     default ('ovr') is used.
# This reduces the search from 36 candidates to the 12 distinct models.
grid_params = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': [5, 10, 15],
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
]

### Reading Data

In [3]:
# Load the train/test inputs and labels through the project helper module.
# NOTE(review): label_map is presumably the class-id -> class-name mapping
# used when reporting results — confirm in process_funcs.
train_x, test_x, train_y, test_y, label_map  = pf.get_train_test()

In [4]:
# Peek at the first training document: show at most its first 200 tokens.
train_x[0][:200]

['mps',
 'tout',
 'lords',
 'replacement',
 'plan',
 'group',
 'mps',
 'tried',
 'raise',
 'pressure',
 'tony',
 'blair',
 'reform',
 'house',
 'lords',
 'publishing',
 'detailed',
 'blueprint',
 'change',
 'cross',
 'party',
 'group',
 'unveiled',
 'draft',
 'bill',
 'proposing',
 'smaller',
 'second',
 'chamber',
 '70',
 'members',
 'would',
 'elected',
 'mps',
 'peers',
 'failed',
 'agree',
 'reform',
 'since',
 '1999',
 '600',
 'hereditaries',
 'lost',
 'seats',
 'group',
 'says',
 'win',
 'support',
 'removing',
 'last',
 '92',
 'hereditaries',
 'government',
 'postponed',
 'plans',
 'remove',
 'remaining',
 'hereditary',
 'peers',
 'said',
 'unlikely',
 'succeed',
 'opposition',
 'lords',
 'tony',
 'blair',
 'argued',
 'needs',
 'consensus',
 'reforms',
 'suggestions',
 'proposals',
 'changing',
 'least',
 'powers',
 'lords',
 'labour',
 'manifesto',
 'party',
 'group',
 'including',
 'tories',
 'ken',
 'clarke',
 'sir',
 'george',
 'young',
 'labour',
 'robin',
 'cook',
 'tony',

### Embeddings: Custom

Using custom-trained Word2Vec embeddings (trained on the `train_x` set), *obtain each document vector by averaging the vectors of all of its words that are found in the embeddings.*

In [5]:
# Load the custom embeddings through the project helper, then look up a single
# word as a sanity check (the cell output is the vector stored for 'test').
word_vecs = pf.get_custom_embeddings()
word_vecs['test']

array([-2.90044099e-01,  3.59430015e-01,  2.48870715e-01, -2.82393415e-02,
        1.41623110e-01, -8.19525063e-01,  2.65341699e-01,  9.85383213e-01,
       -3.41951907e-01, -3.67978811e-01, -3.71961445e-01, -7.59894550e-01,
       -1.44224748e-01,  3.28498483e-01,  1.09700061e-01, -4.06162053e-01,
        3.44611891e-02, -5.55934727e-01,  1.05890691e-01, -1.02582669e+00,
        9.64341983e-02,  4.67433631e-01,  2.92174608e-01, -2.41391033e-01,
        1.74294729e-02, -5.82534149e-02, -3.85833681e-01, -6.68074787e-02,
       -4.87894446e-01,  3.07648517e-02,  2.60165840e-01,  1.24028340e-01,
        3.38057876e-02, -3.44380796e-01, -1.49403095e-01,  3.88319135e-01,
       -2.96244696e-02, -5.40337443e-01, -1.06616601e-01, -8.14761877e-01,
        2.59913713e-01, -4.67677951e-01, -2.61597395e-01, -2.17816103e-02,
        3.42463166e-01, -3.88499916e-01, -4.27326769e-01, -5.06307296e-02,
        3.58640820e-01,  2.96276063e-01,  7.20875803e-03, -5.05444229e-01,
       -3.69120389e-01, -

In [6]:
# Build document vectors from the embeddings for both splits.
# Both conversions are evaluated before either name is rebound, so a failure
# on one split cannot leave train_x already converted while test_x still holds
# raw tokens.
# NOTE(review): this cell overwrites its own inputs, so it is not re-runnable
# on its own — reload the data first if it has to be executed again.
train_x, test_x = (
    pf.create_doc_vecs(train_x, word_vecs),
    pf.create_doc_vecs(test_x, word_vecs),
)

In [7]:
# Drop the embedding lookup now that the document vectors are built, then
# display the first training document vector.
# NOTE(review): once `word_vecs` is deleted, re-running this cell (or the
# earlier cell that calls pf.create_doc_vecs) raises NameError until the
# embeddings are loaded again.
del word_vecs
train_x[0]

array([-2.82748525e-01,  2.60588847e-01,  1.23432209e-01,  7.19064145e-02,
        1.48467112e-01, -6.84281393e-01,  2.35312086e-01,  8.55405963e-01,
       -2.94106046e-01, -3.54220899e-01, -2.58683947e-01, -6.57470230e-01,
       -1.96259837e-01,  2.56551376e-01,  1.55546186e-02, -2.75647881e-01,
        9.61999502e-02, -4.76763188e-01,  1.43492206e-01, -7.76016112e-01,
        1.36625480e-01,  3.94372345e-01,  2.34876551e-01, -2.22466861e-01,
        1.19479772e-01, -3.36868835e-02, -2.64772900e-01, -7.55546702e-02,
       -4.65506827e-01, -1.02797444e-02,  2.74830315e-01, -5.43003564e-03,
       -2.23211568e-02, -3.20921366e-01, -8.63172893e-02,  3.37534658e-01,
       -2.14993816e-02, -4.13767237e-01, -9.87016278e-03, -6.99858874e-01,
        1.64212585e-01, -4.61797754e-01, -2.63425135e-01, -2.16626038e-03,
        2.86316689e-01, -2.87325996e-01, -4.55082103e-01, -3.23940473e-02,
        4.01193486e-01,  2.57032726e-01,  1.09849344e-02, -4.21438531e-01,
       -2.86246671e-01, -

### Modeling

In [8]:
# Two-stage model: normalise each document vector, then grid-search an SVC
# over the candidates described by grid_params.
pipeline = Pipeline(
    steps=[
        ("Normalize", Normalizer()),
        ("SVC", GridSearchCV(estimator=SVC(), param_grid=grid_params)),
    ]
)

# Train/evaluate via the project helper, then hand the fitted model and its
# scores to pf.store_results under the ('svm', 'custom') labels.
model, y_preds, accuracy, weighted_f1 = pf.run_model(
    pipeline, train_x, train_y, test_x, test_y, label_map
)
pf.store_results('svm', 'custom', model, accuracy, weighted_f1)

ACCURACY %:  0.8658
WEIGHTED F1:  0.8655
               precision    recall  f1-score   support

     business       0.79      0.89      0.84        64
entertainment       0.89      0.79      0.83        61
     politics       0.86      0.94      0.90        47
        sport       0.94      0.92      0.93        73
         tech       0.84      0.79      0.82        53

     accuracy                           0.87       298
    macro avg       0.87      0.86      0.86       298
 weighted avg       0.87      0.87      0.87       298

