# SVM (Word2Vec Embeddings)

### Libraries

In [1]:
import process_funcs as pf

import pandas as pd
import numpy as np
import gensim

import sklearn
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

### Parameters

In [2]:
# Hyper-parameter search space for the SVC, expressed as one sub-grid per
# kernel so that only parameters the kernel actually uses are crossed:
#   * `gamma` is a coefficient of the RBF kernel and is ignored by the linear
#     kernel, so it is searched only for 'rbf'.
#   * `decision_function_shape` is intentionally not searched: it only changes
#     the shape of `decision_function` output; SVC always trains one-vs-one
#     internally, so it does not alter predictions or cross-validation scores.
# This yields 3 (linear) + 9 (rbf) = 12 candidates instead of the 36 produced
# by a single flat grid, with the same set of distinct models.
# NOTE(review): candidate ordering differs from a flat grid, so in the
# unlikely event of an exact CV-score tie a different (equally scored)
# candidate may be reported as best.
grid_params = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': [5, 10, 15],
    },
]

### Reading Data

In [3]:
# Load the train/test split and the label mapping from the shared helper module.
(train_x, test_x, train_y, test_y, label_map) = pf.get_train_test()

In [4]:
# Preview the first training instance (at most its first 200 items)
train_x[0][:200]

['mps',
 'tout',
 'lords',
 'replacement',
 'plan',
 'group',
 'mps',
 'tried',
 'raise',
 'pressure',
 'tony',
 'blair',
 'reform',
 'house',
 'lords',
 'publishing',
 'detailed',
 'blueprint',
 'change',
 'cross',
 'party',
 'group',
 'unveiled',
 'draft',
 'bill',
 'proposing',
 'smaller',
 'second',
 'chamber',
 '70',
 'members',
 'would',
 'elected',
 'mps',
 'peers',
 'failed',
 'agree',
 'reform',
 'since',
 '1999',
 '600',
 'hereditaries',
 'lost',
 'seats',
 'group',
 'says',
 'win',
 'support',
 'removing',
 'last',
 '92',
 'hereditaries',
 'government',
 'postponed',
 'plans',
 'remove',
 'remaining',
 'hereditary',
 'peers',
 'said',
 'unlikely',
 'succeed',
 'opposition',
 'lords',
 'tony',
 'blair',
 'argued',
 'needs',
 'consensus',
 'reforms',
 'suggestions',
 'proposals',
 'changing',
 'least',
 'powers',
 'lords',
 'labour',
 'manifesto',
 'party',
 'group',
 'including',
 'tories',
 'ken',
 'clarke',
 'sir',
 'george',
 'young',
 'labour',
 'robin',
 'cook',
 'tony',

### Embeddings: Word2Vec

Using Google's pretrained Word2Vec embeddings, *each document vector is obtained by averaging the vectors of all of the document's words that are found in the embedding vocabulary.*

In [5]:
# Load the pretrained Google News Word2Vec vectors (binary word2vec format)
word_vecs = gensim.models.KeyedVectors.load_word2vec_format(
    './embeddings/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [6]:
# Replace each split with its document vectors built from the embeddings.
# NOTE: this overwrites train_x / test_x, so the cell is not safe to re-run
# without first re-running the data-loading cell.
train_x, test_x = (
    pf.create_doc_vecs(split, word_vecs) for split in (train_x, test_x)
)

In [7]:
# The embedding model is not used after the document vectors are built;
# drop the reference so its memory can be reclaimed.
del word_vecs
# Display the first training instance after conversion to a document vector.
train_x[0]

array([-0.02218279,  0.04985856,  0.02728415,  0.08805169, -0.08756893,
       -0.07085403,  0.01250593, -0.04748941,  0.0890011 ,  0.04806667,
       -0.0635045 , -0.07388218, -0.0457906 ,  0.04253522, -0.10962959,
        0.10289368,  0.0804654 ,  0.06638121, -0.01923976, -0.0799837 ,
        0.01506541,  0.03490134,  0.03741951,  0.01686136,  0.00324339,
       -0.03654537, -0.09544601,  0.06385504, -0.01494876, -0.00396332,
        0.02528003, -0.0282175 , -0.03472966,  0.02006287,  0.03376151,
        0.00632371,  0.02322263,  0.0582406 ,  0.00092155,  0.0321481 ,
        0.05439585, -0.06463421,  0.12898988, -0.00302689, -0.06303241,
       -0.09617973, -0.04804834, -0.02572512, -0.08551448,  0.00611576,
        0.00528029, -0.00515997,  0.04007765,  0.00104629, -0.02698291,
        0.0309202 , -0.09233812, -0.02223878, -0.0095085 , -0.07138786,
       -0.03126817,  0.06862007, -0.08335931, -0.07144118,  0.00708394,
       -0.04007832, -0.03921117,  0.11604283, -0.00284277,  0.05

### Modeling

In [8]:
# Cross-validated grid search over the SVC hyper-parameters in `grid_params`.
svc_search = GridSearchCV(SVC(), grid_params)

# Normalize each document vector, then fit the grid-searched SVC.
pipeline = Pipeline(steps=[
    ("Normalize", Normalizer()),
    ("SVC", svc_search),
])

# Fit/evaluate via the shared helper, then persist the results for this
# model/embedding combination.
model, y_preds, accuracy, weighted_f1 = pf.run_model(
    pipeline, train_x, train_y, test_x, test_y, label_map
)
pf.store_results('svm', 'word2vec', model, accuracy, weighted_f1)

ACCURACY %:  0.9631
WEIGHTED F1:  0.9632
               precision    recall  f1-score   support

     business       0.95      0.94      0.94        64
entertainment       0.98      0.97      0.98        61
     politics       0.94      0.94      0.94        47
        sport       1.00      0.99      0.99        73
         tech       0.93      0.98      0.95        53

     accuracy                           0.96       298
    macro avg       0.96      0.96      0.96       298
 weighted avg       0.96      0.96      0.96       298

