# SVM (FastText Embeddings)

### Libraries

In [1]:
import process_funcs as pf

import pandas as pd
import numpy as np

import sklearn
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV

### Parameters

In [2]:
# Hyper-parameter grid for the SVC search, written as a list of sub-grids so
# that only combinations that can produce a different fitted model are tried:
#   * 'gamma' is a kernel coefficient for the RBF kernel and is ignored by the
#     linear kernel, so it is only varied for 'rbf';
#   * 'decision_function_shape' only changes the shape of decision_function()
#     output, not the trained model or its predictions, so the SVC default is
#     kept instead of fitting every candidate twice.
# This yields 12 candidates instead of the 36 of a single Cartesian product,
# while covering the same set of distinct models.
grid_params = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [5, 10, 15]},
]

### Reading Data

In [3]:
# Fetch the train/test split and the label mapping from the project helper.
(
    train_x,
    test_x,
    train_y,
    test_y,
    label_map,
) = pf.get_train_test()

In [4]:
# Preview one training document: the first 200 tokens of the first sample.
train_x[0][:200]

['mps',
 'tout',
 'lords',
 'replacement',
 'plan',
 'group',
 'mps',
 'tried',
 'raise',
 'pressure',
 'tony',
 'blair',
 'reform',
 'house',
 'lords',
 'publishing',
 'detailed',
 'blueprint',
 'change',
 'cross',
 'party',
 'group',
 'unveiled',
 'draft',
 'bill',
 'proposing',
 'smaller',
 'second',
 'chamber',
 '70',
 'members',
 'would',
 'elected',
 'mps',
 'peers',
 'failed',
 'agree',
 'reform',
 'since',
 '1999',
 '600',
 'hereditaries',
 'lost',
 'seats',
 'group',
 'says',
 'win',
 'support',
 'removing',
 'last',
 '92',
 'hereditaries',
 'government',
 'postponed',
 'plans',
 'remove',
 'remaining',
 'hereditary',
 'peers',
 'said',
 'unlikely',
 'succeed',
 'opposition',
 'lords',
 'tony',
 'blair',
 'argued',
 'needs',
 'consensus',
 'reforms',
 'suggestions',
 'proposals',
 'changing',
 'least',
 'powers',
 'lords',
 'labour',
 'manifesto',
 'party',
 'group',
 'including',
 'tories',
 'ken',
 'clarke',
 'sir',
 'george',
 'young',
 'labour',
 'robin',
 'cook',
 'tony',

### Embeddings: FastText

In [5]:
# Load the FastText embedding lookup through the project helper, then display
# the entry for the word 'test' as a quick sanity check (it is indexable by
# word and yields a numeric array, as the cell output shows).
embeddings_index = pf.get_fasttext_embeddings()
embeddings_index['test']

array([ 2.470e-02,  1.440e-02,  8.120e-02,  7.820e-02,  1.574e-01,
       -5.940e-02,  1.040e-02,  4.900e-02,  6.830e-02, -1.250e-02,
        4.900e-03, -1.502e-01,  3.550e-02, -2.150e-02, -7.810e-02,
        1.390e-02,  1.590e-02,  8.020e-02,  7.940e-02,  5.510e-02,
       -9.240e-02, -2.400e-03, -1.080e-02,  3.920e-02, -9.550e-02,
       -5.410e-02,  1.295e-01, -4.210e-02, -4.820e-02,  5.940e-02,
       -6.630e-02,  3.570e-02, -4.980e-02, -7.060e-02, -5.330e-02,
       -5.830e-02,  6.570e-02, -4.200e-02, -1.000e-04, -6.680e-02,
        1.077e-01,  4.500e-02,  6.790e-02,  1.968e-01, -1.050e-02,
       -1.794e-01, -1.018e-01,  8.600e-03, -1.510e-02,  3.370e-02,
       -4.600e-02,  4.870e-02, -7.331e-01, -1.660e-02, -1.825e-01,
        1.104e-01, -3.210e-02,  2.130e-02,  1.080e-02, -6.190e-02,
        5.100e-03, -4.690e-02, -1.475e-01,  1.425e-01, -2.120e-01,
       -6.640e-02, -7.300e-03,  8.390e-02,  1.497e-01, -1.082e-01,
       -2.500e-03, -2.200e-03, -1.308e-01, -6.240e-02, -3.580e

In [6]:
# Build doc vectors using embeddings.
# How the word vectors are combined is decided inside pf.create_doc_vecs
# (presumably an average per document -- confirm in process_funcs).
# NOTE(review): train_x / test_x are rebound from token lists to vectors here,
# so this cell is not idempotent -- re-running it would feed the vectors back
# into create_doc_vecs. Re-run the data loading cell first if it must be redone.
train_x = pf.create_doc_vecs(train_x, embeddings_index)
test_x = pf.create_doc_vecs(test_x, embeddings_index)

In [7]:
# Inspect the document vector produced for the first training sample.
train_x[0]

array([ 2.44389090e-02, -2.09461817e-02, -2.52036349e-03, -7.24436337e-03,
        9.75127253e-03,  3.05818545e-04,  9.72109057e-03,  1.37036363e-02,
        5.48472723e-03, -5.52701815e-02, -3.22396364e-02,  6.87236368e-03,
        2.87156364e-02, -5.25381819e-03, -5.55781818e-03, -2.36421816e-02,
        7.08109101e-03, -2.18967274e-02, -5.78545468e-03, -1.16109091e-02,
       -8.62527277e-02, -8.82181789e-03,  4.55018201e-03, -1.59261814e-02,
       -6.52181832e-03, -5.85890870e-03,  1.25767273e-02,  2.96036366e-02,
       -2.15338181e-02, -7.15949092e-02,  2.16952727e-02, -1.05505455e-02,
       -2.01018178e-03,  1.31280004e-02,  1.46152728e-02,  2.08909076e-03,
       -1.39563636e-02,  2.98440002e-02,  2.41123636e-02, -8.33890901e-03,
        7.00254572e-03,  1.70574547e-02,  1.71621817e-02,  6.07745436e-03,
        3.16556358e-02, -2.54000003e-03,  1.48654529e-03,  2.11858186e-02,
        1.42305453e-02, -9.17781822e-03,  4.18182145e-04,  1.18996370e-02,
       -6.68763637e-01, -

### Modeling

In [8]:
# Two-step pipeline: normalise the document vectors, then run a grid search
# over grid_params to choose the SVC hyper-parameters.
pipeline = Pipeline(
    steps=[
        ("Normalize", Normalizer()),
        ("SVC", GridSearchCV(estimator=SVC(), param_grid=grid_params)),
    ]
)

# run_model and store_results are project helpers; the printed accuracy, F1 and
# per-class report below come from run_model (presumably it fits on the train
# split and scores on the test split -- confirm in process_funcs).
(model, y_preds, accuracy, weighted_f1) = pf.run_model(
    pipeline, train_x, train_y, test_x, test_y, label_map
)
pf.store_results('svm', 'fasttext', model, accuracy, weighted_f1)

ACCURACY %:  0.9597
WEIGHTED F1:  0.9597
               precision    recall  f1-score   support

     business       0.94      0.92      0.93        64
entertainment       0.98      0.97      0.98        61
     politics       0.91      0.91      0.91        47
        sport       1.00      0.99      0.99        73
         tech       0.95      1.00      0.97        53

     accuracy                           0.96       298
    macro avg       0.96      0.96      0.96       298
 weighted avg       0.96      0.96      0.96       298

