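# XGBoost (GloVe Embeddings)

Multi-class text classification with XGBoost: each tokenized document is converted to a vector using GloVe word embeddings, and the classifier's hyperparameters are tuned with a grid search.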

### Libraries

In [1]:
import process_funcs as pf


import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import XGBClassifier
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

### Parameters

In [2]:
# Hyperparameter grid explored by GridSearchCV in the modeling cell.
#   gamma     - minimum loss reduction required to make a further split
#   max_depth - maximum depth of each tree
#
# 'sampling_method' is deliberately NOT part of the grid: it only selects how
# training rows are subsampled, and the XGBClassifier in the modeling cell never
# sets `subsample`, so it stays at its default of 1 (no row subsampling).
# Searching over it therefore trained identical models twice per (gamma,
# max_depth) pair. 'gradient_based' is additionally documented by XGBoost as a
# GPU-hist-only option. Re-introduce it only together with `subsample < 1`.
grid_params = {
    'gamma': [0.5, 1.5],
    'max_depth': [3, 5],
}

### Reading Data

In [3]:
# Load the train/test split from the project helper module.
# At this point train_x / test_x hold tokenized documents (lists of word strings).
# label_map is passed on to pf.run_model later -- presumably it maps encoded
# labels to category names; confirm in process_funcs.
train_x, test_x, train_y, test_y, label_map = pf.get_train_test()

In [4]:
# Sample instance: first (up to) 200 tokens of the first training document
train_x[0][:200]

['mps',
 'tout',
 'lords',
 'replacement',
 'plan',
 'group',
 'mps',
 'tried',
 'raise',
 'pressure',
 'tony',
 'blair',
 'reform',
 'house',
 'lords',
 'publishing',
 'detailed',
 'blueprint',
 'change',
 'cross',
 'party',
 'group',
 'unveiled',
 'draft',
 'bill',
 'proposing',
 'smaller',
 'second',
 'chamber',
 '70',
 'members',
 'would',
 'elected',
 'mps',
 'peers',
 'failed',
 'agree',
 'reform',
 'since',
 '1999',
 '600',
 'hereditaries',
 'lost',
 'seats',
 'group',
 'says',
 'win',
 'support',
 'removing',
 'last',
 '92',
 'hereditaries',
 'government',
 'postponed',
 'plans',
 'remove',
 'remaining',
 'hereditary',
 'peers',
 'said',
 'unlikely',
 'succeed',
 'opposition',
 'lords',
 'tony',
 'blair',
 'argued',
 'needs',
 'consensus',
 'reforms',
 'suggestions',
 'proposals',
 'changing',
 'least',
 'powers',
 'lords',
 'labour',
 'manifesto',
 'party',
 'group',
 'including',
 'tories',
 'ken',
 'clarke',
 'sir',
 'george',
 'young',
 'labour',
 'robin',
 'cook',
 'tony',

### Embeddings: GloVe

In [5]:
# Load the GloVe lookup via the project helper; it is indexed by word below.
embeddings_index = pf.get_glove_embeddings()
# Sanity check: display the embedding stored for the word 'test'.
embeddings_index['test']

array([-5.8342e-01,  4.8631e-01,  7.4230e-01,  1.7875e-01, -1.5873e+00,
       -3.7499e-01,  3.7902e-01,  7.0767e-01, -1.5402e+00,  7.4851e-01,
       -6.9018e-03, -4.4981e-01, -1.1125e-01,  1.7395e-01,  2.6005e-01,
        2.7065e-01,  7.8731e-01,  8.4877e-01, -9.9277e-02,  1.7688e-02,
        4.4389e-01, -8.7300e-01,  4.9640e-01, -1.8430e-01,  2.4894e-01,
        2.4073e-01,  8.4460e-02,  7.0786e-02, -2.3216e-01,  2.6685e-01,
       -2.3519e-01,  4.5211e-01, -3.9982e-01,  2.4936e-01,  7.3548e-01,
       -7.2352e-02, -8.1008e-01, -1.5256e-01, -1.0313e+00,  2.3067e-01,
       -1.1634e+00,  2.0387e-01,  4.2369e-01, -1.0589e+00,  2.9905e-01,
        2.0036e-01,  6.3371e-01, -5.7507e-01, -4.3730e-01, -5.5908e-01,
        5.0811e-01,  3.2673e-01, -4.3710e-01,  1.0702e+00, -1.0418e-01,
       -2.4432e+00, -1.0739e+00,  1.0881e-01,  1.5446e+00, -1.5633e-01,
       -3.7779e-01,  1.5111e-01,  5.8397e-01,  5.4980e-01,  2.3775e-01,
        8.8690e-01, -1.0220e-01, -4.1313e-02, -1.9496e-01, -1.37

In [6]:
# Build doc vectors using embeddings.
# NOTE: train_x / test_x are rebound here from token lists to document vectors,
# so this cell is not idempotent -- re-running it on its own would pass the
# vectors (not the tokens) back into create_doc_vecs. Re-run the data loading
# cell first if this cell needs to be executed again.
train_x = pf.create_doc_vecs(train_x, embeddings_index)
test_x = pf.create_doc_vecs(test_x, embeddings_index)

In [7]:
# Inspect the first training document after conversion: it is now a numeric
# array rather than a list of tokens.
train_x[0]

array([-1.94632342e-01,  1.17435557e-01,  2.29419819e-01, -1.86716018e-01,
        1.34396525e-01,  1.99695025e-02, -1.60269121e-01,  3.28238387e-02,
       -1.67631354e-01, -5.12015900e-02, -3.70541413e-02, -4.74471735e-02,
        9.44325364e-02, -1.57141492e-02, -9.20029029e-02, -2.72415726e-01,
        6.85131115e-03, -3.74372389e-02, -3.88483376e-01,  9.19017244e-02,
        1.90124373e-01, -1.48356169e-01,  1.83016901e-01, -2.05767882e-01,
       -1.28358913e-01, -3.46939932e-01, -2.77133482e-02, -3.44619876e-01,
        1.31586809e-01,  8.55594383e-03,  1.43579483e-01,  4.21037387e-01,
       -2.95240892e-02, -1.02253168e-01, -3.60065740e-04,  2.48176318e-01,
        1.16891988e-01,  1.32472380e-01, -2.46243883e-01,  3.98082501e-03,
       -5.55756161e-01, -2.30402363e-01,  4.50221278e-01, -1.99466872e-01,
       -5.48977220e-02, -2.35516431e-01,  3.43716664e-03, -3.00909484e-01,
       -1.90913338e-01, -4.34730730e-01,  7.85542536e-02, -1.78140641e-01,
        9.04847001e-02,  

### Modeling

In [8]:
# Base learner: evaluated with multi-class log-loss; the label-encoder flag is
# passed through unchanged.
xgb_estimator = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Grid search over grid_params, using GridSearchCV's default settings.
xgb_search = GridSearchCV(xgb_estimator, grid_params)

# The search object is the pipeline's only step, registered under the name "XGB".
pipeline = Pipeline([("XGB", xgb_search)])

# Fit and evaluate through the project helper, then hand the outcome to
# pf.store_results under the 'xgb' / 'glove' tags.
run_outputs = pf.run_model(pipeline, train_x, train_y, test_x, test_y, label_map)
model, y_preds, accuracy, weighted_f1 = run_outputs
pf.store_results('xgb', 'glove', model, accuracy, weighted_f1)

ACCURACY %:  0.9497
WEIGHTED F1:  0.9498
               precision    recall  f1-score   support

     business       0.94      0.91      0.92        64
entertainment       0.95      0.98      0.97        61
     politics       0.88      0.91      0.90        47
        sport       0.99      0.97      0.98        73
         tech       0.98      0.96      0.97        53

     accuracy                           0.95       298
    macro avg       0.95      0.95      0.95       298
 weighted avg       0.95      0.95      0.95       298

