# Classification using BERT embeddings

## Loading data

In [1]:
import pandas as pd

# Both CSV files must provide a `cat1` column that holds the class label of every row.
# The matching embeddings files are read further down from ../data/embedding/.
train_data = pd.read_csv('../data/formatted/train_data.csv')
test_data = pd.read_csv('../data/formatted/test_data.csv')

# Keep the labels as plain Python lists, in the same row order as the CSV files.
train_data_labels = [label for label in train_data['cat1']]
test_data_labels = [label for label in test_data['cat1']]

In [2]:
# Reading the train data embeddings.
# Every non-blank line of the file is parsed as one whitespace-separated vector of floats.
with open('../data/embedding/train_data_embeddings') as vecs_file:
    training_data_embeddings = [
        [float(value) for value in line.split()]
        for line in vecs_file
        if line.strip()  # a blank line would otherwise turn into an empty vector
    ]

# Embeddings and labels are paired purely by position, so their counts must agree.
assert len(training_data_embeddings) == len(train_data_labels), (
    'train embeddings and labels differ in length: {} != {}'.format(
        len(training_data_embeddings), len(train_data_labels)
    )
)

In [3]:
# Reading the test data embeddings.
# Every non-blank line of the file is parsed as one whitespace-separated vector of floats.
with open('../data/embedding/test_data_embeddings') as vecs_file:
    test_data_embeddings = [
        [float(value) for value in line.split()]
        for line in vecs_file
        if line.strip()  # a blank line would otherwise turn into an empty vector
    ]

# Embeddings and labels are paired purely by position, so their counts must agree.
assert len(test_data_embeddings) == len(test_data_labels), (
    'test embeddings and labels differ in length: {} != {}'.format(
        len(test_data_embeddings), len(test_data_labels)
    )
)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression fitted with the L-BFGS solver on the train embeddings.
# max_iter is set high (10000) and verbose=2 makes the solver log its progress.
lm = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=10000,
    verbose=2,
)
lm.fit(training_data_embeddings, train_data_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [17]:
from sklearn.metrics import classification_report

# Predict a label for every test embedding with the fitted logistic regression,
# then print the per-class precision / recall / F1 table.
prediction_labels = lm.predict(test_data_embeddings)
print(
    classification_report(
        y_true=test_data_labels,  # already a plain list, no copy needed
        y_pred=list(prediction_labels),
    )
)

              precision    recall  f1-score   support

           1       0.82      0.81      0.82     21674
           2       0.63      0.73      0.68     28604
          12       0.75      0.70      0.73     20714
          38       0.72      0.66      0.69     16511
          67       0.82      0.84      0.83     31741
          79       0.49      0.28      0.35     11070
         125       0.79      0.88      0.84     21342
         143       0.97      0.98      0.97     46593
         151       0.59      0.27      0.37       472
         191       0.84      0.78      0.81      1279

   micro avg       0.79      0.79      0.79    200000
   macro avg       0.74      0.69      0.71    200000
weighted avg       0.79      0.79      0.79    200000



## XGBoost

In [4]:
import xgboost as xgb
import numpy as np

### Training an XGBoost Classifier using BERT embeddings

In [5]:
# Small boosted-tree classifier: 10 trees of depth 2, multi-class softmax objective.
# num_class=10 matches the ten distinct labels listed in this notebook's classification reports.
# NOTE(review): `silent` appears to be deprecated in newer XGBoost releases in favour of
# `verbosity` -- confirm against the installed version before changing it.
clf = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=10,
    n_estimators=10,
    max_depth=2,
    n_jobs=30,  # number of parallel threads requested
    silent=False,
)

In [None]:
# Train on NumPy arrays built from the Python lists read earlier in the notebook.
clf.fit(
    np.asarray(training_data_embeddings),
    np.asarray(train_data_labels),
)

In [7]:
# Save the trained model to the file 'xgb_model' (path relative to the working directory).
# This cell must run after the clf.fit cell: the error recorded below it
# ("need to call fit or load_model beforehand") was raised when it ran on an unfitted clf.
clf.save_model('xgb_model')

XGBoostError: need to call fit or load_model beforehand

### Evaluating the XGBoost classifier on the test embeddings

In [None]:
# Predicting labels for the test embeddings with the trained XGBoost classifier.
# The list of vectors is converted to a NumPy array first so that predict receives
# the same input type that clf.fit was given for training.
prediction_labels = clf.predict(np.array(test_data_embeddings))

In [1]:
from sklearn.metrics import classification_report
# classification_report returns one multi-line string; print it so the notebook shows
# the formatted table rather than the escaped repr of the string.
print(classification_report(test_data_labels, list(prediction_labels)))


                precision    recall  f1-score   support

           1       0.77      0.73      0.75     21674
           2       0.53      0.72      0.61     28604
          12       0.74      0.58      0.65     20714
          38       0.71      0.57      0.63     16511
          67       0.73      0.78      0.76     31741
          79       0.49      0.18      0.26     11070
         125       0.76      0.85      0.80     21342
         143       0.93      0.96      0.94     46593
         151       0.72      0.10      0.18       472
         191       0.83      0.56      0.67      1279

   micro avg       0.74      0.74      0.74    200000
   macro avg       0.72      0.60      0.63    200000
weighted avg       0.74      0.74      0.73    200000

