In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import random
import torch
import pickle
# Seed every random number generator this notebook touches (Python, NumPy,
# PyTorch CPU and PyTorch CUDA) with one shared value so reruns are repeatable.
SEED = 1
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)

from smaberta import TransformerModel

### Loading Data

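Load the training data stored in CSV format using pandas and perform any preprocessing your task needs. For example, string labels such as Positive, Negative and Neutral would be encoded as 2, 0 and 1; the tutorial data shown below already uses integer class ids, so no mapping is applied here. Pretty much any input format is acceptable, as long as it yields some form of text with accompanying labels. Modify according to your task.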

In [5]:
train_df = pd.read_csv("./data/tutorial_train.csv")

# Optional label encoding, kept as a hint for datasets with string labels:
#   mapping = {"Positive": 2, "Negative": 0, "Neutral": 1}
#   temp = train_df.applymap(lambda s: mapping.get(s) if s in mapping else s)

texts = list(train_df["text"])
labels = list(train_df["label"])

# One [text, label] row per example; the resulting frame is what gets passed
# to model.train_model later in the notebook.
train_data = pd.DataFrame([[text, label] for text, label in zip(texts, labels)])

Loading test data

In [11]:
test_df = pd.read_csv("./data/tutorial_test.csv")

# If a label mapping is applied to the training data, apply the same one here:
#   test_temp = test_df.applymap(lambda s: mapping.get(s) if s in mapping else s)

# NOTE: from here on `texts` and `labels` refer to the test split; the accuracy
# and predict cells further down read these names.
texts = list(test_df["text"])
labels = list(test_df["label"])

# One [text, label] row per example, in the same layout as train_data.
test_data = pd.DataFrame([[text, label] for text, label in zip(texts, labels)])

Just to get an idea of what this dataset looks like

In [7]:
# Show the first five rows of the raw training frame.
train_df.head(n=5)

Unnamed: 0,text,label
0,"AIDS in prison, treatment costs overwhelm pris...",12
1,olympics security,19
2,police brutality,12
3,Iranian nuclear program; deal with European Un...,16
4,terror alert raised,16


In [8]:
# Preview the first five training texts and labels.
# Read them straight from train_df: the shared `texts` / `labels` names are
# reassigned by the test-loading cell, so on a top-to-bottom run they would
# hold test rows rather than the training rows shown by train_df.head().
print(train_df["text"].head(5).tolist(), train_df["label"].head(5).tolist())

['AIDS in prison, treatment costs overwhelm prison budgets', 'olympics security', 'police brutality', 'Iranian nuclear program; deal with European Union and its leaving of Iran free to develop plutonium.', 'terror alert raised'] [12, 19, 12, 16, 16]


### Learning Parameters
Alternatively, iterate over a range of these values to perform a grid search or random search with cross-validation.

In [9]:
# Hyper-parameters consumed by the model-construction cell below.
lr, epochs = 1e-5, 5

# Echo the chosen values so they are recorded in the notebook output.
for caption, value in (("Learning Rate ", lr), ("Train Epochs ", epochs)):
    print(caption, value)

Learning Rate  1e-05
Train Epochs  5


### Initialise model
1. The first argument selects the architecture, here RoBERTa (alternatives such as BERT, XLNet... are provided by Huggingface). It also determines which tokenizer and classification head are used.
2. The second argument names the pretrained checkpoint used as the initialisation point, as listed by Huggingface [here](https://huggingface.co/transformers/pretrained_models.html). Examples - roberta-base, roberta-large, gpt2-large...
3. Number of labels to initialise the classification head appropriately
4. Pass in training arguments as initialised, along with the output directory where the model is to be saved.

In [10]:
# Training arguments, including the directory the trained model is written to.
train_args = {
    'reprocess_input_data': True,
    "num_train_epochs": epochs,
    "learning_rate": lr,
    'output_dir': './saved_model/',
    'overwrite_output_dir': True,
    'fp16': False,
}

# RoBERTa architecture, initialised from the roberta-base checkpoint, with a
# 25-way classification head.
model = TransformerModel('roberta', 'roberta-base', num_labels=25, args=train_args)

### Run training

In [11]:
# Train on the [text, label] frame built in the train-loading cell. The recorded
# run below logs one "Starting Epoch" line per epoch and reports the model as
# saved to ./saved_model/ (the 'output_dir' given at construction).
model.train_model(train_data)

Starting Epoch:  0
Starting Epoch:  1
Starting Epoch:  2
Starting Epoch:  3
Starting Epoch:  4
Training of roberta model complete. Saved to ./saved_model/.


To see more in-depth logs, pass `show_running_loss=True` in the call to `train_model`.

### Inference from model

Either keep using the same model object, or reload the model from the directory it was previously saved to.

In [3]:
# Rebind `model` to an instance created with location="./saved_model/", the same
# path used as 'output_dir' during training. The architecture, checkpoint name
# and num_labels match the values used when the model was first constructed.
model = TransformerModel('roberta', 'roberta-base',  num_labels=25, location="./saved_model/")

### Evaluate on test set

Use the model's outputs to perform any analysis you need, before or after saving them.

In [6]:
# Evaluate on the held-out frame and unpack the three values eval_model returns.
eval_outcome = model.eval_model(test_data)
result, model_outputs, wrong_predictions = eval_outcome

# Predicted class per example: position of the largest value along axis 1.
preds = np.argmax(model_outputs, axis=1)

Eval Examples:  998
{'mcc': 0.5141422932729787}


In [7]:
# Sanity check: there should be exactly one prediction per test example.
tuple(len(collection) for collection in (test_data, preds))

(998, 998)

In [8]:
# Score the evaluation predictions against the gold test labels. `labels` is the
# list assigned in the test-loading cell, so it must line up one-to-one with
# `preds`.
gold_labels = np.asarray(labels)
predicted = np.asarray(preds)

# A shape mismatch (or an empty label list) means `labels` and `preds` do not
# describe the same examples, e.g. after running cells out of order. Fail loudly
# instead of printing a misleading number or dividing by zero.
if gold_labels.size == 0 or predicted.shape != gold_labels.shape:
    raise ValueError(
        f"Cannot compute accuracy: {len(preds)} predictions vs {len(labels)} labels"
    )

correct = int(np.sum(predicted == gold_labels))
accuracy = correct / len(gold_labels)
print("Accuracy: ", accuracy)

Accuracy:  0.5801603206412825


In [10]:
# Persist the raw model outputs for later analysis. The `with` block guarantees
# the file is flushed and closed even if pickling fails; the original passed a
# bare open(...) handle that was never closed.
with open("../model_outputs.pkl", "wb") as outputs_file:
    pickle.dump(model_outputs, outputs_file)

### Run inference 

Alternatively, just predict on a list of new text documents without loading them into a pandas DataFrame.

In [22]:
# Predict directly from a plain list of strings. `texts` is the list assigned in
# the test-loading cell, so these predictions line up with the test `labels`.
# This rebinds `preds` and `model_outputs` from the evaluation cell above.
preds, model_outputs = model.predict(texts)

In [23]:
# Score the model.predict(texts) output against the gold test labels. `labels`
# is the list assigned in the test-loading cell, so it must line up one-to-one
# with `preds`.
gold_labels = np.asarray(labels)
predicted = np.asarray(preds)

# A shape mismatch (or an empty label list) means `labels` and `preds` do not
# describe the same examples, e.g. after running cells out of order. Fail loudly
# instead of printing a misleading number or dividing by zero.
if gold_labels.size == 0 or predicted.shape != gold_labels.shape:
    raise ValueError(
        f"Cannot compute accuracy: {len(preds)} predictions vs {len(labels)} labels"
    )

correct = int(np.sum(predicted == gold_labels))
accuracy = correct / len(gold_labels)
print("Accuracy: ", accuracy)

Accuracy:  0.5801603206412825
