# Simple Transformers Model

- Documentation: https://simpletransformers.ai/docs/binary-classification/
- Model Types: https://simpletransformers.ai/docs/classification-specifics/#supported-model-types
- GitHub: https://github.com/ThilinaRajapakse/simpletransformers
- Tutorials:
    - https://towardsdatascience.com/simple-transformers-introducing-the-easiest-bert-roberta-xlnet-and-xlm-library-58bf8c59b2a3
    - https://medium.com/towards-artificial-intelligence/text-classification-with-simple-transformers-a29d13358135
    - https://towardsdatascience.com/battle-of-the-transformers-electra-bert-roberta-or-xlnet-40607e97aba3

In [None]:
# import packages
import numpy as np
import pandas as pd

# model training
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import classification_report
from scipy.special import softmax

# for display
from IPython.display import display, HTML

## Load Data

In [None]:
# folder with the labelled CSV files and the filename ending they share
path = 'data/labelled_data/'
suffix = '_newpreproc_emoticon.csv'
# names of the three data splits
train, val, test = 'train', 'val', 'test'

In [None]:
# column that holds the text fed to the model
# change this accordingly: 'phrase', 'phrase_lemma', 'phrase_stem'
text_column = 'phrase'
data_names = [train, val, test]

# one dataframe per split, stored under the key '<split>_<text_column>'
data = {}
for name in data_names:
    # each split lives in its own csv: <path><split><suffix>
    df = pd.read_csv(path + name + suffix)
    df['label'] = df['label'].astype('int32')
    # simpletransformers requires the columns to be named 'labels' and 'text'
    df = df.rename(columns={'label': 'labels', text_column: 'text'})
    data[f'{name}_{text_column}'] = df

In [None]:
# sanity check for every split: name, shape, class balance and first rows
for df_name, df in data.items():
    print(df_name, df.shape, sep='\n')
    print(df['labels'].value_counts())
    display(df.head(3))
    print('\n')

## Baseline: Yelp Polarity Dataset

In [None]:
# set up the baseline: a BERT-type model loaded from the
# 'textattack/bert-base-uncased-yelp-polarity' checkpoint, running without CUDA
model_baseline_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5)
model_baseline = ClassificationModel(
    model_type='bert',
    model_name='textattack/bert-base-uncased-yelp-polarity',
    args=model_baseline_args,
    use_cuda=False,
)

### Predict on Validation

- Goal: find the probability thresholds that best classify phrases into 3 categories: -1 (negative), 0 (neutral), 1 (positive)

In [None]:
# predict on val set
# predict() is documented to take a list of texts, so convert the column explicitly
val_pred, val_raw_outputs = model_baseline.predict(data['val_'+text_column].text.tolist())

# append prediction and output to df
val_pred_df = data['val_'+text_column].copy()
val_pred_df['raw_pred'] = val_pred
# assign whole columns at once: this is positional, so it does not rely on the
# dataframe having a 0..n-1 index, and it avoids one slow .loc write per row
val_logits = np.asarray(val_raw_outputs)
val_pred_df['raw_output_0'] = val_logits[:, 0]
val_pred_df['raw_output_1'] = val_logits[:, 1]

# get probabilities (note 0 means negative and 1 means positive)
val_prob = softmax(val_raw_outputs, axis=1)

# column 1 of the softmax output is the probability of positive sentiment
val_prob_positive = val_prob[:, 1]
val_pred_df['prob_pos'] = val_prob_positive

In [None]:
# maps the probability of positive sentiment to one of the three class labels
def label_from_prob(x, lower, upper):
    """Return the sentiment class for the positive-sentiment probability `x`.

    -1 (negative) when x is below `lower`, 0 (neutral) when x is at least
    `lower` but below `upper`, and 1 (positive) otherwise.
    """
    if x < lower:
        return -1
    return 0 if x < upper else 1

In [None]:
# grid search for best threshold
# candidate pairs are symmetric around 0.5: (0.05, 0.95), (0.1, 0.9), ..., (0.4, 0.6)
lower_lst = [round(i*0.05, 2) for i in range(1, 9)]
upper_lst = [round(1 - i, 2) for i in lower_lst]

# one dict of metrics per threshold pair; the dataframe is built once at the end
# because DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0
threshold_rows = []

for lower, upper in zip(lower_lst, upper_lst):
    df = val_pred_df.copy()
    # assign labels based on threshold definition
    df['pred'] = df['prob_pos'].apply(lambda p: label_from_prob(p, lower, upper))

    # classification report
    report = classification_report(df.labels, df.pred, output_dict=True)
    # retrieve metrics (per-class entries are keyed by the label as a string)
    threshold_rows.append({
        'lower_thresh': lower,
        'upper_thresh': upper,
        'accuracy': report['accuracy'],
        'weighted_ave_f1': report['weighted avg']['f1-score'],
        'f1_neg': report['-1']['f1-score'],
        'f1_zero': report['0']['f1-score'],
        'f1_pos': report['1']['f1-score'],
    })

# dataframe to store results; `columns` fixes the column order
val_threshold_results = pd.DataFrame(
    threshold_rows,
    columns=['lower_thresh', 'upper_thresh', 'accuracy',
             'weighted_ave_f1', 'f1_neg', 'f1_zero', 'f1_pos'],
)

In [None]:
# write the grid-search table to disk, then show it as the cell output
val_threshold_results.to_csv(
    path_or_buf=f'model_results/bert/baseline/val_threshold_{text_column}.csv'
)
val_threshold_results

### Apply Best Threshold on Test Set

In [None]:
# keep the row(s) with the highest weighted-average F1 on the validation set
best_f1 = val_threshold_results['weighted_ave_f1'].max()
best_row = val_threshold_results.loc[val_threshold_results['weighted_ave_f1'] == best_f1]
# take the first match by position: the filtered frame keeps its original index
# labels, so `[0]` would raise KeyError unless the best row happened to be row 0
best_lower = best_row['lower_thresh'].iloc[0]
best_upper = best_row['upper_thresh'].iloc[0]

In [None]:
# predict on test set
# predict() is documented to take a list of texts, so convert the column explicitly
test_pred, test_raw_outputs = model_baseline.predict(data['test_'+text_column].text.tolist())

# append prediction and output to df
test_pred_df = data['test_'+text_column].copy()
test_pred_df['raw_pred'] = test_pred
# assign whole columns at once: this is positional, so it does not rely on the
# dataframe having a 0..n-1 index, and it avoids one slow .loc write per row
test_logits = np.asarray(test_raw_outputs)
test_pred_df['raw_output_0'] = test_logits[:, 0]
test_pred_df['raw_output_1'] = test_logits[:, 1]

# get probabilities (note 0 means negative and 1 means positive)
test_prob = softmax(test_raw_outputs, axis=1)

# column 1 of the softmax output is the probability of positive sentiment
test_prob_positive = test_prob[:, 1]
test_pred_df['prob_pos'] = test_prob_positive

In [None]:
# label every test phrase using the thresholds selected on the validation set;
# only 'prob_pos' is needed, so apply over that column instead of over whole rows
test_pred_df['pred'] = test_pred_df['prob_pos'].apply(
    lambda p: label_from_prob(p, best_lower, best_upper)
)

In [None]:
test_results = classification_report(test_pred_df.labels, test_pred_df.pred, output_dict=True)
# save to txt; the context manager closes the file even if the write raises
with open(f'model_results/bert/baseline/test_{text_column}.txt', "w", encoding="utf-8") as f:
    f.write(str(test_results))

# human-readable version of the same report, with 4 decimal places
print(classification_report(test_pred_df.labels, test_pred_df.pred, digits=4))

In [None]:
# distribution of the true labels in the test set
test_pred_df['labels'].value_counts()

In [None]:
# distribution of the predicted labels in the test set
test_pred_df['pred'].value_counts()