# Simple Transformers Model

- Documentation: https://simpletransformers.ai/docs/binary-classification/
- Model Types: https://simpletransformers.ai/docs/classification-specifics/#supported-model-types
- GitHub: https://github.com/ThilinaRajapakse/simpletransformers
- Tutorials:
    - https://towardsdatascience.com/simple-transformers-introducing-the-easiest-bert-roberta-xlnet-and-xlm-library-58bf8c59b2a3
    - https://medium.com/towards-artificial-intelligence/text-classification-with-simple-transformers-a29d13358135
    - https://towardsdatascience.com/battle-of-the-transformers-electra-bert-roberta-or-xlnet-40607e97aba3

In [None]:
# import packages
import numpy as np
import pandas as pd

# model training
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import classification_report
from scipy.special import softmax

# for display
from IPython.display import display, HTML

## Load Data

In [None]:
# Location and naming scheme of the pre-split CSV files:
# each split is read from <path><split><suffix>.
path, suffix = 'data/new_unagg/', '_newpreproc_unagg.csv'
train, val, test = 'train', 'val', 'test'

# Tag that is embedded in the saved-model and result file paths.
old_new = 'new_preproc_unagg'

# Column that holds the input text.
# change this accordingly: 'phrase', 'phrase_lemma', 'phrase_stem'
text_column = 'phrase_stem'

In [None]:
# create data dicts, both keyed '<split>_<text_column>':
#   data       -> full frames with the columns renamed to 'text' / 'labels'
#   data_train -> only the 'text' / 'labels' columns, with label -1 remapped to 2
data = {}
data_train = {}
data_names = [train, val, test]

for name in data_names:
    # read data
    df = pd.read_csv(path+name+suffix)
    # convert to int type
    df['label'] = df['label'].astype('int32')
    # rename columns - requirement of the simpletransformers package
    df = df.rename(columns={'label': 'labels', text_column: 'text'})
    # add to data dict
    key = f'{name}_{text_column}'
    data[key] = df
    # data for training - only 2 columns (explicit copy, so df itself is never modified)
    df_train = df[['text', 'labels']].copy()
    # convert -1 labels to 2 so the model recognises it
    # (vectorised cast + replace instead of two row-wise apply passes)
    df_train['labels'] = df_train['labels'].astype('int64').replace(-1, 2)
    data_train[key] = df_train

In [None]:
# Sanity check of every loaded frame: name, shape, label counts and first rows.
for df_name, df in data.items():
    print(df_name, df.shape, sep='\n')
    print(df['labels'].value_counts())
    display(df.head(n=3))
    print('\n')

In [None]:
# Keys of the individual splits and of the merged train+val split.
train_key, val_key = f'train_{text_column}', f'val_{text_column}'
merged_key = f'train_all_{text_column}'

# Merge train and val: the full frames first ...
data[merged_key] = pd.concat([data[train_key], data[val_key]])

# ... then the text/labels-only frames; `train_all` stays bound to this one.
train_all = pd.concat([data_train[train_key], data_train[val_key]])
data_train[merged_key] = train_all

In [None]:
# Display the merged train+val frame (text and labels only).
data_train[f'train_all_{text_column}']

In [None]:
# Label distribution of the merged train+val frame (after the -1 -> 2 remap).
data_train[f'train_all_{text_column}']['labels'].value_counts()

## Custom Model

In [None]:
# Directory the fine-tuned model is loaded from, and written to when training.
model_dir = f'saved_models/bert_{old_new}_{text_column}'

try:
    # load saved model
    model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5)
    model = ClassificationModel(model_type='bert',
                                model_name=model_dir,
                                args=model_args, use_cuda=False)
except Exception as load_err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit are not mistaken for "no saved model"
    # and do not silently trigger a full retraining run.
    print(f'Could not load a saved model from {model_dir!r} ({load_err!r}); training a new one.')
    # initialise model
    model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5,
                                    output_dir=model_dir)
    model = ClassificationModel(model_type='bert',
                                model_name='bert-base-uncased',
                                num_labels=3,
                                args=model_args, use_cuda=False)
    # train the model on the merged train+val data (labels 0, 1, 2)
    model.train_model(data_train['train_all_'+text_column])

# other model_type & model_name combinations
# bert & bert-base-uncased
# electra & google/electra-base-discriminator
# roberta & roberta-base
# distilbert & distilbert-base-cased

## Predict on Test Set

In [None]:
# predict on test set
# predict() is documented to take a python list of texts, so hand it a list
# instead of a pandas Series.
test_pred, test_raw_outputs = model.predict(data['test_'+text_column]['text'].tolist())

# append prediction to df
test_pred_df = data['test_'+text_column].copy()
test_pred_df['raw_pred'] = test_pred
# convert 2 back to -1 (vectorised instead of a row-wise apply)
test_pred_df['pred'] = test_pred_df['raw_pred'].replace(2, -1)

In [None]:
# Metrics on the test set as a dict, so they can be written to disk.
test_results = classification_report(test_pred_df.labels, test_pred_df.pred, output_dict=True)
# save to txt; the context manager closes the file even if the write fails
with open(f'model_results/bert/custom/{old_new}/test_{text_column}.txt', "w") as f:
    f.write(str(test_results))

# Human-readable version of the same report.
print(classification_report(test_pred_df.labels, test_pred_df.pred, digits=4))

In [None]:
# Distribution of the true labels in the test set.
test_pred_df['labels'].value_counts()

In [None]:
# Distribution of the predicted labels (with 2 already mapped back to -1).
test_pred_df['pred'].value_counts()

In [None]:
from scipy.special import softmax

# convert raw outputs to probabilities:
# softmax is applied along axis 1 of the raw outputs returned by model.predict
# (presumably one row per test example and one column per class - confirm)
probabilities = softmax(test_raw_outputs, axis=1)

In [None]:
# display the softmax probabilities computed from the raw test outputs
probabilities

# Prepare Predictions for Stacking

## Load Data

In [None]:
# Input location and file extension of the stacking data.
path, suffix = 'data/stacking_folds/', '.csv'  # DO NOT CHANGE THE PATH
text_column = 'phrase'  # DO NOT CHANGE THIS

# Files to load: the training file first, the file to predict on last.
# Per-fold alternative, kept for reference:
#   fold_num = 5
#   train_fold_names = [f'train{fold_num}', f'fold{fold_num}']  # DO NOT CHANGE THIS
train_fold_names = ['train_all', 'test']

In [None]:
# create data dicts, both keyed by file name:
#   data       -> full frames with the columns renamed to 'text' / 'labels'
#   data_train -> only the 'text' / 'labels' columns, with label -1 remapped to 2
data = {}
data_train = {}

for name in train_fold_names:
    # read data
    df = pd.read_csv(path+name+suffix)
    # convert to int type
    df['label'] = df['label'].astype('int32')
    # rename columns - requirement of the simpletransformers package
    df = df.rename(columns={'label': 'labels', text_column: 'text'})
    # add to data dict
    data[name] = df
    # data for training - only 2 columns (explicit copy, so df itself is never modified)
    df_train = df[['text', 'labels']].copy()
    # convert -1 labels to 2 so the model recognises it
    # (vectorised cast + replace instead of two row-wise apply passes)
    df_train['labels'] = df_train['labels'].astype('int64').replace(-1, 2)
    data_train[name] = df_train

In [None]:
# Sanity check of every loaded frame: name, shape, label counts and first rows.
for df_name, df in data.items():
    print(df_name, df.shape, sep='\n')
    print(df['labels'].value_counts())
    display(df.head(n=3))
    print('\n')

## Custom Model

In [None]:
# Directory the fine-tuned model is loaded from, and written to when training.
# It is named after the last entry of train_fold_names (the file predicted on).
model_dir = f'saved_models/bert_{train_fold_names[-1]}'

try:
    # load saved model
    model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5)
    model = ClassificationModel(model_type='bert',
                                model_name=model_dir,
                                args=model_args, use_cuda=False)
except Exception as load_err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit are not mistaken for "no saved model"
    # and do not silently trigger a full retraining run.
    print(f'Could not load a saved model from {model_dir!r} ({load_err!r}); training a new one.')
    # initialise model
    model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5,
                                    output_dir=model_dir)
    model = ClassificationModel(model_type='bert',
                                model_name='bert-base-uncased',
                                num_labels=3,
                                args=model_args, use_cuda=False)
    # train the model
    # per-fold alternative: model.train_model(data_train[f'train{fold_num}'])
    model.train_model(data_train['train_all'])

# other model_type & model_name combinations
# bert & bert-base-uncased
# electra & google/electra-base-discriminator
# roberta & roberta-base
# distilbert & distilbert-base-cased

## Predict on Fold

In [None]:
# predict on test set (train on train_all, test on test)
# per-fold alternative: model.predict(data[f'fold{fold_num}']['text'].tolist())
# predict() is documented to take a python list of texts, so hand it a list
# instead of a pandas Series.
fold_pred, fold_raw_outputs = model.predict(data['test']['text'].tolist())

# append prediction to df
# per-fold alternative: fold_pred_df = data[f'fold{fold_num}'].copy()
fold_pred_df = data['test'].copy()
fold_pred_df['raw_pred'] = fold_pred
# convert 2 back to -1 (vectorised instead of a row-wise apply)
fold_pred_df['pred'] = fold_pred_df['raw_pred'].replace(2, -1)

In [None]:
from scipy.special import softmax

# convert raw outputs to probabilities:
# softmax is applied along axis 1 of the raw outputs returned by model.predict
# (presumably one row per example and one column per class - confirm)
probabilities = softmax(fold_raw_outputs, axis=1)

In [None]:
# Attach the class probabilities used for stacking: column 1 is the positive
# class, column 2 is the negative class (label -1 was remapped to 2 for training).
pos_prob, neg_prob = probabilities[:, 1], probabilities[:, 2]
fold_pred_df['bert_prob_pos'] = pos_prob
fold_pred_df['bert_prob_neg'] = neg_prob

# Write the full frame, including the new probability columns.
# per-fold alternative: f'stacking_preds/bert/bert_fold{fold_num}_full.csv'
fold_pred_df.to_csv('stacking_preds/bert/bert_test_full.csv', index=False)

In [None]:
# Keep just the two probability columns and write them to their own file.
prob_columns = ['bert_prob_pos', 'bert_prob_neg']
fold_preds_only = fold_pred_df[prob_columns]
# per-fold alternative: f'stacking_preds/bert/bert_fold{fold_num}.csv'
fold_preds_only.to_csv('stacking_preds/bert/bert_test.csv', index=False)

In [None]:
# Per-class precision / recall / F1 for the predictions, to 4 decimal places.
print(classification_report(fold_pred_df['labels'], fold_pred_df['pred'], digits=4))

In [None]:
# Distribution of the true labels.
fold_pred_df['labels'].value_counts()

In [None]:
# Distribution of the predicted labels (with 2 already mapped back to -1).
fold_pred_df['pred'].value_counts()

In [None]:
# display the frame holding only the two BERT probability columns
fold_preds_only

# Train Final Model

In [None]:
# read data
df = pd.read_csv('data/stacking_folds/ALL_LABELLED_DATA.csv')
# convert to int type
df['label'] = df['label'].astype('int32')
# rename columns - requirement of the simpletransformers package
df = df.rename(columns={'label': 'labels', 'phrase': 'text'})

# data for training - only 2 columns (explicit copy, so df itself is never modified)
df_train = df[['text', 'labels']].copy()
# convert -1 labels to 2 so the model recognises it
# (vectorised cast + replace instead of two row-wise apply passes)
df_train['labels'] = df_train['labels'].astype('int64').replace(-1, 2)

In [None]:
# Size of the full labelled data set, followed by its first five rows.
print(df.shape)
df.head(n=5)

In [None]:
# Directory the final model is loaded from, and written to when training.
model_dir = 'saved_models/model_bert_final'

try:
    # load saved model
    model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5)
    model = ClassificationModel(model_type='bert',
                                model_name=model_dir,
                                args=model_args, use_cuda=False)
except Exception as load_err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit are not mistaken for "no saved model"
    # and do not silently trigger a full retraining run.
    print(f'Could not load a saved model from {model_dir!r} ({load_err!r}); training a new one.')
    # initialise model
    model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5,
                                    output_dir=model_dir)
    model = ClassificationModel(model_type='bert',
                                model_name='bert-base-uncased',
                                num_labels=3,
                                args=model_args, use_cuda=False)
    # train the model on all labelled data (labels 0, 1, 2)
    model.train_model(df_train)

In [None]:
# predict on train (for checking purposes)
# predict() is documented to take a python list of texts, so hand it a list
# instead of a pandas Series.
train_pred, train_raw_outputs = model.predict(df_train['text'].tolist())

# append prediction to df
train_pred_df = df.copy()
train_pred_df['raw_pred'] = train_pred
# convert 2 back to -1 (vectorised instead of a row-wise apply)
train_pred_df['pred'] = train_pred_df['raw_pred'].replace(2, -1)

print(classification_report(train_pred_df.labels, train_pred_df.pred, digits=4))