# Setting Up

In [None]:
# import packages
import numpy as np
import pandas as pd

In [None]:
# Load every CSV, keep only the text/label columns and cache each frame by file name.
path = "data/"

csvs = [
    "all_train.csv",
    "all_test.csv",
    "news_test.csv",
    "reddit_test.csv",
    "twitter_test.csv",
]

data = {}

for csv in csvs:
    # keep the two relevant columns; the simpletransformers package requires
    # the target column to be named 'labels'
    df = (
        pd.read_csv(path + csv, header=0)[['text', 'label']]
        .rename(columns={'label': 'labels'})
    )

    # sanity checks: dimensions, total number of missing cells, first rows
    n_missing = df.isnull().sum().sum()
    print("\nShape: ", df.shape)
    print("Check null: ", n_missing)
    display(df.head(3))

    # store the cleaned frame under its file name
    data[csv] = df

# Simple Transformers Model
- Documentation: https://simpletransformers.ai/docs/binary-classification/ 
- Model Types: https://simpletransformers.ai/docs/classification-specifics/#supported-model-types
- GitHub: https://github.com/ThilinaRajapakse/simpletransformers
- Tutorials: 
    - https://towardsdatascience.com/simple-transformers-introducing-the-easiest-bert-roberta-xlnet-and-xlm-library-58bf8c59b2a3
    - https://medium.com/towards-artificial-intelligence/text-classification-with-simple-transformers-a29d13358135
    - https://towardsdatascience.com/battle-of-the-transformers-electra-bert-roberta-or-xlnet-40607e97aba3

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import classification_report

In [None]:
# Model configuration: 2 training epochs with a learning rate of 5e-5.
model_args = ClassificationArgs(learning_rate=5e-5, num_train_epochs=2)

# Cased DistilBERT classifier with CUDA disabled.
model = ClassificationModel(
    model_type='distilbert',
    model_name='distilbert-base-cased',
    args=model_args,
    use_cuda=False,
)

# Other (model_type, model_name) combinations:
#   bert       -> bert-base-cased
#   bert       -> textattack/bert-base-uncased-yelp-polarity
#   electra    -> google/electra-base-discriminator
#   roberta    -> roberta-base
#   bert       -> bert-base-uncased
#   distilbert -> distilbert-base-cased

In [None]:
# Fit the classifier on the combined training set.
train_df = data['all_train.csv']
model.train_model(train_df)

In [None]:
# Evaluate the trained model on every test set and collect the precision,
# recall and F1 of class '1' into the `results` dataframe (one row per test set).
result_columns = ['test_set', 'precision', 'recall', 'f1']
rows = []

# evaluation on test sets (csvs[0] is the training set, so it is skipped)
for csv in csvs[1:]:
    test_df = data[csv]

    # alternative: let simpletransformers run its own evaluation
    # result, model_outputs, wrong_predictions = model.eval_model(test_df)

    # get predictions; predict() is documented to take a Python list of texts,
    # so the column is converted instead of passing the pandas Series directly
    pred, raw_outputs = model.predict(test_df['text'].tolist())

    # flip it for Yelp (0 negative and 1 positive)
    # pred_flipped = (pred - 1) * (-1)

    # classification report as a nested dict keyed by the class label as a string
    report = classification_report(y_true=test_df['labels'], y_pred=pred, output_dict=True)
    positive = report['1']

    row = {
        'test_set': csv,
        'precision': positive['precision'],
        'recall': positive['recall'],
        'f1': positive['f1-score'],
    }

    print(row)

    rows.append(row)

# DataFrame.append was removed in pandas 2.0, so the frame is built once from
# the collected rows; passing `columns` keeps the header even with no rows
results = pd.DataFrame(rows, columns=result_columns)

In [None]:
# Loading a saved model:
# - model_name is the path to a directory containing the saved model files
# - the same args as at model initialisation must be specified
# - use_cuda = False matches the initialisation above; with the default
#   (use_cuda = True) simpletransformers raises a ValueError when CUDA is unavailable
model = ClassificationModel(model_type = 'bert', model_name = 'bert/outputs/', \
                            args = model_args, use_cuda = False)