# Setting Up

In [None]:
# import packages
import numpy as np
import pandas as pd

# optional text processing
from utils.text_processing import text_processing

# model training
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import classification_report
from scipy.special import softmax

## Loading Train-Test Data

In [None]:
# read the train/test csv files into a dict of dataframes keyed by file name
path = "data/train_test/"
csvs = ["all_train.csv", "all_test.csv", "news_test_entity.csv", "reddit_test_entity.csv", "twitter_test.csv"]
data = {}

# text_processing switches shared by every dataset
processing_options = dict(
    lower=False,
    remove_url=True,
    remove_punctuation=False,
    remove_stopwords=False,
    replace_entity=True,
    replace_hash=True,
    split_alphanumeric=False,
    lemmatize=False,
    stem=False,
)

for csv in csvs:
    # load the file, keeping only the relevant columns (copied so the
    # column assignment below works on an independent dataframe)
    df = pd.read_csv(path + csv, header=0)[['text', 'label']].copy()

    # processing text column (where necessary)
    df['text'] = df['text'].apply(
        lambda text: text_processing(text, **processing_options)
    )

    # 'labels' is the column name required by the simpletransformers package
    df = df.rename(columns={'label': 'labels'})

    # quick sanity checks: shape, total null count, first rows
    print("\nShape: ", df.shape)
    print("Check null: ", df.isnull().sum().sum())
    display(df.head(3))

    # add to data dict
    data[csv] = df

## Loading 2020 Data for Prediction

In [None]:
# read the 2020 csv files and reduce each one to a single 'text' column
# NOTE: this cell rebinds path, csvs and data, replacing the values set by
# the train-test loading cell
path = "data/2020/"

csvs = ["2020_conventional.csv", "2020_crypto.csv", "2020_reddit.csv", "2020_twitter.csv"]

data = {}

for csv in csvs:
    df = pd.read_csv(path + csv, header=0)
    # extract relevant columns and build the 'text' column per source
    if csv == "2020_reddit.csv":
        # reddit: only the title is used as text
        df = df[['title', 'excerpt']].copy()
        df['text'] = df['title']
    elif csv == "2020_twitter.csv":
        # twitter: the tweet itself is the text
        df = df[['tweet']].copy()
        df.columns = ['text']
    else:
        # news: title followed by excerpt, separated by a space so the last
        # word of the title is not fused with the first word of the excerpt
        df = df[['title', 'excerpt']].copy()
        df['text'] = df['title'] + ' ' + df['excerpt']

    # check shape of df
    print("\nShape: ", df.shape)
    # check null (a null title or excerpt also yields a null text above)
    print("Check null: ", df.isnull().sum().sum())
    # check head
    display(df.head(3))
    # add to data dict - only the text column is needed for prediction
    data[csv] = df[['text']]

# Simple Transformers Model
- Documentation: https://simpletransformers.ai/docs/binary-classification/ 
- Model Types: https://simpletransformers.ai/docs/classification-specifics/#supported-model-types
- GitHub: https://github.com/ThilinaRajapakse/simpletransformers
- Tutorials: 
    - https://towardsdatascience.com/simple-transformers-introducing-the-easiest-bert-roberta-xlnet-and-xlm-library-58bf8c59b2a3
    - https://medium.com/towards-artificial-intelligence/text-classification-with-simple-transformers-a29d13358135
    - https://towardsdatascience.com/battle-of-the-transformers-electra-bert-roberta-or-xlnet-40607e97aba3

In [None]:
# initialise Model
# training arguments: 2 epochs, learning rate 5e-5, output directory
# models/bert/outputs
model_args = ClassificationArgs(
    num_train_epochs=2,
    learning_rate=5e-5,
    output_dir='models/bert/outputs',
)
# roberta-base classifier with CUDA disabled
model = ClassificationModel(
    model_type='roberta',
    model_name='roberta-base',
    args=model_args,
    use_cuda=False,
)

# other model_type & model_name combinations
# bert & bert-base-cased
# bert & bert-base-uncased
# bert & textattack/bert-base-uncased-yelp-polarity
# electra & google/electra-base-discriminator
# roberta & roberta-base
# distilbert & distilbert-base-cased

In [None]:
# train the model on the combined training set
# (data must be the dict built by the train-test loading cell; the 2020
# loading cell rebinds data without an 'all_train.csv' entry)
train_df = data['all_train.csv']
model.train_model(train_df)

In [None]:
# loading saved models
# model_name: path to a directory containing the saved model files
# the args must be the same as those used at model initialisation
model_args = ClassificationArgs(
    num_train_epochs=2,
    learning_rate=5e-5,
)
model = ClassificationModel(
    model_type='roberta',
    model_name='models/bert/outputs_roberta_hash_entity_url/',
    args=model_args,
    use_cuda=False,
)

## Test Set Prediction

In [None]:
# evaluate the model on every test set (all csvs except the first, which is
# the training set), export per-row predictions and collect the metrics of
# class '1' into the `results` dataframe
# NOTE: expects path, csvs and data as set by the train-test loading cell
result_rows = []

# evaluation on test sets
for csv in csvs[1:]:
    test_df = data[csv]
    # alternative: evaluate the model directly
    # result, model_outputs, wrong_predictions = model.eval_model(test_df)

    # get predictions - predict() is documented to take a python list of str
    pred, raw_outputs = model.predict(test_df['text'].tolist())

    # append prediction (0 or 1), raw outputs and probability (prob) to the
    # original dataframe
    df = pd.read_csv(path + csv)
    df['pred'] = pred
    # whole-column assignment instead of a per-row .loc loop; cast to float
    # to keep the float64 columns the row-wise assignment produced
    logits = np.asarray(raw_outputs)
    df['raw_output_0'] = logits[:, 0].astype(float)
    df['raw_output_1'] = logits[:, 1].astype(float)

    # probability of class 1 = softmax over the two raw outputs
    probabilities = softmax(raw_outputs, axis=1)
    df['prob'] = [row[1] for row in probabilities]

    # export
    df.to_csv("data/predicted_bert/roberta_base_" + csv, index=False)

    # flip it for Yelp (0 negative and 1 positive)
    # pred_flipped = (pred - 1) * (-1)

    # classification report
    report = classification_report(y_true=test_df['labels'], y_pred=pred, output_dict=True)

    # keep only the metrics of class '1'
    row = {
        'test_set': csv,
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1': report['1']['f1-score'],
    }

    print(row)

    result_rows.append(row)

# dataframe to store results - built once from the collected rows because
# DataFrame.append was removed in pandas 2.0
results = pd.DataFrame(result_rows, columns=['test_set', 'precision', 'recall', 'f1'])

## 2020 Data Predictions

In [None]:
# predict on the 2020 data and export one csv per source with prediction,
# raw outputs, the text that was fed to the model, and the class-1 probability
# NOTE: expects path, csvs and data as set by the 2020 loading cell
for csv in csvs:
    test_df = data[csv]

    # get predictions - predict() is documented to take a python list of str
    pred, raw_outputs = model.predict(test_df['text'].tolist())

    # append prediction (0 or 1), raw outputs and probability (prob) to the
    # original dataframe
    df = pd.read_csv(path + csv)
    df['pred'] = pred
    # whole-column assignment instead of a per-row .loc loop; cast to float
    # to keep the float64 columns the row-wise assignment produced
    logits = np.asarray(raw_outputs)
    df['raw_output_0'] = logits[:, 0].astype(float)
    df['raw_output_1'] = logits[:, 1].astype(float)

    # probability of class 1 = softmax over the two raw outputs
    probabilities = softmax(raw_outputs, axis=1)
    # store the text the model actually saw (built in the loading cell)
    df['text'] = test_df['text']
    df['prob'] = [row[1] for row in probabilities]

    # export
    df.to_csv(path + "bert/2020_bert_" + csv, index=False)