In [None]:
# data source
! git clone https://github.com/VinAIResearch/COVID19Tweet.git
! pip install simpletransformers

import numpy as np 
import pandas as pd
import json
import os
from sklearn.metrics import classification_report
from simpletransformers.classification import ClassificationModel
import pandas as pd
import logging


In [None]:
%%writefile apex.sh
# Installs NVIDIA apex for fp16 training: when a GPU is used, "fp16" should be
# set to true in the model args, and apex has to be installed for that.
# NOTE: %%writefile must be the very first line of the cell, so these notes are
# kept inside the generated script as shell comments.
export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [None]:
# Run apex.sh, the install script produced by the %%writefile cell, through sh.
!sh apex.sh

# loading task data

In [None]:
# Load the three task splits. train.tsv is read with its own header row, while the
# validation and test files are given explicit column names.
train_df = pd.read_csv("COVID19Tweet/train.tsv", sep='\t')
val_df = pd.read_csv("COVID19Tweet/valid.tsv", sep='\t', names=['Id', 'Text', 'Label'])
test_df = pd.read_csv("COVID19Tweet/unlabeled_test_with_noise.tsv", sep='\t', names=['Id', 'Text'])

# Binary target: 1 when the label is 'INFORMATIVE', 0 for anything else.
# It is stored as an `int_label` column because the training cell selects
# ['Text', 'int_label'] from both frames; without this column that lookup
# raises a KeyError on a fresh kernel.
train_df["int_label"] = (train_df["Label"] == "INFORMATIVE").astype(int)
val_df["int_label"] = (val_df["Label"] == "INFORMATIVE").astype(int)

# Raw text / label arrays for each split.
train_sentences = train_df.Text.values
train_labels = train_df.Label.values

val_sentences = val_df.Text.values
val_labels = val_df.Label.values

# The test split is loaded with only Id and Text columns, so there are no labels to extract.
test_sentences = test_df.Text.values

# Integer targets as numpy arrays (y_val feeds the classification report).
y_train = train_df["int_label"].to_numpy()
y_val = val_df["int_label"].to_numpy()

# simple transformers preparing and model args

In [None]:
# A classification model is created from a model type plus a pretrained model name.
# Other values left here by the author as alternatives:
#   model_type: 'albert'
#   model_name: "digitalepidemiologylab/covid-twitter-bert", "albert-base-v2"
model_type = "bert"
model_name = "bert-base-uncased"

# Options handed to simpletransformers, one per line for easier tuning.
args = {
    "reprocess_input_data": True,
    "learning_rate": 1e-5,
    "save_steps": 99_999_999_999,
    "overwrite_output_dir": True,
    "output_dir": "outputs/",
    "num_train_epochs": 10,
    "max_seq_length": 128,
    "fp16": True,
    "evaluate_during_training": True,
    "evaluate_during_training_verbose": True,
    "use_cached_eval_features": True,
}


In [None]:
# Keep the "transformers" logger at WARNING while the root logger is configured for INFO.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

# Build the two-label classifier from the model type, model name and args set in the config cell.
model = ClassificationModel(
    model_type,
    model_name,
    num_labels=2,
    args=args,
)

# model training

In [None]:
# Fine-tune on the training split; the validation split is passed as eval_df
# (the args dict enables evaluate_during_training).
train_data = train_df[['Text', "int_label"]]
eval_data = val_df[['Text', "int_label"]]
model.train_model(train_data, eval_df=eval_data, args=args)

# eval on validation set

In [None]:
# Score the validation texts. predict() is documented as taking a plain Python
# list of strings, so the pandas column is converted before the call.
predictions, raw_outputs = model.predict(val_df['Text'].tolist())

# Predicted class per row = index of the largest raw output.
preds_bert = np.argmax(raw_outputs, axis=1)

# classification_report returns a formatted string, hence print().
print(classification_report(y_val, preds_bert, digits=6))


# Saving BERT probability outputs to a CSV for later use in the ensemble

In [None]:
# Turn each row of model outputs into a (first, second) tuple; the name
# raw_outputs is rebound to that list, exactly as before.
raw_outputs = [(scores[0], scores[1]) for scores in raw_outputs]

# One-column frame holding this model's outputs. Column names the author left
# for the other models: "ct_bert_probs", "albert_probs".
val_results = pd.DataFrame({"bert_probs": raw_outputs})

