## Task 1 (English) System \#3 - Pretrained BERT Model

### 1. Preprocessing

Read in data:

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import fast_bert
import logging
import warnings
import torch

warnings.filterwarnings("ignore")
np.random.seed(42)

train = pd.read_csv('drive/My Drive/Misc/CLEF/data/training.tsv', sep='\t', header=0).drop(['tweet_url', 'topic_id', 'claim'], axis=1)
dev = pd.read_csv('drive/My Drive/Misc/CLEF/data/dev.tsv', sep='\t', header=0).drop(['tweet_url', 'topic_id', 'claim'], axis=1)

train.insert(0, 'index', [x for x in range(len(train))])
dev.insert(0, 'index', [x for x in range(len(dev))])

train.head()

Unnamed: 0,index,tweet_id,tweet_text,claim_worthiness
0,0,1234964653014384644,Since this will never get reported by the medi...,1
1,1,1234869939720216578,"Thanks, #MichaelBloomberg. Here’s a handy litt...",0
2,2,1234873136304267267,"Folks, when you say ""The corona virus isn't a ...",0
3,3,1235071285027147776,Just 1 case of Corona Virus in India and peop...,1
4,4,1234911110861594624,President @realDonaldTrump made a commitment...,1


Format data for insertion into train/val .csv files for BERT model.

In [3]:
train_bert = train.drop(['tweet_id'], axis=1)
val_bert = dev.drop(['tweet_id'], axis=1)

train_bert.columns = ['index', 'text', 'label']
val_bert.columns = ['index', 'text', 'label']

train_bert['label'] = np.where(train_bert['label'] == 1, 'pos', 'neg')
val_bert['label'] = np.where(val_bert['label'] == 1, 'pos', 'neg')

train_bert.to_csv('drive/My Drive/Misc/CLEF/bert_data/train.csv', index=False)
val_bert.to_csv('drive/My Drive/Misc/CLEF/bert_data/val.csv', index=False)

train_bert.head()

Unnamed: 0,index,text,label
0,0,Since this will never get reported by the medi...,pos
1,1,"Thanks, #MichaelBloomberg. Here’s a handy litt...",neg
2,2,"Folks, when you say ""The corona virus isn't a ...",neg
3,3,Just 1 case of Corona Virus in India and peop...,pos
4,4,President @realDonaldTrump made a commitment...,pos


### 2. Modelling

Use fast-bert library to implement BERT-based classification model.

In [0]:
databunch = fast_bert.data_cls.BertDataBunch('drive/My Drive/Misc/CLEF/bert_data', 'drive/My Drive/Misc/CLEF/bert_data',
                                              tokenizer='bert-base-uncased',
                                              train_file='train.csv',
                                              val_file='val.csv',
                                              label_file='labels.csv',
                                              text_col='text',
                                              label_col='label',
                                              batch_size_per_gpu=64,
                                              max_seq_length=128,
                                              multi_gpu=False,
                                              multi_label=False,
                                              model_type='bert')

In [0]:
metrics = [{'name': 'accuracy', 'function': fast_bert.metrics.accuracy}]
logger = logging.getLogger()

learner = fast_bert.learner_cls.BertLearner.from_pretrained_model(databunch,
                                                                    pretrained_path='bert-base-uncased',
                                                                    metrics=metrics,
                                                                    device=torch.device('cuda'),
                                                                    logger=logger,
                                                                    output_dir='drive/My Drive/Misc/CLEF/bert_data',
                                                                    finetuned_wgts_path=None,
                                                                    warmup_steps=200,
                                                                    multi_gpu=False,
                                                                    is_fp16=False,
                                                                    multi_label=False,
                                                                    logging_steps=50)

In [12]:
learner.fit(epochs=2,
			lr=5e-3,
			validate=False, 	# Evaluate the model after each epoch
			schedule_type="warmup_cosine",
			optimizer_type="lamb")

learner.save_model()

### 3. Prediction

Using the trained model from above to predict check-worthiness.

In [0]:
from fast_bert.prediction import BertClassificationPredictor

predictor = BertClassificationPredictor(model_path='drive/My Drive/Misc/CLEF/bert_data/model_out',
                                        label_path='drive/My Drive/Misc/CLEF/bert_data', # location for labels.csv file
                                        multi_label=False,
                                        model_type='bert',
                                        do_lower_case=False)

preds = predictor.predict_batch(val_bert['text'].to_list())

In [0]:
preds_proba = []
for x in preds:
    if x[0][0] == 'pos':
        preds_proba.append(x[0][1])
    else:
        preds_proba.append(x[1][1])

In [0]:
results = pd.DataFrame(columns=['topic_id', 'tweet_id', 'score', 'run_id'])
results['tweet_id'] = dev['tweet_id']
results['score'] = preds_proba
results['topic_id'] = 'covid-19'
results['run_id'] = 'Model_3'

In [16]:
results

Unnamed: 0,topic_id,tweet_id,score,run_id
0,covid-19,1235714275752267776,0.764376,Model_3
1,covid-19,1235256530728972290,0.249506,Model_3
2,covid-19,1235648554338791427,0.011354,Model_3
3,covid-19,1235674258858061825,0.881780,Model_3
4,covid-19,1235663306246860800,0.552009,Model_3
...,...,...,...,...
145,covid-19,1235914080931766274,0.017861,Model_3
146,covid-19,1235770706765451264,0.866983,Model_3
147,covid-19,1235973416995315712,0.262141,Model_3
148,covid-19,1235675024738185239,0.695539,Model_3


In [0]:
results.to_csv('drive/My Drive/Misc/CLEF/golf_system_results_3.tsv', sep='\t', header=False, index=False)



```
python3 scorer/main.py --gold_file_path="./data/dev.tsv" --pred_file_path="./golf_system_results_3.tsv"
INFO : Started evaluating results for Task 5 ...
INFO : Reading gold predictions from file ./data/dev.tsv
INFO : Reading predicted ranking order from file ./golf_system_results_3.tsv
INFO : ======================================== RESULTS for golf_system_results_3.tsv =========================================
INFO : AVERAGE PRECISION:            0.7850    
INFO : ========================================================================================================================
INFO : RECIPROCAL RANK:              1.0000    
INFO : ========================================================================================================================
INFO : R-PRECISION (R=72):           0.7222    
INFO : ========================================================================================================================
INFO : PRECISION@N:                  @1        @3        @5        @10       @20       @50       
INFO :                               1.0000    1.0000    1.0000    0.9000    0.9000    0.7800    
```

