In [None]:
# Mount Google Drive at /content/gdrive so the project folder can be reached from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# for Aleks
# Switch the working directory to the project folder; later cells use paths relative to it
# ('./fnc-1', the model output directories, 'roberta_answer/...').
%cd 'gdrive'/'MyDrive'/'4th Year'/'MSCI 598'/'NLP Class Project'

/content/gdrive/MyDrive/4th Year/MSCI 598/NLP Class Project


In [None]:
# for Parth
# %cd 'gdrive'/'MyDrive'/'NLP Class Project'

In [None]:
!pip -q install simpletransformers

[K     |████████████████████████████████| 249 kB 5.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 73.5 MB/s 
[K     |████████████████████████████████| 10.1 MB 61.3 MB/s 
[K     |████████████████████████████████| 6.5 MB 51.9 MB/s 
[K     |████████████████████████████████| 43 kB 2.4 MB/s 
[K     |████████████████████████████████| 4.0 MB 54.6 MB/s 
[K     |████████████████████████████████| 325 kB 76.1 MB/s 
[K     |████████████████████████████████| 1.8 MB 55.9 MB/s 
[K     |████████████████████████████████| 77 kB 7.9 MB/s 
[K     |████████████████████████████████| 596 kB 67.3 MB/s 
[K     |████████████████████████████████| 895 kB 39.0 MB/s 
[K     |████████████████████████████████| 144 kB 69.1 MB/s 
[K     |████████████████████████████████| 181 kB 74.3 MB/s 
[K     |████████████████████████████████| 63 kB 2.0 MB/s 
[K     |████████████████████████████████| 136 kB 63.9 MB/s 
[K     |████████████████████████████████| 212 kB 75.7 MB/s 
[K     |█████████████████████

In [None]:
import os
import csv
import pandas as pd
from tqdm import tqdm
import numpy as np

from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [None]:
import random

def get_bodies(path_bodies, is_test=False, seed=None):
  """Load article bodies from a CSV file and optionally split them 80/20.

  The file must start with a header row followed by ``Body ID,articleBody``
  rows. The split is made over bodies, so a body ends up in exactly one of the
  returned dictionaries.

  Args:
    path_bodies: Path of the bodies CSV file (read as UTF-8).
    is_test: When False (default) the bodies are shuffled and split into a
      training part (first 80%) and a validation part (remaining 20%).
      When True all bodies are returned together.
    seed: Optional seed for the shuffle. With the default ``None`` the global
      ``random`` state is used, so ``random.seed(...)`` in the caller still
      controls the split.

  Returns:
    ``(train_dict, val_dict)`` when ``is_test`` is False, otherwise
    ``(test_dict, {})``. Every dictionary maps the body ID, as a string such as
    ``'42'``, to the body text. String keys are kept on purpose because ``fnc``
    looks bodies up by ``str(body_id)``.
  """
  body_dict = {}
  with open(path_bodies, encoding='utf_8') as fb:  # Body ID,articleBody
    rows = csv.reader(fb)
    next(rows, None)  # skip the header row
    for line in tqdm(list(rows), ncols=80, leave=False):
      if len(line) < 2:
        continue  # blank or truncated row
      # int() validates the ID; str() gives the key format callers rely on.
      body_dict[str(int(line[0].strip()))] = line[1]

  # Shuffle plain Python pairs. Converting them to a NumPy array would pad every
  # cell to the length of the longest article and coerce the IDs to strings implicitly.
  items = list(body_dict.items())
  rng = random if seed is None else random.Random(seed)
  rng.shuffle(items)

  if is_test:
    return dict(items), {}

  cut = int(len(items) * 0.8)
  return dict(items[:cut]), dict(items[cut:])


def fnc(path_headlines, body_dict):
  """Pair every labelled headline with the text of its article body.

  The file must start with a header row followed by ``Headline,Body ID,Stance``
  rows. Rows are skipped when the stance is not one of agree / disagree /
  discuss / unrelated, or when the body ID is not present in ``body_dict``.

  Args:
    path_headlines: Path of the stances CSV file (read as UTF-8).
    body_dict: Mapping of body ID to body text. Keys may be strings (as
      returned by ``get_bodies``) or ints.

  Returns:
    Four parallel lists ``(headlines, bodies, labels, body_ids)`` where
    ``labels`` holds the stance encoded as 0..3 and ``body_ids`` holds ints.
  """
  stance_to_id = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}
  h, b, l, body_ids = [], [], [], []
  with open(path_headlines, encoding='utf_8') as fh:  # Headline,Body ID,Stance
    rows = csv.reader(fh)
    next(rows, None)  # skip the header row
    for line in tqdm(list(rows), ncols=80, leave=False):
      if len(line) < 3:
        continue  # blank or truncated row
      body_id = int(line[1].strip())
      label = line[2].strip()
      if label not in stance_to_id:
        continue
      # Accept both string-keyed and int-keyed dictionaries.
      if str(body_id) in body_dict:
        key = str(body_id)
      elif body_id in body_dict:
        key = body_id
      else:
        continue
      h.append(line[0])
      # Use the stripped label, the same value that was checked above.
      l.append(stance_to_id[label])
      b.append(body_dict[key])
      body_ids.append(body_id)
  return h, b, l, body_ids

# Directory containing the FNC-1 competition data
data_dir = './fnc-1'

# get_bodies shuffles with the global `random` state. Seed it so the body-level
# train/validation split is the same in every session; otherwise a checkpoint
# reloaded later would be validated on a different split, possibly containing
# bodies it was trained on.
random.seed(42)

train_bodies_dict, val_bodies_dict = get_bodies(os.path.join(data_dir, 'train_bodies.csv'))

# Training pairs: headlines whose body fell into the training part of the split
headlines, bodies, stances, _ = fnc(os.path.join(data_dir, 'train_stances.csv'), train_bodies_dict)
list_of_tuples = list(zip(headlines, bodies, stances))
train_df = pd.DataFrame(list_of_tuples, columns=['text_a', 'text_b', 'labels'])

# Validation pairs: same stances file, restricted to the held-out bodies
headlines, bodies, stances, _ = fnc(os.path.join(data_dir, 'train_stances.csv'), val_bodies_dict)
list_of_tuples = list(zip(headlines, bodies, stances))
val_df = pd.DataFrame(list_of_tuples, columns=['text_a', 'text_b', 'labels'])

# Gold labels as NumPy arrays for the metric cells
train_stances = train_df['labels'].to_numpy()
val_stances = val_df['labels'].to_numpy()

# Data augmentation: add a second copy of every training pair whose body is
# cut down to its first quarter.
# The truncation is applied to the column itself because rows yielded by
# DataFrame.iterrows() are temporary copies; assigning to them leaves the
# frame unchanged.
train_df_short = train_df.copy()
train_df_short['text_b'] = train_df_short['text_b'].map(lambda body: body[:len(body) // 4])

# ignore_index avoids duplicated index labels in the combined frame
train_df_combined = pd.concat([train_df, train_df_short], ignore_index=True)

# Competition test set: all bodies are kept (no split) and the body IDs are
# stored next to each pair in the 'Body ID' column.
test_bodies_dict, _ = get_bodies(
    os.path.join(data_dir, 'competition_test_bodies.csv'),
    is_test=True,
)
headlines, bodies, stances, test_body_ids = fnc(
    os.path.join(data_dir, 'competition_test_stances.csv'),
    test_bodies_dict,
)
list_of_tuples = list(zip(headlines, bodies, stances, test_body_ids))
test_df = pd.DataFrame(list_of_tuples, columns=['text_a', 'text_b', 'labels', 'Body ID'])

# Subset holding only the first 10000 test pairs
test_mini_df = test_df.iloc[:10000].copy()

# Gold labels as NumPy arrays
test_mini_stances = test_mini_df['labels'].to_numpy()
test_stances = test_df['labels'].to_numpy()



In [None]:
# Fine tune model
import torch
# Ask PyTorch to release its unused cached GPU memory before the model is built.
with torch.no_grad():
    torch.cuda.empty_cache()

# Change roberta to bert if training on that is desired    
# num_labels=4 matches the four stances encoded by fnc (agree, disagree, discuss, unrelated).
# NOTE(review): 'output_dir' is named "..._5_epoch_..." while 'num_train_epochs' is 2 — confirm
# which of the two is intended; the load cell below reads from this directory name.
model = ClassificationModel('roberta', 'roberta-base', num_labels=4, args={
    'learning_rate':1e-5,
    'num_train_epochs': 2,
    'reprocess_input_data': True,
    'overwrite_output_dir': False,
    'output_dir': 'out_roberta_5_epoch_short_25',
    'process_count': 10,
    'train_batch_size': 32,
    'eval_batch_size': 32,
    'max_seq_length': 512,
    'save_steps': -1,
    'save_model_every_epoch': True,
    'fp16': True,
    "no_cache": True
})

# Training on the augmented frame is disabled here; uncomment to fine-tune.
# model.train_model(train_df_combined)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/79122 [00:00<?, ?it/s]



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/19781 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/19781 [00:00<?, ?it/s]

(39562, 0.12097826311332284)

In [None]:
# Load model
# Rebinds `model` to the weights stored in 'out_roberta_5_epoch_short_25', the output_dir
# configured in the fine-tuning cell. Anything trained from here is written to a separate
# directory ('out_roberta_5_epoch_recovery_3'), and the batch sizes are 4 instead of 32.
model = ClassificationModel(
    "roberta", "out_roberta_5_epoch_short_25", num_labels=4, args={
    'learning_rate':1e-5,
    'num_train_epochs': 4,
    'reprocess_input_data': True,
    'overwrite_output_dir': False,
    'output_dir': 'out_roberta_5_epoch_recovery_3',
    'process_count': 10,
    'train_batch_size': 4,
    'eval_batch_size': 4,
    'max_seq_length': 512,
    'save_steps': -1,
    'save_model_every_epoch': True,
    'fp16': True,
    "no_cache": True
})

# Continued training on the un-augmented frame is disabled here; uncomment to resume.
# model.train_model(train_df)

In [None]:
# Evaluate on the competition test set; only the raw per-class model outputs
# (second element of the result) are used.
_, model_outputs_test, _ = model.eval_model(test_df)

# Predicted class = index of the largest output in each row
preds_test_test = np.asarray(model_outputs_test).argmax(axis=1)

# Same evaluation on the validation set:
# _, model_outputs_test, _ = model.eval_model(val_df)
# preds_test_val = np.argmax(model_outputs_test, axis=1)

# Same evaluation on the training set:
# _, model_outputs_test, _ = model.eval_model(train_df)
# preds_test_train = np.argmax(model_outputs_test, axis=1)

  0%|          | 0/25413 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/6354 [00:00<?, ?it/s]

In [None]:
# Choose which split the metric cells below report on by binding
# (target_df, target_labels, preds_test). Exactly one option should be active.

# Evaluate on val set
# target_df, target_labels, preds_test = val_df, val_stances, preds_test_val

# Evaluate on test set
target_df, target_labels, preds_test = test_df, test_stances, preds_test_test

# Evaluate on training set
# target_df, target_labels, preds_test = train_df, train_stances, preds_test_train

In [None]:
from sklearn.metrics import f1_score

def calculate_f1_scores(y_true, y_predicted):
    """Print the macro-averaged F1 and the F1 of each stance class as percentages.

    Args:
        y_true: Gold labels encoded as 0..3.
        y_predicted: Predicted labels encoded as 0..3.

    Returns:
        None. The five scores are written to stdout with three decimals.
    """
    macro = f1_score(y_true, y_predicted, average='macro')
    per_class = f1_score(y_true, y_predicted, average=None, labels=[0, 1, 2, 3])

    # One (name, score) entry per printed line, in output order
    report = [("macro", macro)]
    for position, stance in enumerate(("agree", "disagree", "discuss", "unrelated")):
        report.append((stance, per_class[position]))

    for title, value in report:
        print("F1 {}: {:.3f}".format(title, value * 100))

# Gold labels go first and predictions second, matching calculate_f1_scores(y_true, y_predicted).
calculate_f1_scores(target_labels, preds_test)

F1 macro: 78.168
F1 agree: 72.996
F1 disagree: 54.389
F1 discuss: 86.003
F1 unrelated: 99.284


In [None]:
# Integer class IDs. NOTE(review): not read by the functions in this cell, and the
# inference cell rebinds LABELS to the stance names.
LABELS = [0, 1, 2, 3]
# Stance name for each class ID, in ID order; used for the confusion-matrix headers.
LABELS_STRING = ["agree", "disagree", "discuss", "unrelated"]
# Class IDs that count as "related" in score_submission (everything except unrelated = 3).
RELATED = [0, 1, 2]

def print_confusion_matrix(cm):
    """Print a confusion matrix as a bordered text table followed by the accuracy.

    Args:
        cm: Sequence of four rows with four counts each; row i and column i
            are labelled with LABELS_STRING[i].

    The accuracy line is the sum of the diagonal divided by the sum of all
    cells, expressed as a percentage with three decimals.
    """
    row_template = "|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|"
    header = row_template.format('', *LABELS_STRING)
    rule = "-" * len(header)

    output = ['CONFUSION MATRIX: (actual vs predicted)', rule, header, rule]
    correct = 0
    seen = 0
    for position, counts in enumerate(cm):
        correct += counts[position]
        seen += sum(counts)
        output.extend([row_template.format(LABELS_STRING[position], *counts), rule])

    output.append("ACCURACY: {:.3f}".format((correct / seen) * 100) + "%")
    print('\n'.join(output))

def score_submission(predicted_labels, target):
    """Compute the weighted FNC score and the confusion matrix of a prediction.

    Every (prediction, target) pair contributes:
      * 0.25 when the prediction equals the target,
      * a further 0.50 when that matching label is not 3 (unrelated),
      * 0.25 when both labels are in RELATED.

    Args:
        predicted_labels: Iterable of predicted class IDs (0..3).
        target: Iterable of gold class IDs (0..3), aligned with the predictions.

    Returns:
        ``(score, cm)`` where ``cm[actual][predicted]`` counts the pairs, i.e.
        rows are gold labels and columns are predictions. This matches the
        "actual vs predicted" header printed by print_confusion_matrix.
    """
    score = 0.0
    cm = [[0] * 4 for _ in range(4)]
    for predicted, actual in zip(predicted_labels, target):
        if predicted == actual:
            score += 0.25
            if predicted != 3:
                score += 0.50
        if predicted in RELATED and actual in RELATED:
            score += 0.25

        # Row = gold label, column = prediction.
        cm[actual][predicted] += 1
    return score, cm

# Score of the predictions, plus the confusion matrix that is printed below.
fnc_score, cm_test = score_submission(preds_test, target_labels)
# Scoring the gold labels against themselves gives the highest attainable score.
best_fnc_score, _ = score_submission(target_labels, target_labels)
print("Score: ", fnc_score, "out of ", best_fnc_score)
# Achieved score as a percentage of the best attainable score.
print("\nRelative FNC Score: {:.3f}".format(100/best_fnc_score*fnc_score) + "% \n")
print_confusion_matrix(cm_test)

Score:  10498.0 out of  11651.25

Relative FNC Score: 90.102% 

CONFUSION MATRIX: (actual vs predicted)
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |   1430    |    189    |    381    |    15     |
-------------------------------------------------------------
| disagree  |    35     |    316    |    107    |     7     |
-------------------------------------------------------------
|  discuss  |    416    |    164    |   3868    |    83     |
-------------------------------------------------------------
| unrelated |    22     |    28     |    108    |   18244   |
-------------------------------------------------------------
ACCURACY: 93.881%


In [None]:
# File the inference cell writes the predicted stances to (relative to the working directory).
answer_path = 'roberta_answer/answer25.csv'

In [None]:
# Inference
# Stance name for each predicted class ID, in ID order
LABELS = ['agree', 'disagree', 'discuss', 'unrelated']

predicted_comp = [LABELS[int(a)] for a in preds_test_test]

# test_df is the frame the predictions were computed on and already carries the
# 'Body ID' column; `test_df_with_ids` is not defined anywhere in the visible notebook.
answer = pd.DataFrame({
    "Headline": test_df['text_a'],
    "Body ID": test_df['Body ID'],
    'Stance': predicted_comp,
})

# to_csv does not create missing folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(answer_path) or '.', exist_ok=True)
answer.to_csv(answer_path, index=False, encoding='utf-8') # From pandas library