In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pickle
import csv
import time
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so that
# the later `.to(device)` call does not fail on a machine without CUDA.
# NOTE(review): the TrainingArguments cell sets fp16=True, which presumably still needs a GPU — confirm.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load in the data
# Column indices into each row of the CSV file.
EID = 0
YEAR = 1
KEYWORDS = 2
TITLE = 3
ABSTRACT = 4
IDR = 5
AF = 6

data_file = 'scopus_UK_author_idr'

UK_data = None
# newline='' lets the csv module handle line endings itself, so that quoted
# fields containing embedded newlines are read back as a single field.
with open('Datasets/{}.csv'.format(data_file), encoding='utf-8', newline='') as f:
    r = csv.reader(f)
    h = next(r)  # first row is read separately and is not part of UK_data
    UK_data = list(r)
    
def process_row(row):
    """Build the model input text for one CSV row.

    Joins four pieces with '. ': the TITLE column, the KEYWORDS column, the
    first 50 whitespace-separated tokens of the AF column (re-joined with
    single spaces), and the ABSTRACT column.
    """
    af_tokens = row[AF].split(maxsplit=50)
    af_text = ' '.join(af_tokens[:50])
    pieces = [row[TITLE], row[KEYWORDS], af_text, row[ABSTRACT]]
    return '. '.join(pieces)

In [None]:
# Experiment 1: train on full dataset
# X holds one processed text per row; Y holds the IDR column as a LongTensor.
X = list(map(process_row, UK_data))
idr_labels = [int(row[IDR]) for row in UK_data]
Y = torch.LongTensor(idr_labels)

model_tr_name = 'SciBert_full'

In [None]:
# Experiment 2: train on university
uni = 'Oxford'  # 'Oxford', 'Cambridge', 'UCL'
eids_path = 'Datasets/{}_eids.csv'.format(uni)
with open(eids_path, 'r') as f:
    r = csv.reader(f)
    h = next(r)  # first row is read separately and excluded from the id set
    eids = set()
    for eid_row in r:
        eids.add(int(eid_row[0]))

# Keep only the rows whose EID appears in the selected university's id set.
uni_rows = [row for row in UK_data if int(row[EID]) in eids]
X = [process_row(row) for row in uni_rows]
Y = [int(row[IDR]) for row in uni_rows]

model_tr_name = 'SciBert_{}'.format(uni)

In [None]:
# Experiment 3: train on time period
ranges = [(2011, 2015), (2016, 2018), (2019, 2023)]
rng = [2]  # indices into `ranges`; the period runs from the first one's start to the last one's end
first_year = ranges[rng[0]][0]
last_year = ranges[rng[-1]][-1]

# Keep only the rows whose YEAR falls inside the inclusive [first_year, last_year] span.
period_rows = [row for row in UK_data if first_year <= int(row[YEAR]) <= last_year]
X = [process_row(row) for row in period_rows]
Y = [int(row[IDR]) for row in period_rows]

model_tr_name = 'SciBert_{}-{}'.format(first_year, last_year)

In [None]:
# Where the fine-tuned model for the selected experiment is stored.
current_model_path = 'Models/{}'.format(model_tr_name)
base_checkpoint = 'allenai/scibert_scivocab_uncased'

# Load from finetuned
# model = AutoModelForSequenceClassification.from_pretrained(current_model_path, num_labels=2).to(device)

# Load from checkpoint
# model = AutoModelForSequenceClassification.from_pretrained('Training/checkpoint-', num_labels=2).to(device)

# Load from base
model = AutoModelForSequenceClassification.from_pretrained(base_checkpoint, num_labels=2).to(device)


# Freeze the pretrained BERT layers: every parameter whose name does not
# contain 'classifier' stops receiving gradients.
for param_name, weights in model.named_parameters():
    if 'classifier' in param_name:
        continue
    weights.requires_grad = False

tokeniser = AutoTokenizer.from_pretrained(base_checkpoint)

In [None]:
# Tokenise the training set and create the data loaders
# NOTE(review): `undersample_ratio` and `undersample` are not defined in the cells shown here;
# they must already exist in the kernel (e.g. undersample_ratio = None) before this cell runs.
# The cache file name depends on the experiment name and, when set, the undersampling ratio.
enc_cache_path = 'Embeddings/{}{}_txt_enc'.format(
    model_tr_name, '_1-{}'.format(undersample_ratio) if undersample_ratio else '')

try:
    # SECURITY: pickle.load can execute arbitrary code; only read cache files this notebook wrote.
    with open(enc_cache_path, 'rb') as f:
        print('Reading encodings... ', end='')
        tr_enc, t_enc, tr_Y, t_Y = pickle.load(f)
    print('Done')

except (OSError, EOFError, pickle.UnpicklingError):
    # Only a missing or unreadable cache triggers a rebuild; other errors
    # (undefined names, Ctrl-C, ...) now surface instead of being swallowed.
    if undersample_ratio:
        X, Y = undersample(undersample_ratio, X, Y)
        print('New ratio:', len(Y)/sum(Y), len(Y))
    print('Building encodings... ', end='')
    tr_X, t_X, tr_Y, t_Y = train_test_split(X, Y, test_size=0.1, random_state=21)
    lengths_sorted = np.argsort([len(x.split()) for x in tr_X])  # Sort data by length for efficient batch padding during training
    tr_X = np.take(tr_X, lengths_sorted).tolist()
    tr_Y = np.take(tr_Y, lengths_sorted).tolist()
    tr_enc = tokeniser(tr_X, truncation=True, max_length=512)
    t_enc  = tokeniser(t_X, truncation=True, max_length=512)
    # Store the encodings and labels so the next run can skip tokenisation.
    with open(enc_cache_path, 'wb') as f:
        pickle.dump((tr_enc, t_enc, tr_Y, t_Y), f)
    print('Done')

class IDRDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokeniser encodings with their labels.

    `encodings` is a mapping from field name to a per-sample sequence, and
    `labels` is a sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors: every encoding field plus 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap the train/test encodings with their labels. Despite the names, these are
# IDRDataset objects rather than DataLoaders.
training_loader, test_loader = IDRDataset(tr_enc, tr_Y), IDRDataset(t_enc, t_Y)

In [None]:
# Training configuration, collected as keyword arguments and handed to TrainingArguments.
# NOTE(review): eval_steps and save_steps presumably only take effect with a 'steps'
# strategy, while both strategies here are 'epoch' — confirm against the TrainingArguments docs.
training_options = dict(
    output_dir='Training',
    overwrite_output_dir=True,
    num_train_epochs=6,
    fp16=True,
    per_device_train_batch_size=64,  # <4GB of VRAM
    per_device_eval_batch_size=64,
    warmup_steps=50,
    weight_decay=0.005,
    evaluation_strategy='epoch',
    eval_steps=800,
    save_strategy='epoch',
    save_steps=len(X)//4,
    save_total_limit=1,
)
tr_args = TrainingArguments(**training_options)

In [None]:
# Mean of the training labels (the share of class 1 for 0/1 labels), clamped to
# [0.001, 0.999] so that neither class weight below divides by zero when one
# class is missing from the training split.
r = min(max(sum(tr_Y) / len(tr_Y), 0.001), 0.999)  # Class imbalance ratio
# Weighted CE using sklearn's class imbalance formula: n_samples / (n_classes * n_samples_in_class)
class_weights = torch.tensor([1 / ((1 - r) * 2), 1 / (r * 2)], dtype=torch.float)
weighted_CE = nn.CrossEntropyLoss(weight=class_weights.to(model.device))
class WeightedTrainer(Trainer):
    """Trainer whose loss is the class-weighted cross-entropy `weighted_CE`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch dict; must contain 'labels' and is passed to the model as keyword arguments.
            return_outputs: when True, return `(loss, outputs)` instead of just the loss.
            num_items_in_batch: accepted and ignored; newer transformers releases
                pass this argument to `compute_loss`, and omitting it raises a TypeError there.

        Returns:
            The loss tensor, or a `(loss, outputs)` tuple when `return_outputs` is True.
        """
        labels = inputs['labels']
        outputs = model(**inputs)
        logits = outputs['logits']
        # Flatten to (N, 2) logits and (N,) labels; the 2 matches num_labels=2 and the two class weights.
        loss = weighted_CE(logits.view(-1, 2), labels.view(-1))

        return (loss, outputs) if return_outputs else loss


# Assemble the trainer from the model, training arguments, tokeniser and the two datasets.
trainer_options = {
    'model': model,
    'args': tr_args,
    'tokenizer': tokeniser,
    'train_dataset': training_loader,
    'eval_dataset': test_loader,
}
trainer = WeightedTrainer(**trainer_options)

In [None]:
# Run fine-tuning with the configuration held in tr_args.
trainer.train()

# Save the trained model to current_model_path ('Models/<model_tr_name>').
trainer.save_model(current_model_path)