## Downloading Dataset And Importing Required Libraries

In [1]:
!wget https://github.com/google-research-datasets/gap-coreference/raw/master/gap-development.tsv -q
!wget https://github.com/google-research-datasets/gap-coreference/raw/master/gap-test.tsv -q
!wget https://github.com/google-research-datasets/gap-coreference/raw/master/gap-validation.tsv -q

In [2]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[?25l[K     |██▋                             | 10 kB 19.1 MB/s eta 0:00:01[K     |█████▎                          | 20 kB 20.7 MB/s eta 0:00:01[K     |████████                        | 30 kB 6.8 MB/s eta 0:00:01[K     |██████████▋                     | 40 kB 5.7 MB/s eta 0:00:01[K     |█████████████▎                  | 51 kB 2.8 MB/s eta 0:00:01[K     |███████████████▉                | 61 kB 3.3 MB/s eta 0:00:01[K     |██████████████████▌             | 71 kB 3.8 MB/s eta 0:00:01[K     |█████████████████████▏          | 81 kB 4.3 MB/s eta 0:00:01[K     |███████████████████████▉        | 92 kB 4.6 MB/s eta 0:00:01[K     |██████████████████████████▌     | 102 kB 5.0 MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112 kB 5.0 MB/s eta 0:00:01[K     |███████████████████████████████▊| 122 kB 5.0 MB/s eta 0:00:01[K     |████████████████████████████████| 

In [3]:
import time
import os
import random

import numpy as np
import pandas as pd

import torch
from torch.optim import Adam
from torch.utils.data import Dataset
from torch.nn import Module, Linear, Dropout
import torch.nn.functional as F
from pytorch_pretrained_bert.modeling import BertModel, BertLayer
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.optimization import WarmupLinearSchedule
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler

from sklearn.metrics import log_loss
import matplotlib.pyplot as plt

In [4]:
seed = 9876

# Seed every random number generator the notebook relies on, so that repeated
# runs shuffle the data and initialise the classifier identically.
random.seed(seed)
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# affects processes launched from this kernel, not the kernel itself.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
# CPU generator: drives DataLoader shuffling and the initialisation of layers
# that are created on the CPU before being moved to the device.
torch.manual_seed(seed)
# Seed every visible GPU, not just the current one (a no-op without CUDA).
torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic kernels and disable its auto-tuner, which could
# otherwise pick different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Initializing Model And Other Pertaining Values

In [5]:
# Model
bert_model = "bert-base-cased"
# Number of encoder layers handed to get_pretrained_bert().
n_bertlayers = 12
# Dropout probability applied in front of the classification layer.
dropout = 0.1

# Preprocessing
# The cased checkpoint is used, so the tokenizer must not lower-case the text.
do_lower_case = False

# Training
train_batch_size = 4
# Gradient accumulation: gradients of several consecutive mini-batches are summed
# before a single optimizer step, giving an effective batch size of
# train_batch_size * gradient_accumulation_steps.
gradient_accumulation_steps = 5
lr = 1e-5
num_train_epochs = 5
# Fraction of the total optimizer steps spent warming the learning rate up.
# Warm-up reduces the influence of the first training examples; it is used here
# because only 5 epochs can be run and early overfitting should be avoided.
warmup_proportion = 0.1
# Optimizer selector read by get_gap_model(); only "bertadam" is handled there.
optim = "bertadam"
# When True, get_gap_model() applies weight decay to the non-excluded parameters.
weight_decay = False


# Others
eval_batch_size = 32
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs (slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## Dataset Handling

In [6]:
def insert_tag(row):
    """Return the row's text with " [A] ", " [B] " and " [P] " markers inserted.

    Each marker is placed at the character offset of the corresponding mention
    ("A-offset", "B-offset", "Pronoun-offset") so that the positions of A, B and
    the pronoun can be recovered after tokenization.
    """
    markers = [
        (row["A-offset"], " [A] "),
        (row["B-offset"], " [B] "),
        (row["Pronoun-offset"], " [P] "),
    ]
    # Insert from the right-most offset to the left-most one so that earlier
    # insertions do not shift the offsets that are still to be processed.
    markers.sort(key=lambda pair: pair[0], reverse=True)
    tagged = row["Text"]
    for position, marker in markers:
        cut = int(position)
        tagged = "".join((tagged[:cut], marker, tagged[cut:]))
    return tagged

def tokenize(text, tokenizer):
    """Tokenize tagged text and locate the A, B and pronoun markers.

    Returns a pair ``(tokens, (a_pos, b_pos, p_pos))``. The marker tokens are
    removed from ``tokens``; each position is the index of the token that
    immediately follows the corresponding marker. A missing marker raises
    ``KeyError``.
    """
    markers = ("[A]", "[B]", "[P]")
    positions = {}
    kept = []
    for piece in tokenizer.tokenize(text):
        if piece not in markers:
            kept.append(piece)
        else:
            # The marker itself is dropped; remember where the mention starts.
            positions[piece] = len(kept)
    return kept, tuple(positions[marker] for marker in markers)


class GAPDataset(Dataset):
    """Dataset over GAP rows holding token ids, mention positions and labels.

    For every row of ``df`` the text is tagged, tokenized and converted to ids
    wrapped in ``[CLS]`` ... ``[SEP]``. When ``labeled`` is true, ``self.y``
    holds one boolean row per example with the columns (A, B, Neither).
    """

    def __init__(self, df, tokenizer, labeled=True):
        self.labeled = labeled
        if labeled:
            # "Neither" is true exactly when both coreference flags are false.
            targets = df[["A-coref", "B-coref"]].copy()
            targets["Neither"] = ~(df["A-coref"] | df["B-coref"])
            self.y = targets.values.astype("bool")
        # Token ids, positions of A/B/pronoun and sequence length per example.
        self.offsets, self.tokens = [], []
        self.seq_len = []
        for _, record in df.iterrows():
            pieces, positions = tokenize(insert_tag(record), tokenizer)
            self.offsets.append(positions)
            ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + pieces + ["[SEP]"])
            self.tokens.append(ids)
            self.seq_len.append(len(ids))

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        # Labeled datasets yield a third element with the boolean target row.
        item = (self.tokens[idx], self.offsets[idx])
        if self.labeled:
            item += (self.y[idx],)
        return item

    def get_seq_len(self):
        """Return the list of sequence lengths (including [CLS] and [SEP])."""
        return self.seq_len


def collate_examples(batch, truncate_len=500):
    """Turn a list of dataset items into padded tensors.

    Token id sequences are cut to ``truncate_len`` and right-padded with zeros
    to the longest sequence in the batch. Offsets are shifted by one to account
    for the leading [CLS] token. Returns ``(tokens, offsets, labels)`` where
    ``labels`` is ``None`` for unlabeled items and otherwise the index of the
    set entry of each one-hot target row.
    """
    columns = list(zip(*batch))
    sequences, spans = columns[0], columns[1]

    width = min(max(len(seq) for seq in sequences), truncate_len)
    padded = np.zeros((len(batch), width), dtype=np.int64)
    for row_idx, seq in enumerate(sequences):
        clipped = np.array(seq[:truncate_len])
        padded[row_idx, :len(clipped)] = clipped
    token_tensor = torch.from_numpy(padded)

    # +1 because the stored positions do not count the [CLS] token.
    offsets = torch.stack([torch.LongTensor(span) for span in spans], dim=0) + 1

    if len(columns) == 2:
        return token_tensor, offsets, None

    # Targets arrive one-hot encoded (e.g. 1,0,0 or 0,0,1); reduce to class ids.
    one_hot = torch.stack(
        [torch.from_numpy(target.astype("uint8")) for target in columns[2]], dim=0)
    labels = one_hot.max(dim=1)[1]
    return token_tensor, offsets, labels

## Model

In [7]:
def get_pretrained_bert(modelname, num_hidden_layers=None):
    """Load a pretrained BERT encoder, optionally keeping only its first layers.

    Args:
        modelname: name or path understood by ``BertModel.from_pretrained``.
        num_hidden_layers: when given and smaller than the number of encoder
            layers of the checkpoint, only the first ``num_hidden_layers``
            layers are kept. ``None``, non-positive values and values that are
            not smaller than the checkpoint depth leave the model unchanged.

    Returns:
        The loaded ``BertModel``.
    """
    bert = BertModel.from_pretrained(modelname)
    if num_hidden_layers is not None and 0 < num_hidden_layers < len(bert.encoder.layer):
        # The argument used to be ignored; honour it by truncating the encoder
        # stack and keeping the config in sync with the actual depth.
        bert.encoder.layer = bert.encoder.layer[:num_hidden_layers]
        bert.config.num_hidden_layers = num_hidden_layers
    return bert

class BertCl_GAP(Module):
    """BERT encoder followed by a linear classifier over the mention vectors.

    The hidden states at the ``n_offsets`` given positions (A, B and the
    pronoun) are concatenated and mapped to ``n_offsets`` logits.
    """

    def __init__(self, bert, dropout, n_offsets=3):
        super().__init__()
        self.bert = bert
        hidden_size = self.bert.config.hidden_size
        self.bert_hidden_size = hidden_size
        self.dropout = Dropout(dropout)
        self.classifier = Linear(hidden_size * n_offsets, n_offsets)

    def forward(self, token_tensor, offsets, label_id=None):
        """Return the logits for a batch; ``label_id`` is accepted but unused."""
        # Id 0 is the padding value, so it is masked out of the attention.
        attention_mask = (token_tensor > 0).long()
        sequence_output, _ = self.bert(
            token_tensor,
            attention_mask=attention_mask,
            token_type_ids=None,
            output_all_encoded_layers=False,
        )
        batch_size = sequence_output.size(0)
        hidden_size = sequence_output.size(2)
        # Pick the hidden vector at each offset and flatten them per example.
        gather_index = offsets.unsqueeze(2).expand(-1, -1, hidden_size)
        picked = sequence_output.gather(1, gather_index)
        flattened = picked.view(batch_size, -1)
        return self.classifier(self.dropout(flattened))


def run_epoch(model, dataloader, optimizer, criterion, device,verbose_step=10000):
    """Train ``model`` for one pass over ``dataloader`` and return the mean loss.

    Gradients are accumulated over ``gradient_accumulation_steps`` (module-level
    setting) mini-batches before each optimizer step. Every batch is a tuple
    whose last element holds the labels; the remaining elements are passed to
    the model positionally.

    Args:
        model: module to train; put into training mode here.
        dataloader: iterable of tensor tuples.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss module; with ``BCEWithLogitsLoss`` only the first logit
            is used and the labels are cast to float.
        device: device the batch tensors are moved to.
        verbose_step: print the running loss every ``verbose_step`` batches.

    Returns:
        The average (unscaled) loss per batch, or ``0.0`` for an empty loader.
    """
    model.train()
    # Start from clean gradient buffers so nothing left over from an earlier
    # call contributes to the first update.
    model.zero_grad()
    started = time.time()
    tr_loss = 0.0
    n_steps = 0
    for step, batch in enumerate(dataloader):
        batch = tuple(t.to(device) for t in batch)
        label_ids = batch[-1]
        outputs = model(*batch[:-1])
        if isinstance(criterion, torch.nn.BCEWithLogitsLoss):
            outputs = outputs[:, 0]
            label_ids = label_ids.float()
        loss = criterion(outputs, label_ids)
        if gradient_accumulation_steps > 1:
            # Scale so that the accumulated gradient is an average, not a sum.
            loss = loss / gradient_accumulation_steps
        loss.backward()
        tr_loss += loss.item()
        n_steps = step + 1
        if n_steps % verbose_step == 0:
            loss_now = gradient_accumulation_steps * tr_loss / n_steps
            print(f'step:{n_steps} loss:{loss_now:.7f} time:{time.time() - started:.1f}s')
        if n_steps % gradient_accumulation_steps == 0:
            optimizer.step()
            model.zero_grad()
    if n_steps == 0:
        # Nothing was processed; avoid referencing an unbound loop variable.
        return 0.0
    if n_steps % gradient_accumulation_steps != 0:
        # Apply the gradients of the trailing, incomplete accumulation group
        # instead of letting them leak into the next epoch.
        optimizer.step()
        model.zero_grad()
    return gradient_accumulation_steps * tr_loss / n_steps


def predict(model, data_loader, device):
    """Run ``model`` over ``data_loader`` and return probabilities as a numpy array.

    Only the first two elements of every batch are fed to the model. Logits
    with more than one column go through a softmax over dimension 1; a single
    column goes through a sigmoid.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for batch in data_loader:
            inputs = tuple(t.to(device) for t in batch[:2])
            collected.append(model(*inputs).detach().cpu())
    logits = collected[0] if len(collected) <= 1 else torch.cat(collected)
    if logits.size(-1) > 1:
        probabilities = F.softmax(logits, dim=1)
    else:
        probabilities = torch.sigmoid(logits)
    return probabilities.numpy()

In [8]:
def get_gap_model(bert_model, n_bertlayers, dropout,steps_per_epoch, device):
    """Build the GAP classifier and its optimizer.

    Reads the module-level settings ``optim``, ``weight_decay``, ``lr``,
    ``warmup_proportion``, ``gradient_accumulation_steps`` and
    ``num_train_epochs``.

    Args:
        bert_model: name of the pretrained BERT checkpoint.
        n_bertlayers: forwarded to ``get_pretrained_bert``.
        dropout: dropout probability of the classifier head.
        steps_per_epoch: number of mini-batches per epoch, used to size the
            optimizer's schedule.
        device: device the model is moved to.

    Returns:
        ``(model, optimizer)``.

    Raises:
        ValueError: if ``optim`` names an optimizer that is not supported.
    """
    # Fail early with a clear message instead of an UnboundLocalError on return.
    if optim != 'bertadam':
        raise ValueError(f"Unsupported optimizer {optim!r}; expected 'bertadam'")

    bert = get_pretrained_bert(bert_model, n_bertlayers)
    model = BertCl_GAP(bert, dropout)

    model.to(device)

    param_optimizer = list(model.named_parameters())

    if weight_decay:
        # Parameters whose name contains one of these fragments are excluded from
        # weight decay. "LayerNorm.weight" is listed next to the legacy
        # "gamma"/"beta" names so layer-norm scales are excluded as well.
        no_decay = ["bias", "gamma", "beta", "head", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             "weight_decay": 0.01},
            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0}
        ]
    else:
        optimizer_grouped_parameters = [
            {"params": [p for n, p in param_optimizer], "weight_decay": 0.0}
        ]

    # Total number of optimizer updates: one per accumulation group per epoch.
    t_total = int(
        steps_per_epoch / gradient_accumulation_steps * num_train_epochs)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=lr,
                         warmup=warmup_proportion,
                         t_total=t_total)
    return model, optimizer


def get_loader(train_df, val_df, test_df):
    """Tokenize the three splits and wrap them in DataLoaders.

    The training loader shuffles and drops the last incomplete batch; the
    validation and test loaders keep the original order and use the
    module-level ``eval_batch_size``. ``test_df`` is treated as unlabeled.

    Returns:
        ``(train_loader, val_loader, test_loader)``.
    """
    tokenizer = BertTokenizer.from_pretrained(
        bert_model,
        do_lower_case=do_lower_case,
        never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]", "[A]", "[B]", "[P]")
    )
    # The marker tokens are stripped by tokenize() before ids are looked up, so
    # their ids are never used and an arbitrary value is fine.
    # NOTE(review): presumably they are registered so that WordPiece keeps them
    # intact rather than mapping them to [UNK] - confirm.
    for tag in ("[A]", "[B]", "[P]"):
        tokenizer.vocab[tag] = -1

    train_ds = GAPDataset(train_df, tokenizer)
    val_ds = GAPDataset(val_df, tokenizer)
    test_ds = GAPDataset(test_df, tokenizer, labeled=False)

    train_loader = DataLoader(
        train_ds,
        collate_fn=collate_examples,
        batch_size=train_batch_size,
        shuffle=True,
        drop_last=True
    )
    # Evaluation loaders use the configured eval_batch_size rather than a
    # hard-coded literal, so the setting in the config cell actually applies.
    val_loader = DataLoader(
        val_ds,
        collate_fn=collate_examples,
        batch_size=eval_batch_size,
        shuffle=False
    )
    test_loader = DataLoader(
        test_ds,
        collate_fn=collate_examples,
        batch_size=eval_batch_size,
        shuffle=False
    )
    return train_loader, val_loader, test_loader

Loading DataLoaders

In [9]:
# The development split is used for training. (An earlier variant concatenated
# gap-test.tsv and gap-development.tsv for training instead.)
train_df = pd.read_csv("gap-development.tsv", sep="\t")

# Validation frame plus its targets as 0/1 columns: A, B and "None" (neither).
val_df = pd.read_csv("gap-validation.tsv", sep="\t")
val_flags = val_df[["A-coref", "B-coref"]].astype(int)
val_y = val_flags.assign(**{"None": 1 - val_flags.sum(axis=1)})

# The test frame is used unlabeled, so both gold columns are removed.
test_df = pd.read_csv("gap-test.tsv", sep="\t").drop(columns=["A-coref", "B-coref"])

print(f"Train:{train_df.shape[0]}, Valid:{val_df.shape[0]}, Test:{test_df.shape[0]}")

Train:2000, Valid:454, Test:2000


In [10]:
# Tokenize the three splits and wrap them in DataLoaders.
train_loader, val_loader, test_loader = get_loader(train_df, val_df, test_df)

100%|██████████| 213450/213450 [00:00<00:00, 2442000.98B/s]


In [11]:
# Number of training mini-batches per epoch; get_gap_model() uses it to compute
# the total number of optimizer updates.
steps_per_epoch = len(train_loader)

In [12]:
# One dict per epoch is appended here by the training loop.
scores = []
# Loss over the three classes; labels are class indices from collate_examples().
criterion = torch.nn.CrossEntropyLoss()

## Training

In [13]:
# Build the model and optimizer, then train for num_train_epochs epochs.
model, optimizer = get_gap_model(bert_model, n_bertlayers, dropout,
                                 steps_per_epoch, device)
print("Starting Training \n")
for e in range(num_train_epochs):
    t1 = time.time()
    # One pass over the training data; returns the mean training loss.
    tr_loss = run_epoch(model, train_loader, optimizer, criterion, device)
    # Class probabilities on the validation split, scored with multi-class log loss.
    val_pr = predict(model, val_loader, device)
    val_loss = log_loss(val_y, val_pr)
    elapsed = time.time() - t1
    print(f"Epoch:{e + 1} tr_loss:{tr_loss:.4f} val_loss:{val_loss:.4f}"
          f" Time:{elapsed:.1f}s")
    scores.append({"model_id": 1, "epoch": e + 1, "time": elapsed,
                   "tr_loss": tr_loss, "val_loss": val_loss})
# NOTE(review): predictions and the saved model come from the weights after the
# final epoch, not from the epoch with the lowest validation loss.
test_pr = predict(model, test_loader, device)
# Saves the whole module object (not just a state_dict) to model.pth.
torch.save(model, 'model.pth')

# del model, optimizer
# torch.cuda.empty_cache()


100%|██████████| 404400730/404400730 [00:27<00:00, 14842017.63B/s]


Starting Training 



	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1055.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Epoch:1 tr_loss:0.7243 val_loss:0.5159 Time:139.5s
Epoch:2 tr_loss:0.2874 val_loss:0.4340 Time:138.5s
Epoch:3 tr_loss:0.0928 val_loss:0.5244 Time:138.1s
Epoch:4 tr_loss:0.0468 val_loss:0.5455 Time:138.5s
Epoch:5 tr_loss:0.0308 val_loss:0.5606 Time:138.4s


In [14]:
# Reload the saved model and predict with it. The reloaded object is the one
# that gets evaluated, so this cell also works on a fresh kernel where the
# in-memory `model` from the training cell does not exist.
# map_location places the weights on the configured device.
model1 = torch.load('model.pth', map_location=device)
test_pr = predict(model1, test_loader, device)

## Generating Results

In [15]:
# Per-epoch training and validation loss collected during training.
df = pd.DataFrame(scores)

# Use the fully-qualified option name: the bare "precision" alias is deprecated
# and is ambiguous or rejected in newer pandas releases.
pd.set_option("display.precision", 5)
print("\nSingle model")
print(df.groupby("epoch")[['tr_loss', 'val_loss']].mean())


Single model
       tr_loss  val_loss
epoch                   
1      0.72429   0.51591
2      0.28744   0.43397
3      0.09281   0.52441
4      0.04677   0.54554
5      0.03082   0.56056


In [16]:
# Turn the predicted class probabilities into the submission table (one row per
# test example, IDs aligned on the row index) and write it to disk.
df_sub = pd.DataFrame(test_pr, columns=["A", "B", "NEITHER"]).assign(ID=test_df.ID)
df_sub.to_csv("submission.csv", index=False)


In [17]:
# Display the submission table (class probabilities plus example ID).
df_sub

Unnamed: 0,A,B,NEITHER,ID
0,0.00038,0.99711,0.00251,test-1
1,0.99862,0.00027,0.00111,test-2
2,0.99726,0.00037,0.00237,test-3
3,0.00214,0.99332,0.00454,test-4
4,0.99842,0.00019,0.00139,test-5
...,...,...,...,...
1995,0.99870,0.00013,0.00118,test-1996
1996,0.99711,0.00068,0.00221,test-1997
1997,0.97686,0.00038,0.02276,test-1998
1998,0.99390,0.00366,0.00244,test-1999


In [18]:
# Reload the test split with its gold A-coref / B-coref columns, which were
# dropped from test_df, so predictions can be scored.
test_data = pd.read_csv("gap-test.tsv", delimiter="\t")
test_data

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner
1,test-2,"Between the years 1979-1981, River won four lo...",him,430,Alonso,353,True,Alfredo Di St*fano,390,False,http://en.wikipedia.org/wiki/Norberto_Alonso
2,test-3,Though his emigration from the country has aff...,He,312,Ali Aladhadh,256,True,Saddam,295,False,http://en.wikipedia.org/wiki/Aladhadh
3,test-4,"At the trial, Pisciotta said: ``Those who have...",his,526,Alliata,377,False,Pisciotta,536,True,http://en.wikipedia.org/wiki/Gaspare_Pisciotta
4,test-5,It is about a pair of United States Navy shore...,his,406,Eddie,421,True,Rock Reilly,559,False,http://en.wikipedia.org/wiki/Chasers
...,...,...,...,...,...,...,...,...,...,...,...
1995,test-1996,"The sole exception was Wimbledon, where she pl...",She,479,Goolagong Cawley,400,True,Peggy Michel,432,False,http://en.wikipedia.org/wiki/Evonne_Goolagong_...
1996,test-1997,"According to news reports, both Moore and Fily...",her,338,Esther Sheryl Wood,263,True,Barbara Morgan,404,False,http://en.wikipedia.org/wiki/Hastings_Arthur_Wise
1997,test-1998,"In June 2009, due to the popularity of the Sab...",She,328,Kayla,364,True,Natasha Henstridge,412,False,http://en.wikipedia.org/wiki/Raya_Meddine
1998,test-1999,She was delivered to the Norwegian passenger s...,she,305,Irma,255,True,Bergen,274,False,http://en.wikipedia.org/wiki/SS_Irma_(1905)


In [19]:
# Gold label per test row, encoded as A=1, B=2, Neither=0.
a_is_ref = test_data["A-coref"].astype(bool).to_numpy()
b_is_ref = test_data["B-coref"].astype(bool).to_numpy()
# A row flagged for both A and B has no label in this encoding. Fail loudly
# rather than skipping it, which would shift every later label out of alignment
# with the predictions.
if (a_is_ref & b_is_ref).any():
    raise ValueError("A-coref and B-coref are both True for at least one row")
# Vectorized replacement for the row-by-row .loc loop; yields plain Python ints.
actual_vals = np.where(a_is_ref, 1, np.where(b_is_ref, 2, 0)).tolist()
actual_vals

[2,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 0,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 0,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 2,
 1,
 2,
 0,
 1,
 2,
 2,
 1,
 2,
 1,
 2,
 0,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 2,
 1,
 2,
 1,
 0,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 0,
 2,
 2,
 2,
 1,
 1,
 2,
 0,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 0,
 1,
 2,
 1,
 0,
 2,
 0,
 0,
 2,
 1,
 1,
 2,
 0,
 1,
 2,
 2,
 1,
 2,
 0,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 0,
 2,
 0,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 0,
 1,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 2,
 0,
 2,
 2,
 2,
 0,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 0,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 2,


In [20]:
# Predicted label per row: the class with the highest probability, encoded like
# the gold labels (A=1, B=2, Neither=0).
# argmax returns the first maximum, so ties resolve in the order A, B, NEITHER -
# the same precedence as the original if/elif chain - and every row gets exactly
# one label, which keeps the list aligned with actual_vals.
class_codes = np.array([1, 2, 0])  # label codes for the columns A, B, NEITHER
best_column = df_sub[["A", "B", "NEITHER"]].to_numpy().argmax(axis=1)
predicted_vals = class_codes[best_column].tolist()
print(len(predicted_vals))

2000


In [21]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [22]:
# Compare predicted_vals with actual_vals (labels: A=1, B=2, Neither=0).
# Precision, recall and F1 use average='weighted', i.e. the per-class score
# averaged with each class weighted by its number of true instances.
# accuracy: fraction of rows whose predicted label equals the gold label
accuracy = accuracy_score(actual_vals, predicted_vals)
print('Accuracy: %f' % accuracy)
# precision: per class tp / (tp + fp), then weighted average
precision = precision_score(actual_vals, predicted_vals, average='weighted')
print('Precision: %f' % precision)
# recall: per class tp / (tp + fn), then weighted average
recall = recall_score(actual_vals, predicted_vals, average='weighted')
print('Recall: %f' % recall)
# f1: per class 2 tp / (2 tp + fp + fn), then weighted average
f1 = f1_score(actual_vals, predicted_vals, average='weighted')
print('F1 score: %f' % f1)
 
# confusion matrix: rows are gold labels, columns are predicted labels,
# both in sorted label order (0, 1, 2)
matrix = confusion_matrix(actual_vals, predicted_vals)
print(matrix)

Accuracy: 0.830500
Precision: 0.830590
Recall: 0.830500
F1 score: 0.828175
[[133  40  54]
 [ 23 765 130]
 [ 20  72 763]]
