**Install packages**

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 7.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 69.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 65.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 6.8 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 53.9 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 65.9 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 28.1 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 56.9 MB/s 
Installing collected packages: urllib3,

**Pre-processing**

In [None]:
import pandas as pd

# Train = train data, Dev = test data
# Go through each line in each file and split by tab into columns

X_train = pd.read_csv('WIC/train/train.data.txt', sep = '\t', names = ['word', 'tense', 'index', 'sent 1', 'sent 2']) # variables
y_train = pd.read_csv('WIC/train/train.gold.txt', sep = '\t', names = ['labels']) # labels

X_test = pd.read_csv('WIC/dev/dev.data.txt', sep = '\t', names = ['word', 'tense', 'index', 'sent 1', 'sent 2']) # variables
y_test = pd.read_csv('WIC/dev/dev.gold.txt', sep = '\t', names = ['labels']) # labels

X_train

Unnamed: 0,word,tense,index,sent 1,sent 2
0,carry,V,2-1,You must carry your camping gear .,Sound carries well over water .
1,go,V,2-6,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?
2,break,V,0-2,Break an alibi .,The wholesaler broke the container loads into ...
3,cup,N,8-4,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .
4,academy,N,1-2,The Academy of Music .,The French Academy .
...,...,...,...,...,...
5423,krona,N,4-8,Piecas kronas — five krona .,Kronas kurss — the exchange rate of the krona .
5424,conflict,N,3-1,The harder the conflict the more glorious the ...,The conflict between the government and the re...
5425,answer,V,0-0,Answer the riddle .,Answer a question .
5426,play,V,0-0,Play the casinos in Trouville .,Play the races .


In [None]:
# Convert the index column to 2 index columns (1 for each sentence)
X_train = pd.concat([X_train, X_train['index'].str.split('-', expand=True)], axis=1)
X_test = pd.concat([X_test, X_test['index'].str.split('-', expand=True)], axis=1)

# Drop the index column
X_train = X_train.drop(columns=['index'])
X_test = X_test.drop(columns=['index'])

# Rename the last 2 columns to index1 and index2
X_train = X_train.rename(columns={0:'index1', 1: 'index2'})
X_test = X_test.rename(columns={0:'index1', 1: 'index2'})

X_train

Unnamed: 0,word,tense,sent 1,sent 2,index1,index2
0,carry,V,You must carry your camping gear .,Sound carries well over water .,2,1
1,go,V,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?,2,6
2,break,V,Break an alibi .,The wholesaler broke the container loads into ...,0,2
3,cup,N,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .,8,4
4,academy,N,The Academy of Music .,The French Academy .,1,2
...,...,...,...,...,...,...
5423,krona,N,Piecas kronas — five krona .,Kronas kurss — the exchange rate of the krona .,4,8
5424,conflict,N,The harder the conflict the more glorious the ...,The conflict between the government and the re...,3,1
5425,answer,V,Answer the riddle .,Answer a question .,0,0
5426,play,V,Play the casinos in Trouville .,Play the races .,0,0


In [None]:
# Make index1 and index2 numbers int type
X_train['index1'] = X_train['index1'].astype(int)
X_train['index2'] = X_train['index2'].astype(int)
X_test['index1'] = X_test['index1'].astype(int)
X_test['index2'] = X_test['index2'].astype(int)

X_train

Unnamed: 0,word,tense,sent 1,sent 2,index1,index2
0,carry,V,You must carry your camping gear .,Sound carries well over water .,2,1
1,go,V,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?,2,6
2,break,V,Break an alibi .,The wholesaler broke the container loads into ...,0,2
3,cup,N,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .,8,4
4,academy,N,The Academy of Music .,The French Academy .,1,2
...,...,...,...,...,...,...
5423,krona,N,Piecas kronas — five krona .,Kronas kurss — the exchange rate of the krona .,4,8
5424,conflict,N,The harder the conflict the more glorious the ...,The conflict between the government and the re...,3,1
5425,answer,V,Answer the riddle .,Answer a question .,0,0
5426,play,V,Play the casinos in Trouville .,Play the races .,0,0


In [None]:
# Replace T or F with 1 or 0 (labels)
y_train = y_train.replace('F', 0).replace('T', 1)
y_test = y_test.replace('F', 0).replace('T', 1)

# Concatenate the variables and the labels
X_train = pd.concat([X_train, y_train], axis = 1)
X_test = pd.concat([X_test, y_test], axis = 1)

X_train

Unnamed: 0,word,tense,sent 1,sent 2,index1,index2,labels
0,carry,V,You must carry your camping gear .,Sound carries well over water .,2,1,0
1,go,V,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?,2,6,0
2,break,V,Break an alibi .,The wholesaler broke the container loads into ...,0,2,0
3,cup,N,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .,8,4,1
4,academy,N,The Academy of Music .,The French Academy .,1,2,0
...,...,...,...,...,...,...,...
5423,krona,N,Piecas kronas — five krona .,Kronas kurss — the exchange rate of the krona .,4,8,1
5424,conflict,N,The harder the conflict the more glorious the ...,The conflict between the government and the re...,3,1,1
5425,answer,V,Answer the riddle .,Answer a question .,0,0,1
5426,play,V,Play the casinos in Trouville .,Play the races .,0,0,1


**Preparation**

In [None]:
from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
# tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(X_train)
test_dataset = Dataset.from_pandas(X_test)

In [None]:
def tokenize_function(example):
    return tokenizer(
        example['sent 1'], 
        example['sent 2'], 
        padding = 'max_length', 
        max_length = 50, 
        truncation = True
    )

In [None]:
train_dataset_tokenized = train_dataset.map(tokenize_function, batched = True)
test_dataset_tokenized = test_dataset.map(tokenize_function, batched = True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
train_dataset_tokenized = train_dataset_tokenized.remove_columns(['word', 'tense', 'sent 1', 'sent 2'])
test_dataset_tokenized = test_dataset_tokenized.remove_columns(['word', 'tense', 'sent 1', 'sent 2'])

In [None]:
train_dataset_tokenized

Dataset({
    features: ['index1', 'index2', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 5428
})

In [None]:
# [CLS] (101) + sentence 1 + [SEP] (102) + sentence 2 + [SEP] (102) + ...
# Max length is 50 tokens


train_dataset_tokenized['input_ids'][0]

[101,
 2017,
 2442,
 4287,
 2115,
 13215,
 6718,
 1012,
 102,
 2614,
 7883,
 2092,
 2058,
 2300,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

**Fine-tuning**

In [None]:
import torch
from torch import nn
from transformers import AutoModel


# Main class for the model
class WiCModel(nn.Module):


  def __init__(self):
    super(WiCModel, self).__init__()
    self.num_labels = 2 # True or False
    self.base_model = AutoModel.from_pretrained("bert-base-uncased", num_labels = 2) 
    self.lin1 = nn.Linear(2 * 768, self.num_labels)
    self.softmax = nn.Softmax()


  def forward(self, labels, input_ids, attention_mask, index1, index2):

    logits = self.base_model(
      input_ids = input_ids,
      attention_mask = attention_mask)
    
    toklist = []
    # clst = logits[1]
    logits = logits[0]

    for num, i in enumerate(input_ids):
      first = 0
      for j in range(50):
        if i[j] == 102:
          first = j
          break
        else:
          continue

      templist = []
      # templist.append(clst[num].tolist())
      templist.append(logits[num][index1[num]].tolist())
      templist.append(logits[num][first + index2[num]].tolist())

      toklist.append(templist)

    toklist = torch.tensor(toklist).cuda()
    toklist = toklist.view(-1, 2 * 768)

    outputs = self.lin1(toklist)
    outputs = self.softmax(outputs)

    loss_fct = nn.CrossEntropyLoss()
    loss = loss_fct(outputs.view(-1, self.num_labels), labels.view(-1))

    outputs = {
        'loss': loss,
        'outputs': outputs,
        'labels': labels
    }

    return outputs

# Set the model
model = WiCModel()

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from torch.utils.data import DataLoader

# Set the dataset formats to torch
train_dataset_tokenized.set_format('torch')
test_dataset_tokenized.set_format('torch')

train_dataloader = DataLoader(train_dataset_tokenized, shuffle = True, batch_size = 8) # train data
eval_dataloader = DataLoader(test_dataset_tokenized, batch_size = 8) # test data

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir = "./results", evaluation_strategy = "epoch")

In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr = 2e-5)

num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    name = "linear", 
    optimizer = optimizer, 
    num_warmup_steps = 0,
    num_training_steps = num_training_steps
)

device = torch.device("cuda") 
model.to(device)

WiCModel(
  (base_model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [None]:
import warnings
warnings.filterwarnings('ignore')

from tqdm.auto import tqdm
from datasets import load_metric

progress_bar = tqdm(range(num_training_steps))
metric = load_metric("accuracy")

for epoch in range(num_epochs):

    model.train()

    running_loss = 0.0
    count = 0

    for batch in train_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      loss = outputs['loss']
      loss.backward()

      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()
      progress_bar.update(1)

      running_loss += loss.item()
      count += 1

    model.eval()

    for batch in eval_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      with torch.no_grad():
          outputs = model(**batch)
      logits = outputs['outputs']
      predictions = torch.argmax(logits, dim = 1)
      metric.add_batch(predictions=predictions, references=batch["labels"])

    avg_loss = running_loss / count
    acc = metric.compute()['accuracy']
    print(f"Epoch: {epoch+1}", f"Loss: {avg_loss:.6f}", f"Accuracy: {acc:.6f}")

  0%|          | 0/6790 [00:00<?, ?it/s]

Epoch: 1 Loss: 0.691834 Accuracy: 0.520376
Epoch: 2 Loss: 0.686082 Accuracy: 0.543887
Epoch: 3 Loss: 0.681785 Accuracy: 0.570533
Epoch: 4 Loss: 0.678186 Accuracy: 0.575235
Epoch: 5 Loss: 0.676459 Accuracy: 0.581505
Epoch: 6 Loss: 0.674621 Accuracy: 0.579937
Epoch: 7 Loss: 0.673720 Accuracy: 0.578370
Epoch: 8 Loss: 0.672139 Accuracy: 0.575235
Epoch: 9 Loss: 0.671962 Accuracy: 0.578370
Epoch: 10 Loss: 0.671203 Accuracy: 0.579937
