In [2]:
! pip install torch
! pip install transformers
! pip install tqdm
! pip install pandas
! pip install torchmetrics
! pip install -U sentence-transformers
! pip install wandb

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import tqdm
import pandas as pd
from torchmetrics.regression import PearsonCorrCoef
from sentence_transformers import SentenceTransformer, util, InputExample, losses, models, evaluation

### Loading the data

In [4]:
def load_data(file_path):
    data = pd.read_table(file_path)
    # check if any missing values
    print(data.isnull().sum())
    key = data.keys()
    # some values were missing in sentence2 column, so did the below (sentence1 didnt split properly)
    # iterate through the rows in dataframe which have missing values
    for index, row in data[data.isnull().any(axis=1)].iterrows():
        if pd.isnull(row[key[2]]):
            if(len(row[key[1]].split('\t')) > 2 or len(row[key[1]].split('\t')) < 2):
                data.drop(index, inplace=True)
                continue
            # split the sentence1 into words into 2 parts based on \t and assign to sentence1 and sentence2
            sentence1, sentence2 = row[key[1]].split('\t')
            score = row[key[0]]
            # assign to the row
            data.at[index, key[1]] = sentence1
            data.at[index, key[2]] = sentence2
            data.at[index, key[0]] = score
    #rescale every score in data from 0-5 to 0-1
    data[key[0]] = data[key[0]]/5
    return data

In [5]:
train_data = load_data('/kaggle/input/dataset1/train.csv')
valid_data = load_data('/kaggle/input/dataset1/dev.csv')

score        0
sentence1    0
sentence2    5
dtype: int64
score        0
sentence1    0
sentence2    2
dtype: int64


In [6]:
valid_data.tail(10)

Unnamed: 0,score,sentence1,sentence2
1460,0.4,New UN peacekeeping chief named for Central Af...,UN takes over peacekeeping in Central African ...
1461,1.0,Oil falls in Asian trade,Oil prices down in Asian trade
1462,0.6,Israeli forces detain Palestinian MP in Hebron,Israeli forces detain 2 Palestinians in overni...
1463,0.6,Israeli police clash with Palestinian proteste...,Israel Police Clash With Palestinians in Jerus...
1464,0.0,"3 killed, 4 injured in Los Angeles shootings",Five killed in Saudi Arabia shooting
1465,0.4,Scientists prove there is water on Mars,Has Nasa discovered water on Mars?
1466,0.0,Pranab stresses need to strive for peace by na...,WTO: India regrets action of developed nations
1467,0.4,Volkswagen skids into red in wake of pollution...,"Volkswagen's ""gesture of goodwill"" to diesel o..."
1468,0.0,Obama is right: Africa deserves better leadership,Obama waiting for midterm to name attorney gen...
1469,0.0,New video shows US police officers beating men...,New York police officer critically wounded in ...


In [7]:
print(train_data.shape)
print(train_data['score'].value_counts())

(5709, 3)
0.0000    367
0.8000    351
0.6000    308
1.0000    265
0.7600    263
         ... 
0.0134      1
0.1454      1
0.3400      1
0.3556      1
0.8660      1
Name: score, Length: 139, dtype: int64


### Defining device variable

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda:0


### Task 1A: using BERT to perform regression

### Creating a dataset class

In [9]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, data, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['sentence1'] + ' [SEP] ' + self.data.iloc[idx]['sentence2']
        inputs = self.tokenizer(text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors='pt')
        inputs['labels'] = torch.tensor(self.data.iloc[idx]['score'], dtype=torch.float)
        inputs = {key: inputs[key].squeeze() for key in inputs}
        return inputs

In [10]:
max_length = 50

In [11]:
# Load pre-trained model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
sep_token = '[SEP]'

# Load pre-trained model tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### Defining the dataset and dataloader class:

In [12]:
train_dataset = TextDataset(tokenizer, train_data, max_length)
valid_dataset = TextDataset(tokenizer, valid_data, max_length)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8, shuffle=False)

In [13]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
pearson = PearsonCorrCoef()

### wandb setup

In [14]:
import wandb
wandb.login(relogin=True)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [15]:
model_config = dict(
    task = 1,
    part = 'A',
    model_name = 'bert-base-uncased',
    max_length = 50,
    batch_size = 8,
    learning_rate = 1e-5,
    optimizer = 'Adam',
    criterion = 'MSELoss',
    epochs = 5
)

In [16]:
wandb.init(project='assignment-3', entity= 'nlp-assignments', config=model_config)

[34m[1mwandb[0m: Currently logged in as: [33msahil21091[0m ([33mnlp-assignments[0m). Use [1m`wandb login --relogin`[0m to force relogin


### Defining the model architecture:

In [17]:
class Model(torch.nn.Module):
    def __init__(self, model):
        super(Model, self).__init__()
        self.model = model

    def forward(self, input_ids, attention_mask, labels):
        outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss, outputs.logits

model = Model(model)
model.to(device)


Model(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

### Defining train and evaluation loops

In [18]:
def train(model, loader, optimizer, epochs=1, valid_loader=None):
    wandb.define_metric('epoch')
    wandb.define_metric('training_loss', step_metric='epoch')
    wandb.define_metric('validation_loss', step_metric='epoch')
    wandb.define_metric('Pearson Correlation', step_metric='epoch')
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm.tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            loss, preds = model(input_ids, attention_mask, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch: {epoch + 1}, Training loss: {total_loss / len(loader)}')
        epoch_log = {}
        if(valid_loader is not None):
            loss, corr = evaluate(model, valid_loader)
            epoch_log['validation_loss'] = loss
            epoch_log['Pearson Correlation'] = corr
        epoch_log['epoch'] = epoch
        epoch_log['training_loss'] = total_loss / len(loader)
        wandb.log(epoch_log)
    return total_loss / len(loader)

def evaluate(model, loader):
    model.eval()
    total_loss = 0
    predicted = torch.tensor([])
    all_labels = torch.tensor([])
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            loss, preds = model(input_ids, attention_mask, labels)
            total_loss += loss.item()
            for i in preds:
                i = i.cpu()
                predicted = torch.cat((predicted, i))
            for i in labels:
                i = i.cpu()
                i = torch.tensor([i])
                all_labels = torch.cat((all_labels, i))
    print(f'Validation loss: {total_loss / len(loader)}, Pearson correlation: {pearson(predicted, all_labels)}')
    return total_loss / len(loader), pearson(predicted, all_labels).item()

In [19]:
train(model, train_loader, optimizer, epochs=5, valid_loader=valid_loader)

100%|██████████| 714/714 [01:23<00:00,  8.57it/s]


Epoch: 1, Training loss: 0.04768054821940677


100%|██████████| 184/184 [00:06<00:00, 29.74it/s]


Validation loss: 0.02687793135081175, Pearson correlation: 0.8407338857650757


100%|██████████| 714/714 [01:16<00:00,  9.36it/s]


Epoch: 2, Training loss: 0.019748767115259205


100%|██████████| 184/184 [00:05<00:00, 31.46it/s]


Validation loss: 0.02523041725140976, Pearson correlation: 0.8524817824363708


100%|██████████| 714/714 [01:16<00:00,  9.33it/s]


Epoch: 3, Training loss: 0.01027514559610718


100%|██████████| 184/184 [00:05<00:00, 31.53it/s]


Validation loss: 0.024894531813713358, Pearson correlation: 0.8560506105422974


100%|██████████| 714/714 [01:15<00:00,  9.41it/s]


Epoch: 4, Training loss: 0.005286838914103368


100%|██████████| 184/184 [00:05<00:00, 30.81it/s]


Validation loss: 0.02519602989195846, Pearson correlation: 0.8613743782043457


100%|██████████| 714/714 [01:15<00:00,  9.42it/s]


Epoch: 5, Training loss: 0.0030093413181673063


100%|██████████| 184/184 [00:06<00:00, 29.93it/s]

Validation loss: 0.024523513141067942, Pearson correlation: 0.8615360856056213





0.0030093413181673063

In [20]:
wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Pearson Correlation,▁▅▆██
epoch,▁▃▅▆█
training_loss,█▄▂▁▁
validation_loss,█▃▂▃▁

0,1
Pearson Correlation,0.86154
epoch,4.0
training_loss,0.00301
validation_loss,0.02452


In [21]:
torch.save(model.state_dict(), '1A_model.pt')

In [None]:
# #load data from sample_demo.csv
# test_data = load_data('/kaggle/input/nlp-a3/sample_demo.csv')
# test_dataset = TextDataset(tokenizer, test_data, max_length)
# test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)

In [None]:
# model.load_state_dict(torch.load('1A_model.pt'))
# evaluate(model, test_loader)

### Part 1B

In [None]:
from sentence_transformers import SentenceTransformer, models
# Create the Sentence Transformer model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
class ValidationDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence1 = self.data.iloc[idx, 1]
        sentence2 = self.data.iloc[idx, 2]
        score = torch.tensor(self.data.iloc[idx, 0], dtype=torch.float32)
        return [sentence1, sentence2], score

train_dataset2 = ValidationDataset(train_data)
valid_dataset2 = ValidationDataset(valid_data)

train_loader2 = torch.utils.data.DataLoader(train_dataset2, batch_size=8, shuffle=True)
valid_loader2 = torch.utils.data.DataLoader(valid_dataset2, batch_size=8, shuffle=False)


In [None]:
class SentenceDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence1 = self.data.iloc[idx, 1]
        sentence2 = self.data.iloc[idx, 2]
        score = torch.tensor(self.data.iloc[idx, 0], dtype=torch.float32)
        return InputExample(texts=[sentence1, sentence2], label=score)

train_dataset = SentenceDataset(train_data)
valid_dataset = SentenceDataset(valid_data)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8, shuffle=False)

model.to(device)
loss = losses.CosineSimilarityLoss(model)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

def validation(model, valid_loader):
    model.eval()
    all_scores = []
    all_targets = []
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm.tqdm(valid_loader):
            sentences, targets = batch
            for i in range(len(targets)):
                sentence1_features = model.encode(sentences[0][i], convert_to_tensor=True).to(device)
                sentence2_features = model.encode(sentences[1][i], convert_to_tensor=True).to(device)
                score = util.pytorch_cos_sim(sentence1_features, sentence2_features)
                all_scores.append(score.item())
                all_targets.append(targets[i])
                loss = abs(targets[i] - score.item())**2
                total_loss += loss
    return pearson(torch.tensor(all_scores), torch.tensor(all_targets)), total_loss/len(all_targets)

output1b = validation(model, valid_loader2)
print(output1b[0].item())

100%|██████████| 184/184 [00:23<00:00,  7.78it/s]

0.7919643521308899





### Part 1C

In [None]:
def evaluation():
    wandb.login(relogin=True)
    model_config = dict(
        task = 1,
        part = 'C',
        model_name = 'distilbert-base-nli-mean-tokens',
        max_length = 50,
        batch_size = 8,
        learning_rate = 2e-5,
        optimizer = 'AdamW',
        criterion = 'CosineSimilarityLoss',
        epochs = 2
    )
    wandb.init(project='assignment-3', entity= 'nlp-assignments', config=model_config)
    wandb.define_metric('epoch')
    wandb.define_metric('training_loss', step_metric='epoch')
    wandb.define_metric('validation_loss', step_metric='epoch')
    wandb.define_metric('Pearson Correlation', step_metric='epoch')
    for i in range(2):
        model.fit(train_objectives=[(train_loader, loss)], epochs=1, optimizer_params={'lr': 2e-5}, optimizer_class=torch.optim.AdamW)
        train_pearson, train_loss = validation(model, train_loader2)
        valid_pearson, valid_loss = validation(model, valid_loader2)
        print(f'Epoch {i + 1} - Train Loss: {train_loss}, Train Pearson: {train_pearson}, Valid Loss: {valid_loss}, Valid Pearson: {valid_pearson}')
        wandb.log({'Pearson Correlation': valid_pearson.item(), 'training_loss': train_loss.item(), 'validation_loss': valid_loss.item(), 'epoch': i})
    torch.save(model.state_dict(), '1C_model.pt')
    wandb.finish()

In [None]:
evaluation()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize

wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/714 [00:00<?, ?it/s]

100%|██████████| 714/714 [01:32<00:00,  7.71it/s]

100%|██████████| 184/184 [00:23<00:00,  7.77it/s]

Epoch 1 - Train Loss: 0.0489729680120945, Train Pearson: 0.8006960153579712, Valid Loss: 0.047852773219347, Valid Pearson: 0.8112170100212097





Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/714 [00:00<?, ?it/s]

100%|██████████| 714/714 [01:31<00:00,  7.78it/s]

100%|██████████| 184/184 [00:23<00:00,  7.74it/s]


Epoch 2 - Train Loss: 0.03852643445134163, Train Pearson: 0.8171613812446594, Valid Loss: 0.03961557149887085, Valid Pearson: 0.818515419960022


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Pearson Correlation,▁█
epoch,▁█
training_loss,█▁
validation_loss,█▁

0,1
Pearson Correlation,0.81852
epoch,1.0
training_loss,0.03853
validation_loss,0.03962
