In [None]:
import torch
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
import transformers
from torch.utils.data import Dataset, DataLoader
#from transformers import *
import torch.optim as optim
# from tqdm import tqdm
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import scipy
from scipy.stats import pearsonr
from matplotlib import pyplot as plt

In [None]:
import os

import wandb

# SECURITY: never hardcode the W&B API key in the notebook -- anything that is
# shared or committed leaks the credential. Provide it through the
# WANDB_API_KEY environment variable instead (on Kaggle, load it from the
# notebook's secret store and export it before running this cell).
# Any key that was previously committed here should be revoked and rotated.
wandb_api_key = os.environ.get("WANDB_API_KEY")

# With key=None, wandb falls back to its own credential lookup
# (environment / netrc / interactive prompt).
wandb.login(key=wandb_api_key)

# Start the run that the training and evaluation cells log into.
wandb.init(project="nlp3-1a")

# Now you can use wandb for logging during training

In [None]:
# Fix PyTorch's RNG so weight initialisation is repeatable across runs.
torch.manual_seed(0)

# Tokenizer matching the 'bert-base-uncased' checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Location of the task-1 data files inside the Kaggle input mount.
_DATA_DIR = "/kaggle/input/assignment-3/A3_task1_data_files"
train_path = _DATA_DIR + "/train.csv"
val_path = _DATA_DIR + "/dev.csv"

In [None]:
class TextPairDataset(Dataset):
    """Map-style dataset over a DataFrame of (label, text_a, text_b) rows.

    Column 0 is read as the label and columns 1-2 as the two texts that are
    tokenized together as a pair. Each item is the tuple
    ``(input_ids, attention_mask, label)``; any other tokenizer outputs
    (e.g. token type ids) are discarded.
    """

    def __init__(self, data, tokenizer):
        # data: DataFrame indexed positionally; tokenizer: callable that
        # accepts two texts plus HuggingFace-style keyword arguments.
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        texts = self.data.iloc[idx, 1:3].values.tolist()
        first_text, second_text = texts[0], texts[1]

        # Pad/truncate every pair to the tokenizer's maximum length so that
        # the default collate function can stack items into a batch.
        encoding = self.tokenizer(
            first_text,
            second_text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The label keeps whatever dtype the DataFrame cell carries.
        target = torch.tensor(self.data.iloc[idx, 0])

        # return_tensors='pt' adds a leading batch axis of size 1; drop it.
        return (
            encoding['input_ids'].squeeze(0),
            encoding['attention_mask'].squeeze(0),
            target,
        )

In [None]:
# Run configuration shared by the data loaders and the training loop.
BATCH_SIZE = 16
NUM_EPOCHS = 10

# Use the GPU whenever PyTorch can see one, otherwise run on the CPU.
DEVICE = 'cpu' if not torch.cuda.is_available() else 'cuda'

In [None]:
def dataloaderbuilder(filepath, batch_size, shuffle=False):
    """Build a DataLoader over the tab-separated sentence-pair file at ``filepath``.

    Rows containing any missing value are dropped before the frame is wrapped
    in a ``TextPairDataset`` that uses the notebook-level ``tokenizer``.

    Args:
        filepath: Path to a tab-separated file readable by ``pd.read_csv``.
        batch_size: Number of examples per batch.
        shuffle: Whether to reshuffle the examples every epoch. Defaults to
            ``False`` (the previous hard-coded behaviour); pass ``True`` for
            training data.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    data = pd.read_csv(filepath, sep='\t').dropna()
    dataset = TextPairDataset(data, tokenizer)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
# Training examples must be reshuffled every epoch; otherwise every epoch sees
# the batches in exactly the file order. The dataset produced by
# dataloaderbuilder is re-wrapped in a shuffling DataLoader for that purpose.
train_dataloader = DataLoader(
    dataloaderbuilder(train_path, BATCH_SIZE).dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Validation order is kept fixed so predictions line up with the file rows.
val_dataloader = dataloaderbuilder(val_path, BATCH_SIZE)

In [None]:
# Release unoccupied memory cached by PyTorch's CUDA allocator before the
# model is created in the following cell.
torch.cuda.empty_cache()

In [None]:
def plotgraph(train_losses, val_losses):
    """Plot per-epoch training and validation loss, save it and show it.

    The x-axis is numbered from 1 to match the 1-based epoch numbers used by
    the training loop, and is derived from the length of each series, so the
    function also works for partial runs or series of different lengths.

    Args:
        train_losses: Sequence of average training losses, one per epoch.
        val_losses: Sequence of average validation losses, one per epoch.

    Side effects:
        Writes the figure to ``loss_graph.png`` in the working directory and
        displays it.
    """
    # A dedicated figure avoids drawing on top of whatever pyplot figure
    # happens to be current.
    fig, ax = plt.subplots()
    ax.plot(range(1, len(train_losses) + 1), train_losses, label='Training Loss')
    ax.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title('Training and Validation Loss per Epoch')
    ax.legend()
    fig.savefig('loss_graph.png')
    plt.show()

In [None]:
from torch import nn

# Pretrained BERT encoder wrapped in the sequence-classification model.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Replace the stock classification head with a single-output regressor whose
# Sigmoid keeps the raw output between 0 and 1 (it is rescaled by the
# training / evaluation code).
regression_head = nn.Sequential(
    nn.Linear(model.config.hidden_size, 1),
    nn.Sigmoid(),
)
model.classifier = regression_head
model.to(DEVICE)

# Every parameter (encoder and head) is optimised; the loss is mean squared error.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)
loss_fn = nn.MSELoss()

In [None]:
import gc

# Per-epoch average losses; train_epoch() and evaluate() append to these.
train_losses, val_losses = [], []
def train_epoch(model, optimizer, epoch):
    """Run one optimisation pass over ``train_dataloader``.

    Args:
        model: Model whose forward pass accepts ``input_ids`` and
            ``attention_mask`` and returns an object with a ``logits`` field.
        optimizer: Optimizer that updates ``model``'s parameters.
        epoch: Epoch number, used for the progress bar and logging only.

    Returns:
        The mean of the per-batch losses for this epoch.

    Side effects:
        Appends the mean loss to the notebook-level ``train_losses`` list and
        logs it to wandb under ``train_loss``.
    """
    model.train()
    total_loss = 0.0
    # len() on a DataLoader gives the batch count directly; materialising the
    # loader with list() would tokenize the whole training set a second time.
    num_batches = len(train_dataloader)
    progress = tqdm(train_dataloader, desc=f"Epoch:{epoch}", total=num_batches, leave=False)

    for batch in progress:
        input_ids = batch[0].to(DEVICE)
        attention_mask = batch[1].to(DEVICE)
        # The loss is computed in float64, so pin the labels to the same dtype
        # regardless of how the label column was parsed.
        labels = batch[2].to(DEVICE, dtype=torch.float64)

        optimizer.zero_grad()
        # Flatten the model output to one value per example and scale it by 5
        # (presumably the upper end of the label range -- confirm against the data).
        preds = model(input_ids, attention_mask=attention_mask).logits.to(torch.float64).view(-1) * 5
        loss = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        # loss_fn already averages over the batch, so report it unchanged
        # (len(batch) is the tuple length, not the number of examples).
        progress.set_postfix({'training_loss': f'{batch_loss:.3f}'})

    avg_loss = total_loss / num_batches
    train_losses.append(avg_loss)
    wandb.log({'epoch': epoch, 'train_loss': avg_loss})
    tqdm.write(f"Epoch:{epoch}, Avg Train Loss: {avg_loss}")

    # Clean up once per epoch; doing this on every step only slows training.
    gc.collect()
    torch.cuda.empty_cache()
    return avg_loss



def evaluate(model, epoch):
    """Score ``model`` on ``val_dataloader`` without updating its weights.

    Args:
        model: Model whose forward pass accepts ``input_ids`` and
            ``attention_mask`` and returns an object with a ``logits`` field.
        epoch: Epoch number, used for logging only.

    Returns:
        The mean of the per-batch validation losses.

    Side effects:
        Prints the Pearson correlation between predictions and labels, appends
        the mean loss to the notebook-level ``val_losses`` list, and logs
        ``val_loss`` and ``val_pearson`` to wandb.
    """
    model.eval()
    total_loss = 0.0
    # len() on a DataLoader gives the batch count directly; materialising the
    # loader with list() would tokenize the whole validation set a second time.
    num_batches = len(val_dataloader)
    batch_pred = []
    batch_truth = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch[0].to(DEVICE)
            attention_mask = batch[1].to(DEVICE)
            # Match the float64 dtype used for the predictions.
            labels = batch[2].to(DEVICE, dtype=torch.float64)

            # Same flattening and x5 scaling as in training.
            preds = model(input_ids, attention_mask=attention_mask).logits.to(torch.float64).view(-1) * 5
            total_loss += loss_fn(preds, labels).item()

            # Accumulate on the CPU so GPU memory is not held for the whole pass.
            batch_pred.append(preds.cpu())
            batch_truth.append(labels.cpu())

    predicted_scores = torch.cat(batch_pred).numpy()
    ground_truth_labels = torch.cat(batch_truth).numpy()
    pearson_coefficient, _ = pearsonr(predicted_scores, ground_truth_labels)
    print("Pearson Correlation Coefficient:", pearson_coefficient)

    avg_loss = total_loss / num_batches
    val_losses.append(avg_loss)
    wandb.log({'epoch': epoch, 'val_loss': avg_loss, 'val_pearson': pearson_coefficient})

    gc.collect()
    torch.cuda.empty_cache()
    return avg_loss

In [None]:
# Train, validate and checkpoint once per epoch (epochs are numbered from 1).
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = train_epoch(model, optimizer, epoch)
    val_loss = evaluate(model, epoch)

    # Keep a separate weights file for every epoch.
    checkpoint_path = f"model1A_epoch_{epoch}.pth"
    torch.save(model.state_dict(), checkpoint_path)

    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}")

In [None]:
# Mark the wandb run started at the top of the notebook as finished.
wandb.finish()
# Release unoccupied memory cached by PyTorch's CUDA allocator now that training is over.
torch.cuda.empty_cache()

In [None]:
# Kept for reference (commented out): reload a saved checkpoint and re-run validation.
# def evaluate(model):
#     with torch.no_grad():
#         model.eval()
#         losses = 0
#         batch_pred=[]
#         batch_truth=[]
#         for batch in val_dataloader:
#             input_ids = batch[0].to(DEVICE)
#             attention_mask = batch[1].to(DEVICE)
#             labels = batch[2].to(DEVICE)

#             logits=model(input_ids,attention_mask=attention_mask).logits.to(torch.float64).view(-1)*5
#             batch_pred.append(logits)
#             batch_truth.append(labels)
#             loss=loss_fn(logits,labels)
#             losses += loss.item()
#             del input_ids
#             del attention_mask
#             del labels
#             del logits
#             gc.collect()
#             torch.cuda.empty_cache()
#         predicted_scores=torch.cat(batch_pred)
#         ground_truth_labels=torch.cat(batch_truth)
#         pearson_coefficient, _ = pearsonr(predicted_scores.cpu(), ground_truth_labels.cpu())
#         print("Pearson Correlation Coefficient:", pearson_coefficient)
#         print(predicted_scores)
#         print(ground_truth_labels)
#         x = losses / len(list(val_dataloader))
#         val_losses.append(x)

#         return x
# PATH="/kaggle/input/831asaves/model1A_epoch_10.pth"
# from torch import nn
# model=BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)
# model.classifier = nn.Sequential(
#     nn.Linear(model.config.hidden_size, 1),
#     nn.Sigmoid()  # Output float between 0 and 1
# )
# model.load_state_dict(torch.load(PATH))
# model.to(DEVICE)
# loss_fn=nn.MSELoss()
# val_loss = evaluate(model)