In [1]:
import torch
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
import transformers
from torch.utils.data import Dataset, DataLoader
#from transformers import *
import torch.optim as optim
# from tqdm import tqdm
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import scipy
from scipy.stats import pearsonr

In [2]:
!pip install wandb



In [3]:
import os

import wandb

# Never embed the wandb API key in the notebook: every shared or committed copy
# leaks it. The key is read from the WANDB_API_KEY environment variable instead
# (set it outside the notebook, e.g. from a platform secret, before running).
# The key that used to be hardcoded here should be treated as compromised and
# revoked in the wandb account settings.
wandb_api_key = os.environ.get("WANDB_API_KEY")

# When the variable is unset this passes key=None, and wandb.login falls back
# to its own credential lookup (stored netrc entry or an interactive prompt).
wandb.login(key=wandb_api_key)

# Start the run that later cells log metrics to.
wandb.init(project="nlp3-1a")

[34m[1mwandb[0m: Currently logged in as: [33mkaran21258[0m ([33mkaran912[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
# Seed PyTorch's random number generator so repeated runs are reproducible.
torch.manual_seed(0)

# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [5]:
# Location of the test file; a later cell parses it as tab-separated text.
test_path = "/kaggle/input/assignment-3/data/Task1/sample_test.csv"

In [6]:
class TextPairDataset(Dataset):
    """Dataset that tokenizes sentence pairs taken from a DataFrame.

    The two sentences of item ``idx`` are read *by position* from columns 1 and
    2 of ``data`` (``data.iloc[idx, 1:3]``); column names are not consulted.

    Args:
        data: pandas DataFrame holding the sentence pair in columns 1 and 2.
        tokenizer: callable with the Hugging Face tokenizer call signature; it
            must return a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors that carry a leading batch dimension of size 1.
        max_length: optional length forwarded to the tokenizer for padding and
            truncation. The default ``None`` keeps the previous behaviour
            (the tokenizer decides the length, normally the model maximum);
            pass a smaller value to avoid padding every pair to that maximum.
    """

    def __init__(self, data, tokenizer, max_length=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of sentence pairs (rows of ``data``)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for row ``idx``."""
        sentence_pair = self.data.iloc[idx, 1:3].values.tolist()

        encoded_pair = self.tokenizer(
            sentence_pair[0],
            sentence_pair[1],
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading batch dimension of 1; drop it so
        # the DataLoader's collation stacks items into (batch, seq_len).
        input_ids = encoded_pair['input_ids'].squeeze(0)
        attention_mask = encoded_pair['attention_mask'].squeeze(0)
        return input_ids, attention_mask

In [7]:
# Run configuration shared by the cells that follow.
BATCH_SIZE = 16
NUM_EPOCHS = 10
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [8]:
# train_dataset=TextPairDataset(train_data,tokenizer)
# val_dataset=TextPairDataset(val_data,tokenizer)

# train_loader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=False)
# val_dataloader=DataLoader(val_dataset,batch_size=BATCH_SIZE,shuffle=False)

In [9]:
# Read the test file into a DataFrame; fields are split on tab characters.
original_data = pd.read_csv(test_path, sep="\t")

In [10]:
def dataloaderbuilder(filepath, batch_size):
    """Build a non-shuffling DataLoader over the sentence pairs in ``filepath``.

    Previously the ``filepath`` argument was ignored and the notebook-level
    ``original_data`` frame was used regardless of what the caller passed; the
    file named by the argument is now the one that is loaded.

    Args:
        filepath: path to a tab-separated file readable by ``pd.read_csv``.
        batch_size: number of sentence pairs per batch.

    Returns:
        A ``DataLoader`` over a ``TextPairDataset`` built with the
        notebook-level ``tokenizer``; ``shuffle=False`` keeps batches in file
        order so predictions line up with the rows of the file.
    """
    frame = pd.read_csv(filepath, sep='\t')
    dataset = TextPairDataset(frame, tokenizer)
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [11]:
# Batches over the test data, BATCH_SIZE pairs at a time (the builder does not shuffle).
test_dataloader = dataloaderbuilder(filepath=test_path, batch_size=BATCH_SIZE)

In [12]:
# Hand cached-but-unused GPU memory back from PyTorch's allocator.
torch.cuda.empty_cache()

In [13]:
def plotgraph(train_losses, val_losses):
    """Plot training and validation loss per epoch, save the figure and show it.

    The curves are drawn against 1-based epoch numbers derived from the length
    of each list. The original built such an axis from ``NUM_EPOCHS`` but never
    passed it to ``plt.plot``, so the x-axis was a 0-based index under an
    'Epoch' label.

    Args:
        train_losses: sequence of per-epoch training losses.
        val_losses: sequence of per-epoch validation losses.

    Side effects:
        Writes 'loss_graph.png' to the working directory and displays the plot.
    """
    # NOTE(review): `plt` is expected to be matplotlib.pyplot, but none of the
    # visible cells import it -- add that import to the imports cell, otherwise
    # calling this function raises NameError.
    train_epochs = range(1, len(train_losses) + 1)
    val_epochs = range(1, len(val_losses) + 1)
    plt.plot(train_epochs, train_losses, label='Training Loss')
    plt.plot(val_epochs, val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss per Epoch')
    plt.legend()
    # Save before show(): showing may clear the current figure.
    plt.savefig('loss_graph.png')
    plt.show()

In [14]:
from torch import nn

# Pretrained BERT with a sequence-classification head.
# NOTE(review): num_labels=2 only shapes the stock head, which is replaced
# right below by a 1-output head -- confirm whether num_labels=1 was intended.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Replace the head with one linear unit followed by a sigmoid, so the model
# emits a single value in [0, 1] per input pair.
model.classifier = nn.Sequential(
    nn.Linear(model.config.hidden_size, 1),
    nn.Sigmoid(),
)
model.to(DEVICE)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Mean-squared-error objective.
loss_fn = nn.MSELoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
import gc

# Per-epoch loss histories; train_epoch appends its epoch average to train_losses.
train_losses, val_losses = [], []
def train_epoch(model, optimizer, epoch, dataloader=None):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: model whose forward output exposes ``.logits``.
        optimizer: optimizer stepping ``model``'s parameters.
        epoch: epoch number, used in the progress bar and the logged record.
        dataloader: iterable of batches indexed as ``batch[0]`` input ids,
            ``batch[1]`` attention mask and ``batch[2]`` target scores.
            Defaults to the notebook-level ``train_dataloader`` so existing
            three-argument calls keep working.

    Returns:
        The mean of the per-batch losses (``nan`` if no batch was produced).

    Side effects:
        Appends the mean to the global ``train_losses`` list, logs it to wandb
        and prints a one-line summary.
    """
    if dataloader is None:
        dataloader = train_dataloader

    model.train()
    total_loss = 0.0
    num_batches = 0
    progress = tqdm(dataloader, desc=f"Epoch:{epoch}", total=len(dataloader), leave=False)
    for batch in progress:
        input_ids = batch[0].to(DEVICE)
        attention_mask = batch[1].to(DEVICE)
        labels = batch[2].to(DEVICE)

        optimizer.zero_grad()
        # Scale the model output by 5 before comparing with the targets
        # (assumes the head emits values in [0, 1] and targets lie in [0, 5]
        # -- confirm against the training data).
        predictions = model(input_ids, attention_mask=attention_mask).logits.to(torch.float64).view(-1) * 5
        loss = loss_fn(predictions, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1
        # Show the batch loss as computed. It used to be divided by len(batch),
        # which is the number of fields in the batch, not the number of samples.
        progress.set_postfix({'training_loss': f'{batch_loss:.3f}'})

    # Count batches while iterating instead of len(list(loader)), which ran a
    # second full pass over the dataset (re-tokenizing every pair) just to count.
    x = total_loss / num_batches if num_batches else float('nan')
    train_losses.append(x)
    wandb.log({'epoch':epoch,'train_loss':x})
    tqdm.write(f"Epoch:{epoch}, Avg Train Loss: {x}")
    # Release garbage and cached GPU memory once per epoch; doing this after
    # every batch only slowed the loop down.
    gc.collect()
    torch.cuda.empty_cache()
    return x



def evaluate(model,val_dataloader):
    """Run ``model`` over ``val_dataloader`` and collect its scaled outputs.

    The model is switched to eval mode and gradients are disabled. For every
    batch, ``batch[0]`` (input ids) and ``batch[1]`` (attention mask) are moved
    to ``DEVICE``; the model's ``.logits`` are cast to float64, flattened and
    multiplied by 5.

    Returns:
        A flat list with one float64 value per input row, in loader order.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in val_dataloader:
            ids = batch[0].to(DEVICE)
            mask = batch[1].to(DEVICE)
            output = model(ids, attention_mask=mask)
            scores = output.logits.to(torch.float64).view(-1) * 5
            # Iterating the 1-D array appends its elements one by one.
            predictions.extend(scores.cpu().numpy())
    return predictions

In [43]:
# Restore the fine-tuned weights into the model built above.
# map_location=DEVICE lets a checkpoint saved on a GPU load on a CPU-only host
# (DEVICE already falls back to 'cpu'); without it torch.load fails there.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted
# source (consider weights_only=True where the installed torch supports it).
loaded_transformer = model  # same object; alias kept for any later use of this name
loaded_transformer.load_state_dict(
    torch.load("/kaggle/input/a3-1a-models/model1A_epoch_10.pth", map_location=DEVICE)
)

# Score every test pair; the loader does not shuffle, so scores follow row order.
test_pred = evaluate(model, test_dataloader)

# NOTE(review): 'setence1' looks like a misspelling of 'sentence1'. The key must
# match the CSV header exactly, so confirm against the file before changing it.
data = {
    "scores": test_pred,
    "sentence1": original_data['setence1'],
    "sentence2": original_data['sentence2'],
}
final_df = pd.DataFrame(data)
final_df.to_csv("sample_demo.csv")

[4.878290593624115, 4.878197014331818, 4.877212345600128, 2.643692195415497, 2.6470929384231567, 2.586703896522522]


In [None]:
# Close the wandb run, then release PyTorch's cached GPU memory.
wandb.finish()
torch.cuda.empty_cache()


In [None]:
#test
