In [1]:
import numpy as np
import os
import pandas as pd
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import torch.nn.functional as F
import torch
from torch import Tensor
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import transformers
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score
from transformers import BitsAndBytesConfig
from tqdm import tqdm 
from torch.optim.lr_scheduler import StepLR
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device: " ,device)

device:  cuda


## data prepration

In [2]:
root = 'data/arena'

train_df = pd.read_csv(os.path.join(root,"train.csv"))
test_df = pd.read_csv(os.path.join(root,"test.csv"))
sample_submission_df = pd.read_csv(os.path.join(root,"sample_submission.csv"))

In [3]:
def process(input_str):
    stripped_str = input_str.strip('[]')
    sentences = [s.strip('"') for s in stripped_str.split('","')]
    return  ' '.join(sentences)

train_df['prompt'] = train_df['prompt'].apply(process)
train_df['response_a'] = train_df['response_a'].apply(process)
train_df['response_b'] = train_df['response_b'].apply(process)

test_df['prompt'] = test_df['prompt'].apply(process)
test_df['response_a'] = test_df['response_a'].apply(process)
test_df['response_b'] = test_df['response_b'].apply(process)


print(f"train shape: {train_df.shape}")
print(f"test shape: {test_df.shape}")
print("-"*90)
print(f"train missing values: {train_df.isnull().sum().sum()}")
print(f"test missing values: {test_df.isnull().sum().sum()}")
print("-"*90)

train shape: (57477, 9)
test shape: (3, 4)
------------------------------------------------------------------------------------------
train missing values: 0
test missing values: 0
------------------------------------------------------------------------------------------


In [4]:
def lengths(column):
    return column.str.len()

# Calculate lengths of the entries in each column
train_df['prompt_length'] = lengths(train_df['prompt'])
train_df['response_a_length'] = lengths(train_df['response_a'])
train_df['response_b_length'] = lengths(train_df['response_b'])

# Aggregate statistics
statistics = {
    'prompt': {
        'max_length': train_df['prompt_length'].max(),
        'median_length': train_df['prompt_length'].median(),
        'std_length': train_df['prompt_length'].std()
    },
    'response_a': {
        'max_length': train_df['response_a_length'].max(),
        'median_length': train_df['response_a_length'].median(),
        'std_length': train_df['response_a_length'].std()
    },
    'response_b': {
        'max_length': train_df['response_b_length'].max(),
        'median_length': train_df['response_b_length'].median(),
        'std_length': train_df['response_b_length'].std()
    }
}

# Convert the statistics dictionary to a DataFrame for better readability
statistics_df = pd.DataFrame(statistics).T

print(statistics_df)

            max_length  median_length   std_length
prompt         33044.0           92.0  1073.120265
response_a     53976.0         1072.0  1512.843807
response_b     53764.0         1082.0  1536.733530


In [5]:
train_df['winner_tie']
# train_df['winner_a_binary'] = train_df['winner_model_a'].apply(lambda x : 1 if x == 1 else 0)
# train_df['winner_a_binary'].sum()

0        0
1        0
2        1
3        0
4        0
        ..
57472    0
57473    0
57474    0
57475    0
57476    0
Name: winner_tie, Length: 57477, dtype: int64

In [6]:
train = train_df[['prompt','response_a','winner_model_a']][:25000]#40000
train_data,val_data = train_test_split(train,test_size=0.2,random_state=42)

In [7]:
max_tokens = 1024

In [8]:
class ColBERTDataset(Dataset):
    def __init__(self,data,tokenizer,max_len=max_tokens):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len =max_len
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,index):
        row = self.data.iloc[index]
        
        prompt = row['prompt']
        response_a = row['response_a']
        # response_b = row['response_b']
        winner_a = row['winner_model_a']
        # winner_b = row['winner_model_b']
        # winner_tie = row['winner_tie']
        
        
        prompt_tokenized = self.tokenizer(prompt,truncation=True,padding='max_length',max_length = self.max_len,return_tensors='pt')
        response_a_tokenized = self.tokenizer(response_a,truncation=True,padding='max_length',max_length = self.max_len,return_tensors='pt')
        # response_b_tokenized = self.tokenizer(response_b,truncation=True,padding='max_length',max_length = self.max_len,return_tensors='pt')

        return {
            'prompt_inputs' :prompt_tokenized['input_ids'].squeeze(),
            'prompt_attention_mask': prompt_tokenized['attention_mask'].squeeze(),
            
            'response_a_inputs' :response_a_tokenized['input_ids'].squeeze(),
            'response_a_attention_mask': response_a_tokenized['attention_mask'].squeeze(),
            
            # 'response_b_inputs' :response_b_tokenized['input_ids'].squeeze(),
            # 'response_b_attention_mask': response_b_tokenized['attention_mask'].squeeze(),
            
            'winner_model_a': torch.tensor(winner_a, dtype=torch.float),
            # 'winner_model_b': torch.tensor(winner_b, dtype=torch.float),
            # 'winner_tie': torch.tensor(winner_tie, dtype=torch.float)
        }

In [9]:
tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')
# Create dataset and dataloaders
train_dataset = ColBERTDataset(train, tokenizer)
val_dataset = ColBERTDataset(val_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)




In [10]:
# for i in train_loader:
#     print(i)
#     break

In [18]:
# i.keys()

dict_keys(['prompt_inputs', 'prompt_attention_mask', 'response_a_inputs', 'response_a_attention_mask', 'winner_model_a'])

## modelling

In [24]:
# class ResponseClassifier(nn.Module):
#     def __init__(self,model_name,hidden_size,intermediate_size,num_classes,num_heads=8):
#         super(ResponseClassifier,self).__init__()
#         self.model = AutoModel.from_pretrained(model_name).to(device)
#         self.attention = nn.MultiheadAttention(embed_dim=hidden_size,
#                                                num_heads=num_heads,
#                                                batch_first=True)
#         self.classifier = nn.Sequential(
#             nn.Linear(hidden_size*2,intermediate_size),
#             nn.ReLU(),
#             nn.Linear(intermediate_size,num_classes)
#         )

#     def forward(self,prompt_embeddings,response_embeddings):
#         combined_embeddings = torch.cat((prompt_embeddings, response_embeddings), dim=1)
        
#         attended_output, _ = self.attention(combined_embeddings, combined_embeddings, combined_embeddings)
        
#         pooled_output = torch.mean(attended_output, dim=1)
        
#         logits = self.classifier(pooled_output)
#         return logits

In [12]:
class ResponseClassifier(nn.Module):
    def __init__(self, model_name, hidden_size, intermediate_size, num_classes, num_heads=8):
        super(ResponseClassifier, self).__init__()
        
        self.embedding_model = AutoModel.from_pretrained(model_name)
        
        self.attention_layer_a = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True, dropout=0.2)
        # self.attention_layer_b = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True, dropout=0.2)
        
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, intermediate_size),
            nn.ReLU(),
            nn.Linear(intermediate_size, num_classes)
        )
            
    def forward(self, prompt_embeddings, response_a_embeddings):
        # Attention on prompt and response_a
        combined_embeddings_a = torch.cat((prompt_embeddings, response_a_embeddings), dim=1)
        attended_output_a, _ = self.attention_layer_a(combined_embeddings_a, combined_embeddings_a, combined_embeddings_a)
        
        # Attention on prompt and response_b
        # combined_embeddings_b = torch.cat((prompt_embeddings, response_b_embeddings), dim=1)
        # attended_output_b, _ = self.attention_layer_b(combined_embeddings_b, combined_embeddings_b, combined_embeddings_b)
        
        # Pooling the attended outputs
        pooled_output_a = torch.mean(attended_output_a, dim=1)
        # pooled_output_b = torch.mean(attended_output_b, dim=1)
        
        # Combine pooled outputs from both attentions
        # combined_pooled_output = torch.cat((pooled_output_a, pooled_output_b), dim=1)
        
        # Final classification
        logits = self.classifier(pooled_output_a)
        return logits

In [18]:
# allenai/longformer-base-4096

In [11]:
## testing code
test =  train_df[['prompt','response_a','winner_model_a']][50000:]
def get_predictions(test_df, model, tokenizer, device):
    model.embedding_model.eval()
    model.eval()
    
    winner_a_predicted = []
    # winner_b_predicted = []
    # winner_tie_predicted = []

    with torch.no_grad():
        for _, row in test_df.iterrows():
            prompt = row['prompt']
            response_a = row['response_a']
            # response_b = row['response_b']
            
            prompt_tokenized = tokenizer(prompt, truncation=True, padding='max_length', max_length=max_tokens, return_tensors='pt')
            response_a_tokenized = tokenizer(response_a, truncation=True, padding='max_length', max_length=max_tokens, return_tensors='pt')
            # response_b_tokenized = tokenizer(response_b, truncation=True, padding='max_length', max_length=max_tokens, return_tensors='pt')

            prompt_input_ids = prompt_tokenized['input_ids'].to(device)
            prompt_attention_mask = prompt_tokenized['attention_mask'].to(device)

            response_a_input_ids = response_a_tokenized['input_ids'].to(device)
            response_a_attention_mask = response_a_tokenized['attention_mask'].to(device)

            # response_b_input_ids = response_b_tokenized['input_ids'].to(device)
            # response_b_attention_mask = response_b_tokenized['attention_mask'].to(device)

            
            prompt_embeddings = model.embedding_model(prompt_input_ids,prompt_attention_mask).last_hidden_state
            response_a_embeddings = model.embedding_model(response_a_input_ids,response_a_attention_mask).last_hidden_state
            # response_b_embeddings = model.embedding_model(response_b_input_ids,response_b_attention_mask).last_hidden_state

            output = model(prompt_embeddings,response_a_embeddings)
            # Extract probabilities
            probabilities = torch.softmax(output, dim=1).cpu().numpy().flatten()

            winner_a_predicted.append(probabilities[0])
            # winner_b_predicted.append(probabilities[1])
            # winner_tie_predicted.append(probabilities[2])
            
    test_df['winner_a_predicted'] = winner_a_predicted
    # test_df['winner_b_predicted'] = winner_b_predicted
    # test_df['winner_tie_predicted'] = winner_tie_predicted

    return test_df

def get_accuracy(test):
    test['predicted_class'] = (test['winner_a_predicted'] >= 0.5).astype(int)
    
    true_a = test['winner_model_a'].tolist()
    pred_a = test['predicted_class'].tolist()

    # Calculate accuracy for response A wins
    accuracy_a = accuracy_score(true_a, pred_a)
    print(f"Accuracy for response A wins: {accuracy_a:.4f}")



# def get_accuracy(test):
#     test['predicted_class'] = test[['winner_a_predicted', 'winner_b_predicted', 'winner_tie_predicted']].idxmax(axis=1)
#     test['predicted_class'] = test['predicted_class'].map({
#         'winner_a_predicted': 'winner_model_a',
#         'winner_b_predicted': 'winner_model_b',
#         'winner_tie_predicted': 'winner_tie'
#     })
#     true_a, pred_a = [], []
#     true_b, pred_b = [], []
#     true_tie, pred_tie = [], []

#     # Populate the lists with corresponding values
#     for idx, row in test.iterrows():
#         if row['winner_model_a'] == 1:
#             true_a.append(1)
#             pred_a.append(1 if row['predicted_class'] == 'winner_model_a' else 0)
#         else:
#             true_a.append(0)
#             pred_a.append(1 if row['predicted_class'] == 'winner_model_a' else 0)

#         if row['winner_model_b'] == 1:
#             true_b.append(1)
#             pred_b.append(1 if row['predicted_class'] == 'winner_model_b' else 0)
#         else:
#             true_b.append(0)
#             pred_b.append(1 if row['predicted_class'] == 'winner_model_b' else 0)

#         if row['winner_tie'] == 1:
#             true_tie.append(1)
#             pred_tie.append(1 if row['predicted_class'] == 'winner_tie' else 0)
#         else:
#             true_tie.append(0)
#             pred_tie.append(1 if row['predicted_class'] == 'winner_tie' else 0)

#     # Calculate accuracy for each class
#     accuracy_a = accuracy_score(true_a, pred_a)
#     accuracy_b = accuracy_score(true_b, pred_b)
#     accuracy_tie = accuracy_score(true_tie, pred_tie)
#     print(f"Accuracy for response A wins: {accuracy_a:.4f}")
#     print(f"Accuracy for response B wins: {accuracy_b:.4f}")
#     print(f"Accuracy for tie: {accuracy_tie:.4f}")

In [14]:
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from tqdm import tqdm

model_name = 'allenai/longformer-base-4096'
hidden_size = 768
intermediate_size = 512
num_classes = 1
learning_rate = 2e-4
num_epochs = 4
PRINT_FREQ = 500

model = ResponseClassifier(model_name, hidden_size, intermediate_size,num_classes ,num_heads=8).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()
scheduler = lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.5, total_iters=30)

2024-08-05 18:16:24.319644: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-05 18:16:24.364968: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  return torch.load(checkpoint_file, map_location="cpu")


In [16]:
# model = ResponseClassifier(model_name,hidden_size,intermediate_size,num_classes,num_heads=8).to(device)
# optimizer = torch.optim.Adam(model.parameters(),lr = learing_rate)
# criterion = nn.BCELoss()
# scheduler = lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.5, total_iters=30)
# try:
#     model.load_state_dict(torch.load('/home/smart/sanket/experiments/model_2.pth'))
#     print('loaded saved model successfully')
# except Exception as e:
#     print('initialized model')

  return torch.load(checkpoint_file, map_location="cpu")
  model.load_state_dict(torch.load('/home/smart/sanket/experiments/model_2.pth'))


initialized model


In [15]:
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    total_loss = 0
    steps = 0
    
    for batch_index, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")):
        prompt_input_ids = batch['prompt_inputs'].to(device)
        prompt_attention_mask = batch['prompt_attention_mask'].to(device)
        response_a_input_ids = batch['response_a_inputs'].to(device)
        response_a_attention_mask = batch['response_a_attention_mask'].to(device)
        winner_a = batch['winner_model_a'].to(device)

        optimizer.zero_grad()
        
        with torch.no_grad():
            prompt_embeddings = model.embedding_model(prompt_input_ids, prompt_attention_mask).last_hidden_state
            response_a_embeddings = model.embedding_model(response_a_input_ids, response_a_attention_mask).last_hidden_state

        logits = model(prompt_embeddings, response_a_embeddings).squeeze(-1)
        loss = criterion(logits, winner_a.float())
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_loss += loss.item()
        steps += 1
        if (batch_index + 1) % PRINT_FREQ == 0:
            current_lr = optimizer.param_groups[0]["lr"]
            print(
                f'  Epoch: {epoch+1}',
                f'  Batch: {batch_index + 1}/{len(train_loader)}',
                f'  Train Loss: {total_loss / steps:.4f}',
                f'  LR: {current_lr:.1e}', flush=True
            )
            total_loss = 0
            steps = 0

    model.eval()
    val_loss = 0
    before_lr = optimizer.param_groups[0]["lr"]
    scheduler.step()
    after_lr = optimizer.param_groups[0]["lr"]
    print("Epoch %d: adam lr %.4f -> %.4f" % (epoch, before_lr, after_lr))

    with torch.no_grad():
        for batch in val_loader:
            prompt_input_ids = batch['prompt_inputs'].to(device)
            prompt_attention_mask = batch['prompt_attention_mask'].to(device)
            response_a_input_ids = batch['response_a_inputs'].to(device)
            response_a_attention_mask = batch['response_a_attention_mask'].to(device)
            winner_a = batch['winner_model_a'].to(device)

            with torch.no_grad():
                prompt_embeddings = model.embedding_model(prompt_input_ids, prompt_attention_mask).last_hidden_state
                response_a_embeddings = model.embedding_model(response_a_input_ids, response_a_attention_mask).last_hidden_state

            logits = model(prompt_embeddings, response_a_embeddings).squeeze(-1)
            val_loss += criterion(logits, winner_a.float()).item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")
    
    test = get_predictions(test, model, tokenizer, device)
    get_accuracy(test)
    
    if val_loss < val_loss_threshold:
        val_loss_threshold = val_loss
        torch.save(model.state_dict(), f'model_a_{epoch+1}.pth')
        print('saving model...')
    
    print('============================================================================')

Epoch 1/4:  32%|███▏      | 499/1563 [09:59<21:27,  1.21s/it]

  Epoch: 1   Batch: 500/1563   Train Loss: 0.6452   LR: 2.0e-04


Epoch 1/4:  56%|█████▌    | 875/1563 [17:31<13:49,  1.21s/it]

## sd 

In [10]:
# model = AutoModel.from_pretrained("allenai/longformer-base-4096").to(device) #
# tokenizer  = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")



In [4]:
# class classifier_model(nn.Module):
#     def __init__(self,model,tokenizer):
#         super(classifier_model,self).__init__()
#         self.model = model
#         self.tokenizer = tokenizer

#     def forward(self,prompt,resp_a,resp_b):
#         tokenized_prompt = self.tokenizer(prompt,truncation=True,padding='max_length',max_length = 1024,return_tensors='pt').to(device)
#         tokenized_resp_a = self.tokenizer(resp_a,truncation=True,padding='max_length',max_length = 1024,return_tensors='pt').to(device)
#         tokenized_resp_b = self.tokenizer(resp_b,truncation=True,padding='max_length',max_length = 1024,return_tensors='pt').to(device)
        
#         prompt_hs = self.model(**tokenized_prompt).last_hidden_state
#         resp_a_hs = self.model(**tokenized_resp_a).last_hidden_state
#         resp_b_hs = self.model(**tokenized_resp_b).last_hidden_state


#         return prompt_hs,resp_a_hs,resp_b_hs

In [5]:
# classifier = classifier_model(model,tokenizer)

In [6]:
# p,a,b = classifier("hi, how are you?","i'm good, what about you?","capital of india is delhi")

In [7]:
# p.shape

torch.Size([1, 1024, 768])

In [12]:
# linear = nn.Linear(in_features=768,out_features=128).to(device)

In [13]:
# linear(torch.cat((p,a),dim=1)).shape,linear(torch.cat((p,b),dim=1)).shape