In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

In [5]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
# Read the prepared training table from local disk.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will not run elsewhere until it is pointed at a local copy.
final_data_csv = r"D:\IR\Project\collections\msmarco-passage\final_data.csv"
data_df = pd.read_csv(final_data_csv)

In [3]:
# Pull the text columns out as plain Python lists, in row order.
queries, documents = (data_df[column].tolist() for column in ('query', 'doc'))

# Relevance is cast to int before conversion so every label is an integer.
relevance_as_int = data_df['relevance'].astype(int)
labels = relevance_as_int.tolist()

In [6]:
# The tokenizer and the classifier are loaded from the same pretrained checkpoint.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

# num_labels=2 requests a two-class classification head on top of BERT.
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class RankingDataset(Dataset):
    """Query/document pairs with relevance labels, tokenized on access.

    Each item is encoded as one (query, document) pair and padded or
    truncated to ``max_length`` tokens.

    Args:
        queries: Sequence of query texts.
        documents: Sequence of document texts, aligned with ``queries``.
        labels: Sequence of integer class labels, aligned with ``queries``.
        tokenizer: Callable tokenizer used to encode each pair. When ``None``
            (the default), the module-level ``tokenizer`` is looked up each
            time an item is accessed.
        max_length: Length every encoded pair is padded/truncated to
            (default 512).

    Raises:
        ValueError: If the three sequences do not have the same length.
    """

    def __init__(self, queries, documents, labels, tokenizer=None, max_length=512):
        # Fail early on misaligned inputs instead of raising IndexError (or
        # silently ignoring trailing rows) somewhere in the middle of an epoch.
        if not (len(queries) == len(documents) == len(labels)):
            raise ValueError(
                "queries, documents and labels must have the same length, got "
                f"{len(queries)}, {len(documents)} and {len(labels)}"
            )
        self.queries = queries
        self.documents = documents
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of labelled pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` tensors for item ``idx``."""
        # Fall back to the notebook-level tokenizer when none was injected.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        # Calling the tokenizer directly replaces the deprecated encode_plus.
        encoding = tok(
            self.queries[idx],
            self.documents[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading batch axis of size 1; drop only
        # that axis so the sequence axis survives whatever its length is.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        # Explicit integer dtype: the label is a class index.
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return input_ids, attention_mask, label

# Hold out 10% of the (query, document, label) triples for validation; the
# fixed random_state keeps the split identical from run to run.
(
    train_queries, val_queries,
    train_docs, val_docs,
    train_labels, val_labels,
) = train_test_split(queries, documents, labels, test_size=0.1, random_state=42)

# Wrap each split in a RankingDataset.
train_dataset = RankingDataset(train_queries, train_docs, train_labels)
val_dataset = RankingDataset(val_queries, val_docs, val_labels)

# Batch both splits; only the training loader is asked to shuffle.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)

# Optimizer. torch.optim.AdamW is used in place of the AdamW class exported by
# transformers, which is deprecated and missing from recent releases.
# eps and weight_decay are pinned to that class's defaults (1e-6 and 0.0) so
# training stays close to the previous configuration instead of silently
# picking up PyTorch's defaults (1e-8 and 0.01).
# NOTE(review): no learning-rate scheduler is created here; add one next to
# the training loop if a schedule is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)