In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


qrels_df = pd.read_csv(r"D:\IR\Project\collections\msmarco-passage\qrels.dev.small.tsv", sep="\t", header=None, names=["query_id", "ignore", "doc_id", "relevance"])

# run.msmarco-passage.dev.small.tsv: initial BM25 ranking (query_id, document_id, rank)
bm25_df = pd.read_csv(r"D:\IR\Project\collections\msmarco-passage\run.msmarco-passage.dev.small.tsv", sep="\t", header=None, names=["query_id", "doc_id", "rank"])

# queries.dev.small.tsv: query texts
queries_df = pd.read_csv(r"D:\IR\Project\collections\msmarco-passage\queries.dev.small.tsv", sep="\t", header=None, names=["query_id", "query"])

# collection.tsv: document texts
collection_df = pd.read_csv(r"D:\IR\Project\collections\msmarco-passage\collection.tsv", sep="\t", header=None, names=["doc_id", "doc"])

In [3]:
merged_df = bm25_df.merge(qrels_df[['query_id', 'doc_id', 'relevance']], on=['query_id', 'doc_id'], how='left')
merged_df['relevance'] = merged_df['relevance'].fillna(0)

In [4]:
merged_df.head()

Unnamed: 0,query_id,doc_id,rank,relevance
0,1048585,7187158,1,1.0
1,1048585,7187157,2,0.0
2,1048585,7187163,3,0.0
3,1048585,7546327,4,0.0
4,1048585,7187160,5,0.0


In [5]:
data_df = merged_df.merge(queries_df, on='query_id').merge(collection_df, on='doc_id')

In [6]:
data_df.head()

Unnamed: 0,query_id,doc_id,rank,relevance,query,doc
0,1048585,7187158,1,1.0,what is paula deen's brother,Paula Deen and her brother Earl W. Bubba Hiers...
1,1048585,7187157,2,0.0,what is paula deen's brother,The New York Times. U.S. | National Briefing |...
2,1048585,7187163,3,0.0,what is paula deen's brother,Racial scandals aren't always bad for business...
3,1048585,7546327,4,0.0,what is paula deen's brother,What happened to Paula Deen's first husband? k...
4,1048585,7187160,5,0.0,what is paula deen's brother,Paula Deen & Brother Bubba Sued for Harassment...


In [7]:
data_df.shape

(6974598, 6)

In [8]:
queries = data_df['query'].tolist()
documents = data_df['doc'].tolist()
labels = data_df['relevance'].astype(int).tolist()

In [11]:
data_df.to_csv(r"D:\IR\Project\collections\msmarco-passage\final_data.csv", index=False)

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

# Set device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the data
# qrels.dev.small.tsv: relevance labels (query_id, document_id, relevance)
qrels_df = pd.read_csv("qrels.dev.small.tsv", sep="\t", header=None, names=["query_id", "ignore", "doc_id", "relevance"])

# run.msmarco-passage.dev.small.tsv: initial BM25 ranking (query_id, document_id, rank)
bm25_df = pd.read_csv("run.msmarco-passage.dev.small.tsv", sep="\t", header=None, names=["query_id", "doc_id", "rank"])

# queries.dev.small.tsv: query texts
queries_df = pd.read_csv("queries.dev.small.tsv", sep="\t", header=None, names=["query_id", "query"])

# collection.tsv: document texts
collection_df = pd.read_csv("collection.tsv", sep="\t", header=None, names=["doc_id", "doc"])

# Merge BM25 rankings with the relevance data
merged_df = bm25_df.merge(qrels_df[['query_id', 'doc_id', 'relevance']], on=['query_id', 'doc_id'], how='left')
merged_df['relevance'] = merged_df['relevance'].fillna(0)  # Non-relevant documents get label 0

# Merge with query and document texts
data_df = merged_df.merge(queries_df, on='query_id').merge(collection_df, on='doc_id')

# Prepare data for BERT fine-tuning (query, document, and labels)
queries = data_df['query'].tolist()
documents = data_df['doc'].tolist()
labels = data_df['relevance'].astype(int).tolist()

# Tokenizer and model (BERT for sequence classification)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

# Create Dataset class
class RankingDataset(Dataset):
    def __init__(self, queries, documents, labels):
        self.queries = queries
        self.documents = documents
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        query = self.queries[idx]
        document = self.documents[idx]
        label = self.labels[idx]
        encoding = tokenizer.encode_plus(query, document, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return input_ids, attention_mask, torch.tensor(label)

# Split the data into train and test sets
train_queries, val_queries, train_docs, val_docs, train_labels, val_labels = train_test_split(
    queries, documents, labels, test_size=0.1, random_state=42)

# Create DataLoader
train_dataset = RankingDataset(train_queries, train_docs, train_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

val_dataset = RankingDataset(val_queries, val_docs, val_labels)
val_loader = DataLoader(val_dataset, batch_size=16)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
def train_model(model, train_loader, val_loader, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')

        # Evaluation after each epoch
        evaluate_model(model, val_loader)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    print(f'Validation Accuracy: {accuracy:.4f}')
    model.train()

# Train the model
train_model(model, train_loader, val_loader, epochs=3)

# Inference and reranking
def rerank_documents(model, queries, documents):
    model.eval()
    rankings = []
    with torch.no_grad():
        for query, document in zip(queries, documents):
            encoding = tokenizer.encode_plus(query, document, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            score = torch.softmax(outputs.logits, dim=1)[0][1].item()  # Get the probability of relevance (class 1)
            rankings.append(score)
    return rankings

# Use rerank_documents to rerank the BM25 results
query_ids = data_df['query_id'].tolist()
doc_ids = data_df['doc_id'].tolist()
reranked_scores = rerank_documents(model, queries, documents)

# Combine the results into a DataFrame and sort by score for each query
results_df = pd.DataFrame({'query_id': query_ids, 'doc_id': doc_ids, 'score': reranked_scores})
results_df = results_df.sort_values(by=['query_id', 'score'], ascending=[True, False])

# Save the reranked results
results_df.to_csv("reranked_results.tsv", sep="\t", index=False)
