In [None]:
!pip install faiss-cpu
!pip install -U FlagEmbedding
!pip install peft

In [None]:
import torch
import pandas as pd
import numpy as np
import pickle
import json
from collections import defaultdict

import faiss
from FlagEmbedding import FlagModel

In [None]:
%%time

# Load the precomputed chunk embeddings (a mapping: chunk id -> embedding vector).
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only point this at a dataset you trust.
with open('/kaggle/input/m3-embedding-of-512-chunks/chunk_embedding.pkl', 'rb') as f:
    chunk_embedding_dict = pickle.load(f)

# Dict iteration order fixes the row order: row i of chunk_embeddings belongs to chunk_ids[i].
chunk_ids = list(chunk_embedding_dict)

# Build the float32 matrix in a single conversion instead of first materialising an
# array at the pickled dtype and then copying the whole corpus again with .astype().
chunk_embeddings = np.array(
    [chunk_embedding_dict[chunk_id] for chunk_id in chunk_ids],
    dtype='float32',
)

# Fail early with a readable message; later cells read chunk_embeddings.shape[1].
assert chunk_embeddings.ndim == 2, (
    f"expected a 2-D (num_chunks, dim) embedding matrix, got shape {chunk_embeddings.shape}"
)

In [None]:
%%time

# Read the test queries and pull the id / text columns out as plain Python lists
test_path = '/kaggle/input/dis-project-1-document-retrieval/test.csv'
test_df = pd.read_csv(test_path)
query_ids = test_df['id'].tolist()
queries = test_df['query'].tolist()

# Instantiate the BGE-M3 embedding model (half precision requested via use_fp16)
model = FlagModel(
    'BAAI/bge-m3',
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
    use_fp16=True,
)

# Embed every query and cast to float32, the same dtype as chunk_embeddings.
# NOTE(review): queries go through `encode`, not `encode_queries` — confirm whether the
# query instruction configured above is actually applied, and whether that matches how
# the corpus chunks were embedded.
query_embeddings = model.encode(queries).astype('float32')

In [None]:
%%time

k = 100  # how many nearest chunks to fetch for each query

# L2-normalise both matrices in place, so that the inner product computed by the
# index below equals cosine similarity.
faiss.normalize_L2(chunk_embeddings)
faiss.normalize_L2(query_embeddings)

# Exact (flat) inner-product index over all chunk vectors
d = chunk_embeddings.shape[1]  # embedding dimensionality
index = faiss.IndexFlatIP(d)
index.add(chunk_embeddings)

# For every query: scores and row positions (into chunk_embeddings) of its k best matches
distances, indices = index.search(query_embeddings, k)

In [None]:
# Map FAISS row positions back to chunk IDs.
# FAISS pads a result row with the label -1 when it finds fewer than k neighbours
# (e.g. the index holds fewer than k vectors). Without the guard below, chunk_ids[-1]
# would silently resolve to the *last* chunk, so padded slots are dropped instead.
top_k_chunks = {
    query_id: [chunk_ids[idx] for idx in indices[i] if idx >= 0]
    for i, query_id in enumerate(query_ids)
}

# One row per query: its id and the retrieved chunk IDs, in the order FAISS returned them
top_k_chunks_df = pd.DataFrame({
    'id': query_ids,
    'chunkids': [top_k_chunks[qid] for qid in query_ids]
})

# Preview the first rows
top_k_chunks_df.head()

In [None]:
def rank_and_aggregate(df, aggregate_num=10):
    """Turn per-query ranked chunk lists into per-query ranked document lists.

    Each chunk contributes ``1 / rank`` (rank starts at 1) to the document it
    belongs to; documents are then ordered by their summed contribution.

    Args:
        df: DataFrame with an ``'id'`` column (query id) and a ``'chunkids'``
            column holding, per row, an ordered iterable of chunk-id strings.
            The document id is taken to be the chunk id with its last two
            ``_``-separated tokens removed.
        aggregate_num: Maximum number of documents kept per query.

    Returns:
        DataFrame with columns ``'id'`` and ``'docids'``; ``'docids'`` is a list
        of document ids, best first. Documents with equal totals keep the order
        in which they were first encountered.
    """
    per_query = {}

    for qid, chunk_list in zip(df['id'], df['chunkids']):
        totals = defaultdict(float)

        for position, cid in enumerate(chunk_list):
            # Drop the trailing two underscore-separated tokens to get the document id
            tokens = cid.split("_")
            doc_key = "_".join(tokens[:-2])
            # Reciprocal-rank weight: earlier chunks count for more
            totals[doc_key] += 1 / (position + 1)

        # Stable sort on the accumulated score, highest first, then truncate
        ordered_docs = sorted(totals, key=totals.get, reverse=True)
        per_query[qid] = ordered_docs[:aggregate_num]

    return pd.DataFrame(list(per_query.items()), columns=['id', 'docids'])

# Collapse each query's ranked chunks into its best documents
# (aggregate_num=10 is the function's default, spelled out here for the reader)
top_results_df = rank_and_aggregate(top_k_chunks_df, aggregate_num=10)
top_results_df.head()  # Preview the first rows

In [None]:
# Build the submission frame from the aggregated per-query results
submission_df = pd.DataFrame(data=top_results_df)

# Write it to the Kaggle working directory, leaving out the index column
submission_df.to_csv(path_or_buf='/kaggle/working/submission.csv', index=False)