In [1]:
!pip install FlagEmbedding
!pip install scann
!pip install faiss-cpu

Collecting FlagEmbedding
  Downloading FlagEmbedding-1.3.2.tar.gz (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.8/177.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting datasets==2.19.0
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentence_transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K     [90m━

In [2]:
import numpy as np
import pandas as pd

import pickle
import torch
import json
import os

from collections import defaultdict
from FlagEmbedding import FlagModel
import scann

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2024-11-07 05:54:28.425829: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-07 05:54:28.448798: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-07 05:54:28.455897: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Load the precomputed document-chunk embeddings from the attached dataset.
# The pickle holds a dict whose keys are used as chunk IDs and whose values
# are stacked into the embedding matrix in the next cell.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a file you produced or otherwise trust.
embeddings_path = '/kaggle/input/m3-embedding/m3_chunk_128_embedding.pkl'
with open(embeddings_path, 'rb') as pickle_file:
    doc_embeddings_dict = pickle.load(pickle_file)

In [4]:
# Freeze the key order once so that row i of the matrix belongs to doc_ids[i].
doc_ids = list(doc_embeddings_dict.keys())

# Build the matrix directly as float32. The previous
# np.array(...).astype('float32') first materialised the whole matrix and then
# allocated a second full-size copy for the cast, doubling peak memory.
doc_embeddings = np.array(
    [doc_embeddings_dict[doc_id] for doc_id in doc_ids],
    dtype=np.float32,
)

# Release the dict so the per-chunk vectors can be reclaimed.
# Re-running this cell therefore requires re-running the pickle-loading cell first.
del doc_embeddings_dict

In [5]:
# Load the test queries; the 'query' and 'id' columns are read further down.
test_path = '/kaggle/input/dis-project-1-document-retrieval/test.csv'
test_df = pd.read_csv(test_path)

# Load the BGE-M3 embedding model through FlagEmbedding (fp16 requested).
# NOTE(review): the queries below are embedded with encode(), not
# encode_queries(). In FlagEmbedding the retrieval instruction is presumably
# only prepended by encode_queries(), which would make
# query_instruction_for_retrieval a no-op here. Confirm against the
# FlagEmbedding docs and against how the document embeddings were produced
# before changing either side.
model = FlagModel('BAAI/bge-m3',
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=True)

# Embed the test queries; query_ids[i] is the id of queries[i].
# The result is cast to float32, the same dtype used for the document matrix.
queries = test_df['query'].tolist()
query_ids = test_df['id'].tolist()
query_embeddings = model.encode(queries).astype('float32')


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 8/8 [02:21<00:00, 17.73s/it] 


In [6]:
%%time

k = 100  # Number of nearest neighbors to retrieve

# Step 1: Normalize document embeddings (L2 normalization is assumed in SCaNN)
normalized_doc_embeddings = doc_embeddings / np.linalg.norm(doc_embeddings, axis=1, keepdims=True)

# Step 2: Initialize SCaNN index
d = normalized_doc_embeddings.shape[1]  # Dimensionality of embeddings
index = scann.scann_ops_pybind.builder(normalized_doc_embeddings, k, "dot_product").tree(
    num_leaves=200, num_leaves_to_search=50, training_sample_size=250000
).score_ah(2, anisotropic_quantization_threshold=0.2).build()

# Step 3: Normalize query embeddings
normalized_query_embeddings = query_embeddings / np.linalg.norm(query_embeddings, axis=1, keepdims=True)

# Step 4: Perform the search
indices, distances = index.search_batched(normalized_query_embeddings)

CPU times: user 51min 20s, sys: 1min 10s, total: 52min 31s
Wall time: 3min 40s


In [7]:
# Translate the row positions returned by the search back into chunk IDs,
# keeping one ranked list (best hit first) per query.
top_k_chunks = {}
for i, query_id in enumerate(query_ids):
    top_k_chunks[query_id] = [doc_ids[idx] for idx in indices[i]]

# Same information as a DataFrame, in the original query order.
chunk_lists = [top_k_chunks[qid] for qid in query_ids]
top_k_chunks_df = pd.DataFrame({'id': query_ids, 'chunkids': chunk_lists})

# Preview the first few queries.
top_k_chunks_df.head()

Unnamed: 0,id,chunkids
0,0,"[doc-en-0_chunk_52, doc-en-0_chunk_53, doc-en-..."
1,1,"[doc-en-16_chunk_228, doc-en-16_chunk_224, doc..."
2,2,"[doc-it-14111_chunk_22, doc-en-32_chunk_34, do..."
3,3,"[doc-en-44779_chunk_4, doc-en-1915_chunk_43, d..."
4,4,"[doc-en-56_chunk_56, doc-en-56_chunk_52, doc-e..."


In [8]:
def rank_and_aggregate(df, aggregate_num=10):
    """Fuse each query's ranked chunk hits into a ranked list of documents.

    A chunk at 1-based position ``rank`` contributes ``1 / rank`` to its parent
    document. Documents are then ordered by their summed score (highest first);
    ties keep the order in which the documents were first seen.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per query with columns ``'id'`` (query id) and ``'chunkids'``
        (iterable of chunk-id strings, best hit first). Chunk ids are expected
        to look like ``<doc_id>_chunk_<n>``; an id without that suffix is
        treated as a document id in its own right.
    aggregate_num : int, default 10
        Maximum number of documents to keep per query.

    Returns
    -------
    pandas.DataFrame
        Columns ``'id'`` and ``'docids'`` (list of document ids, best first).
    """
    results = {}
    # Loop through each query's candidates
    for _, row in df.iterrows():
        doc_scores = defaultdict(float)
        query_id = row['id']
        chunks = row['chunkids']

        for rank, chunk_id in enumerate(chunks, start=1):
            # Split on the LAST "_chunk_" marker. Slicing split("_")[:-2]
            # silently produced "" (or a truncated id) whenever the suffix was
            # missing, merging unrelated hits under one bogus document.
            doc_id, marker, _ = chunk_id.rpartition("_chunk_")
            if not marker:
                doc_id = chunk_id

            # Reciprocal-rank weight: earlier hits count more.
            doc_scores[doc_id] += 1 / rank

        # Keep the `aggregate_num` best documents by cumulative score.
        top_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:aggregate_num]
        results[query_id] = [doc for doc, _score in top_docs]

    # Convert results to DataFrame for easier access
    top_results_df = pd.DataFrame(list(results.items()), columns=['id', 'docids'])
    return top_results_df

# Collapse the chunk-level hits into the 10 best documents per query
# (10 is also the function's default cut-off, spelled out here for the reader).
top_results_df = rank_and_aggregate(top_k_chunks_df, aggregate_num=10)
top_results_df.head()  # preview the first few queries

Unnamed: 0,id,docids
0,0,"[doc-en-0, doc-en-794977, doc-en-14117, doc-en..."
1,1,"[doc-en-16, doc-en-806801, doc-en-822499, doc-..."
2,2,"[doc-en-32, doc-it-14111, doc-en-659327, doc-e..."
3,3,"[doc-en-44779, doc-en-1915, doc-en-40, doc-de-..."
4,4,"[doc-en-56, doc-en-772504, doc-en-36769, doc-e..."


In [9]:
# Where the submission file is written.
submission_path = '/kaggle/working/submission.csv'

# The submission has exactly the columns of the aggregated results: 'id', 'docids'.
submission_df = pd.DataFrame(top_results_df)

# Write without the positional index so the CSV contains only those two columns.
submission_df.to_csv(submission_path, index=False)