In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import json

# Read both JSON inputs as UTF-8 text and parse them in full.
# corpus_data: parsed contents of the legal corpus file.
with open("/kaggle/input/legal-corpus/legal_corpus.json", encoding="utf-8") as corpus_file:
    corpus_data = json.loads(corpus_file.read())

# query_data: parsed contents of the public test (query) file.
with open("/kaggle/input/public-test-for-corpus/public_test.json", encoding="utf-8") as query_file:
    query_data = json.loads(query_file.read())

In [None]:
from sentence_transformers import SentenceTransformer

# Embedding model used for both queries and passages.
# trust_remote_code=True allows code shipped with the model repository to run
# locally, so only enable it for sources you trust.
model = SentenceTransformer(
    "jinaai/jina-embeddings-v3",
    trust_remote_code=True,
)

In [None]:

# Flatten the nested corpus (each entry holds a "content" list of articles)
# into one list of {"aid", "content"} records.
corpus_articles = [
    {"aid": entry["aid"], "content": entry["content_Article"]}
    for law in corpus_data
    for entry in law["content"]
]

# Text inputs for the encoder: one string per query ...
questions = [entry["question"] for entry in query_data]

# ... and one string per article, cut to the first max_chars characters.
max_chars = 1024
documents = [record["content"][:max_chars] for record in corpus_articles]


# Embed the queries using the model's "retrieval.query" task and prompt.
query_embeddings = model.encode(
    questions,
    convert_to_numpy=True,
    prompt_name="retrieval.query",
    task="retrieval.query",
)

# Embed the (truncated) articles using the "retrieval.passage" task and prompt.
# A small batch size is used here; lower it further if encoding runs out of memory.
corpus_embeddings = model.encode(
    documents,
    batch_size=16,
    show_progress_bar=True,
    convert_to_numpy=True,
    prompt_name="retrieval.passage",
    task="retrieval.passage",
)

# Optional lookup table: article id -> its embedding row.
# Rows of corpus_embeddings are paired with corpus_articles by position.
aid_to_embedding = {}
for record, vector in zip(corpus_articles, corpus_embeddings):
    aid_to_embedding[record["aid"]] = vector

# Report the shape of both embedding matrices.
for label, matrix in (
    ("Query embeddings shape:", query_embeddings),
    ("Corpus embeddings shape:", corpus_embeddings),
):
    print(label, matrix.shape)

In [None]:
!pip install -q faiss-cpu rank_bm25


In [None]:
import numpy as np
import faiss
import json
from rank_bm25 import BM25Okapi

# ------------ 1. Prepare Your Data ------------
# Inputs produced by earlier cells:
#   corpus_articles  : list of {"aid": ..., "content": ...}
#   query_data       : list of dicts; this cell reads the "id" and "question" keys
#   corpus_embeddings / query_embeddings : 2-D arrays, one row per article / per query


def _min_max_normalize(scores):
    """Rescale a 1-D score array to [0, 1]; the epsilon guards against a zero range."""
    lowest = scores.min()
    return (scores - lowest) / (scores.max() - lowest + 1e-8)


# Step 1: BM25 over whitespace-tokenized article text
tokenized_corpus = [doc['content'].split() for doc in corpus_articles]
bm25 = BM25Okapi(tokenized_corpus)

# Step 2: FAISS
# np.array(...).astype(...) makes fresh float32 copies, so the in-place
# normalization below does not touch the arrays created by earlier cells.
corpus_embeddings = np.array(corpus_embeddings).astype("float32")
query_embeddings = np.array(query_embeddings).astype("float32")

# After L2 normalization, inner product equals cosine similarity.
faiss.normalize_L2(corpus_embeddings)
faiss.normalize_L2(query_embeddings)

dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(corpus_embeddings)

# ------------ 2. Hybrid Search ------------
top_k = 50
alpha = 0.5  # Weight of the BM25 score; (1 - alpha) is the embedding weight

n_docs = len(corpus_articles)
top_k_results = []

for i, query in enumerate(query_data):
    tokenized_query = query["question"].split()

    # BM25 scores, indexed by corpus position
    bm25_scores = np.array(bm25.get_scores(tokenized_query))

    # Embedding similarity scores for the whole corpus.
    # index.search returns hits sorted by similarity, with the matching corpus
    # positions in I. Scatter them back so embedding_scores[j] belongs to
    # corpus_articles[j] and lines up element-wise with bm25_scores.
    D, I = index.search(query_embeddings[i].reshape(1, -1), n_docs)
    embedding_scores = np.empty(n_docs, dtype=D.dtype)
    embedding_scores[I[0]] = D[0]

    # Normalize both score vectors to a comparable [0, 1] range
    bm25_norm = _min_max_normalize(bm25_scores)
    embed_norm = _min_max_normalize(embedding_scores)

    # Combine them
    hybrid_scores = alpha * bm25_norm + (1 - alpha) * embed_norm

    # Get top-k documents (highest hybrid score first)
    top_indices = np.argsort(hybrid_scores)[::-1][:top_k]
    top_aids = [corpus_articles[j]["aid"] for j in top_indices]

    top_k_results.append({
        "qid": query["id"],
        "relevant_laws": top_aids
    })

# ------------ 3. Save to JSON ------------
# Write the per-query top-k lists as indented UTF-8 JSON, keeping non-ASCII text unescaped.
with open("hybrid_results.json", mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(top_k_results, ensure_ascii=False, indent=2))


In [None]:
pip install -q -U FlagEmbedding


In [None]:
from FlagEmbedding import FlagReranker
# Reranker used by the later scoring cell.
# NOTE: this rebinds `model`, which an earlier cell bound to the SentenceTransformer
# encoder; after this cell runs, `model` refers to the reranker.
model = FlagReranker(
    'BAAI/bge-reranker-v2-m3',
    use_fp16=True,
)


In [None]:
import os

import torch
import wandb

# Release cached GPU memory held by PyTorch's allocator before the next stage.
torch.cuda.empty_cache()

# SECURITY: never hardcode the W&B API key in the notebook. The key that was
# previously committed here must be treated as leaked and revoked.
# The key is read from the WANDB_API_KEY environment variable; populate it from
# your secret store before running this cell. When the variable is unset,
# key=None is passed and wandb falls back to its own credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
    # NOTE(review): this cell starts indented with no enclosing statement in the
    # visible notebook, so it looks like the tail of a loop whose header was lost;
    # as written it cannot run on a fresh kernel. `pairs` and `aids` are not
    # defined in any visible cell — TODO restore the missing loop header/setup.
    with torch.no_grad():
        # Score every entry of `pairs` with `model` (last bound to the FlagReranker),
        # without tracking gradients.
        # NOTE(review): confirm that compute_score accepts `doc_type` — TODO verify.
        scores = model.compute_score(pairs, max_length=1024, doc_type="text")

    # Sort by descending score
    # NOTE(review): `reranked` is not stored anywhere in the visible code, while the
    # export cell reads `reranked_results` — presumably the lost loop collected it; confirm.
    reranked = sorted(zip(aids, scores), key=lambda x: x[1], reverse=True)


In [None]:
import json

# Reduce each reranked entry to the two exported fields: the query id
# (read from "id", written as "qid") and its ranked list of laws.
# NOTE(review): `reranked_results` is not defined in any visible cell — confirm it
# exists before running this cell.
simplified_results = []
for entry in reranked_results:
    simplified_results.append(
        {"qid": entry["id"], "relevant_laws": entry["relevant_laws"]}
    )

# Write the result as indented UTF-8 JSON, keeping non-ASCII text unescaped.
with open("reranked_results.json", mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(simplified_results, ensure_ascii=False, indent=2))
