In [None]:
# Install the third-party packages imported by the cells below:
# FAISS (GPU build), sentence-transformers and gdown.
!pip install faiss-gpu
!pip install sentence_transformers
!pip install gdown

In [None]:
# Location of the checkpoint that EmbeddingModel loads further down.
# Despite the variable name, this is a filesystem path under /kaggle/input.
model_name = "/kaggle/input/checkpoint-10400/checkpoint-10400"

In [None]:
import gdown

# Fetch the chunked-template JSON from Google Drive into the Kaggle working
# directory; quiet=False keeps gdown's progress output visible.
gdown.download(
    url="https://drive.google.com/uc?id=1_1tBqaJBGLALkpSAOmyJvvwE-QqiyRgxcw",
    output="/kaggle/working/template_chunking_flat.json",
    quiet=False,
)

In [None]:
import faiss
import numpy as np
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

class EmbeddingModel:
    """Thin wrapper around a ``SentenceTransformer`` that encodes texts in chunks."""

    def __init__(self, model_name):
        """Load the model for *model_name* and keep it on ``self.model``."""
        self.model = self.load_model(model_name)

    def load_model(self, model_name):
        """Return a ``SentenceTransformer`` constructed from *model_name*."""
        return SentenceTransformer(model_name)

    def encode_embeddings(self, texts, batch_size=32):
        """Encode *texts* chunk by chunk and stack the results row-wise.

        Args:
            texts: Sized, sliceable collection of inputs for ``model.encode``.
            batch_size: Number of items passed to ``model.encode`` per call.
                ``model.encode`` itself is called without a ``batch_size``
                argument, so its own default applies inside each chunk.

        Returns:
            A NumPy array made by ``np.vstack`` over the per-chunk outputs.
            When *texts* is empty, a zero-row float32 array whose width is
            the model's reported embedding dimension (0 if it reports none).

        Raises:
            ValueError: If *batch_size* is not positive.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size!r}")

        # len() rather than truthiness: texts may be a NumPy array.
        if len(texts) == 0:
            # np.vstack rejects an empty list, so build the empty result directly.
            dim = self.model.get_sentence_embedding_dimension() or 0
            return np.empty((0, dim), dtype=np.float32)

        chunk_starts = range(0, len(texts), batch_size)
        chunk_embeddings = [
            self.model.encode(texts[start:start + batch_size], convert_to_numpy=True)
            for start in tqdm(chunk_starts, desc="Encoding batches")
        ]
        return np.vstack(chunk_embeddings)

def read_json_file(file_path):
    """Parse the UTF-8 encoded JSON document at *file_path* and return it."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Build a flat inner-product FAISS index over the "text" field of every entry
# in the downloaded JSON file, then write the index to disk.
file_path = '/kaggle/working/template_chunking_flat.json'
json_data = read_json_file(file_path)

texts = [entry["text"] for entry in json_data]
embedding_model = EmbeddingModel(model_name)
dense_embeddings = embedding_model.encode_embeddings(texts, batch_size=256)
# FAISS works on C-contiguous float32 matrices; this returns the same array
# untouched when it already has that layout and dtype.
dense_embeddings = np.ascontiguousarray(dense_embeddings, dtype=np.float32)
embedding_dim = dense_embeddings.shape[1]
# NOTE(review): inner product equals cosine similarity only for L2-normalised
# vectors -- confirm the checkpoint normalises its outputs.
index = faiss.IndexFlatIP(embedding_dim)

index.add(dense_embeddings)

# NOTE(review): the filename mentions 23400 while model_name points at
# checkpoint-10400 -- confirm which checkpoint this index belongs to.
index_path = 'bge_m3_23400.bin'
faiss.write_index(index, index_path)

# Report the path that was actually written (the old message named 'bge_m3.bin').
print(f"FAISS index saved to '{index_path}'")

