In [None]:
!pip install sentence-transformers faiss-cpu


In [3]:

import torch
from sentence_transformers import SentenceTransformer
import faiss
import json
import numpy as np


In [4]:

class VectorSearchWrapper:
    """Semantic search over a JSON catalogue using sentence embeddings and FAISS.

    On construction the wrapper loads a SentenceTransformer model, reads the
    JSON file, embeds the text of every record that has an ``Id`` and stores
    the vectors in a flat L2 FAISS index.  Vectors are L2-normalised, so the
    L2 ranking is equivalent to a cosine-similarity ranking
    (``||a - b||^2 = 2 - 2 * a.b`` for unit vectors).

    Args:
        json_file_path: Path of the JSON file holding the list of records.
        embed_model: Name or path of the SentenceTransformer model to load.

    Attributes:
        device: torch device used for encoding (CUDA when available).
        EMBED_MODEL: Name of the loaded embedding model.
        model: The SentenceTransformer instance.
        json_file_path: Path of the source JSON file.
        index: The FAISS index (``None`` until ``create_faiss_index`` runs).
        embeddings: List of ``{"embedding", "text", "id"}`` dicts; position
            ``i`` corresponds to row ``i`` of the FAISS index.
        id_to_entry: Initialised to an empty dict; not populated by this class.
    """

    def __init__(self,
                 json_file_path='combined files/cleaned_and_combined_hyd.json',
                 embed_model='sentence-transformers/all-MiniLM-L6-v2'):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.EMBED_MODEL = embed_model
        self.model = SentenceTransformer(self.EMBED_MODEL, device=self.device)
        self.json_file_path = json_file_path
        self.index = None
        self.embeddings = []
        self.id_to_entry = {}
        self.run()

    def embedding(self, text_data):
        """Encode text into L2-normalised float32 vector(s).

        Args:
            text_data: A single string or a list of strings.

        Returns:
            A numpy float32 array: shape ``(dim,)`` for a single string,
            ``(n, dim)`` for a list of ``n`` strings.
        """
        vectors = self.model.encode(text_data, convert_to_tensor=True, device=self.device)
        # FAISS only accepts float32, so normalise in that dtype.
        vectors = vectors.detach().to(torch.float32)
        unit_vectors = torch.nn.functional.normalize(vectors, p=2, dim=-1)
        return np.ascontiguousarray(unit_vectors.cpu().numpy(), dtype=np.float32)

    @staticmethod
    def _build_text(entry):
        """Return the text to embed for one record (address shortened, name removed)."""
        text = entry.get("text data", "") or ""
        address = entry.get("Location", "") or ""
        if address:
            terms = [term.strip() for term in address.split(',')]
            # Long addresses are reduced to their last four comma-separated parts.
            if len(terms) > 4:
                text = text.replace(address, ', '.join(terms[-4:]))
        name = entry.get("Name")
        if name is not None:
            text = text.replace(f'Name: {name}', '')
        return text

    def preprocess_and_embed(self):
        """Load the JSON file and embed every record that has an ``Id``.

        Records without an ``Id`` are skipped.  All texts are encoded in a
        single batched call instead of one model call per record.

        Returns:
            A list of ``{"embedding": ndarray, "text": str, "id": Any}`` dicts,
            in file order.  Empty when no record qualifies.
        """
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(self.json_file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        texts, ids = [], []
        for entry in json_data:
            entry_id = entry.get("Id", None)
            if entry_id is None:
                continue
            texts.append(self._build_text(entry))
            ids.append(entry_id)

        if not texts:
            return []

        vectors = self.embedding(texts)
        return [
            {"embedding": vector, "text": text, "id": entry_id}
            for vector, text, entry_id in zip(vectors, texts, ids)
        ]

    def create_faiss_index(self, dimension=None):
        """Create an empty flat L2 index.

        Args:
            dimension: Vector size.  When omitted it is taken from the model
                (384 for the default MiniLM model).

        Raises:
            ValueError: If the model cannot report its embedding dimension.
        """
        if dimension is None:
            dimension = self.model.get_sentence_embedding_dimension()
            if dimension is None:
                raise ValueError("Embedding dimension could not be determined from the model")
        self.index = faiss.IndexFlatL2(int(dimension))

    def insert_data(self):
        """Build a fresh index and add all vectors from ``self.embeddings``."""
        if not self.embeddings:
            # Nothing to add; np.vstack would raise on an empty list.
            self.create_faiss_index()
            return
        matrix = np.ascontiguousarray(
            np.vstack([entry["embedding"] for entry in self.embeddings]),
            dtype=np.float32,
        )
        self.create_faiss_index(matrix.shape[1])
        self.index.add(matrix)

    def run(self):
        """Embed the data file and (re)build the index."""
        self.embeddings = self.preprocess_and_embed()
        self.insert_data()

    def search_faiss(self, query, k=3):
        """Return up to ``k`` nearest entries for a free-text query.

        Args:
            query: Query string.
            k: Maximum number of results.

        Returns:
            A list of ``{"id", "text"}`` dicts ordered from nearest to
            farthest.  Fewer than ``k`` items are returned when the index
            holds fewer than ``k`` vectors; empty when ``k <= 0`` or nothing
            is indexed.
        """
        if k <= 0 or not self.embeddings:
            return []

        query_embedding = self.embedding(query).reshape(1, -1)
        _, indices = self.index.search(query_embedding, k)

        # FAISS pads missing neighbours with -1; without this filter a -1
        # would silently select the last stored entry.
        total = len(self.embeddings)
        results = []
        for idx in indices[0]:
            position = int(idx)
            if 0 <= position < total:
                hit = self.embeddings[position]
                results.append({"id": hit["id"], "text": hit["text"]})
        return results


In [5]:
# Instantiate the wrapper. The constructor ends by calling run(), so this cell
# reads the JSON file, embeds its entries and fills the FAISS index before
# returning.
vector_search = VectorSearchWrapper()

In [6]:

# Optional hook: an alternative search implementation can be bound onto the
# existing instance as shown on the next line.
# NOTE(review): `new_search_faiss` is not defined in the cells visible here;
# define it before uncommenting.
# vector_search.search_faiss = new_search_faiss.__get__(vector_search, VectorSearchWrapper)

# Smoke test: run one free-text query (k is left at the method's default) and
# print the returned list of {"id", "text"} hits.
query = "Schools in secunderabad"
results = vector_search.search_faiss(query)
print(results)


[{'id': 1912, 'text': '~Category: Public Schools~Location: Bolton Road, Opp. Tivoli Garden, Near JBS, Secunderabad-500003.~Faculty: ~Sports: Athletics, Tennis~Amenities: Laboratory, Computers Facility~Board: CBSE~Years: -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10~Fee: -1~Since: Not Available~Strength: Not Available~'}, {'id': 280, 'text': '~Category: Public Schools~Location: Beside Kalyani Theatre, Old Bowenpally, Hasmathpet, Secunderabad - 500011~Faculty: ~Sports: Athletics, Carroms, Chess, Karate, Skating, Yoga~Amenities: Transport, Laboratory, Smart Classrooms, Computers Facility, Library~Board: CBSE~Years: -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10~Fee: -1~Since: Not Available~Strength: Not Available~'}, {'id': 64, 'text': '~Category: Public Schools~Location: Saibaba Colony, Sitarampur, Bowenpally, Secunderabad - 500011.~Faculty: Mrs. T. Aruna Reddy Director~Sports: Athletics, Chess, Cricket, Football, Hockey, Swimming~Amenities: Transport, Laboratory, Smart Classrooms, Library~Board: CBSE~Year

In [7]:

# Test cases: load the evaluation records and report how many there are.
# JSON is UTF-8 by specification, so the encoding is stated explicitly rather
# than left to the platform default.
with open('testcases/two variable testcases.json', 'r', encoding='utf-8') as file:
    test_data = json.load(file)
print(len(test_data))


1131


In [8]:

# Evaluate retrieval over the test set using the top-5 hits per query.
#   p_count    - retrieved ids that appear in the case's expected answers
#   n_count    - retrieved ids that do not
#   crct_count - queries with at least one expected id among the hits
p_count = n_count = crct_count = 0

for test_case in test_data:
    hits = vector_search.search_faiss(test_case['query'], k=5)
    retrieved_ids = [hit['id'] for hit in hits]
    matched = sum(1 for retrieved_id in retrieved_ids if retrieved_id in test_case['ans'])

    p_count += matched
    n_count += len(retrieved_ids) - matched
    if matched:
        crct_count += 1

print(p_count, n_count, crct_count)


1157 4498 563
