<a href="https://colab.research.google.com/github/Suhail372/files_for_chatbot/blob/master/Vector_search_FAISS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/Suhail372/files_for_chatbot

Cloning into 'files_for_chatbot'...
remote: Enumerating objects: 135, done.[K
remote: Counting objects: 100% (135/135), done.[K
remote: Compressing objects: 100% (50/50), done.[K
remote: Total 135 (delta 92), reused 120 (delta 83), pack-reused 0 (from 0)[K
Receiving objects: 100% (135/135), 2.82 MiB | 15.88 MiB/s, done.
Resolving deltas: 100% (92/92), done.


In [2]:
!pip install sentence-transformers faiss-gpu


Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu, sentence-transformers
Successfully installed faiss-gpu-1.7.2 sentence-transformers-3.1.1


In [3]:
import os
# All relative paths used below (dataset, cache directory, test cases) are
# resolved against the cloned repository, so make it the working directory.
repo_root = '/content/files_for_chatbot'
os.chdir(repo_root)

In [1]:
import os
import json
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss


In [7]:

class VectorSearchWrapper:
    """Embedding-based search over the records of a JSON dataset using FAISS.

    On construction the wrapper loads cached embeddings from
    ``saved_vectors_path`` when both cache files exist; otherwise it embeds
    every dataset record that has an ``Id``, saves the result, and finally
    builds an exact L2 FAISS index over the vectors.

    Note: the cache is keyed only by ``location`` and is reused as-is even if
    the dataset or the embedding model changed; delete the cache files to force
    a rebuild.
    """

    def __init__(self, location="default"):
        """Load the model, then load or build the embeddings and index them.

        Args:
            location: Suffix used in the cache file names
                (``embeddings_{location}.npy`` / ``metadata_{location}.json``).
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.EMBED_MODEL = 'sentence-transformers/paraphrase-MiniLM-L3-v2'
        self.model = SentenceTransformer(self.EMBED_MODEL, device=self.device)
        self.json_file_path = 'combined files/cleaned_and_combined_hyd.json'
        self.saved_vectors_path = 'saved_vectors'
        self.location = location
        self.index = None
        self.embeddings = []
        # Maps a record "Id" to its full JSON record; filled lazily by search_faiss.
        self.id_to_entry = {}
        self.run()

    def embedding(self, text_data):
        """Encode text with the sentence-transformer and L2-normalise the result.

        Args:
            text_data: A single string or a list of strings.

        Returns:
            A NumPy array holding the normalised embedding(s); for a list of
            strings there is one row per string.
        """
        embedding = self.model.encode(text_data, convert_to_tensor=True, device=self.device)
        normalized_embedding = torch.nn.functional.normalize(embedding, p=2, dim=-1)
        return normalized_embedding.cpu().numpy()

    def preprocess_and_embed(self):
        """Build one embedding record per dataset entry that has an ``Id``.

        For each entry the ``Location`` inside ``text data`` is shortened to its
        last four comma-separated parts (when it has more than four) and the
        ``Name: ...`` fragment is removed before embedding.

        Returns:
            A list of dicts with keys ``"embedding"``, ``"text"`` and ``"id"``.
        """
        with open(self.json_file_path, 'r') as file:
            json_data = json.load(file)

        prepared = []
        for entry in json_data:
            entry_id = entry.get("Id", None)
            if entry_id is None:
                # Entries without an Id are never indexed, so skip them up front.
                continue

            address = entry['Location']
            terms = [term.strip() for term in address.split(',')]
            short_address = ', '.join(terms[-4:]) if len(terms) > 4 else address

            text_data = entry['text data'].replace(address, short_address)
            text_data = text_data.replace(f'Name: {entry["Name"]}', '')
            prepared.append((entry_id, text_data))

        if not prepared:
            return []

        # Encode all texts in one call so the model can batch them instead of
        # running one forward pass per record.
        vectors = self.embedding([text for _, text in prepared])
        return [
            {"embedding": vector, "text": text, "id": entry_id}
            for (entry_id, text), vector in zip(prepared, vectors)
        ]

    def save_embeddings(self):
        """Write the embeddings (.npy) and their ids/texts (.json) to the cache directory."""
        os.makedirs(self.saved_vectors_path, exist_ok=True)

        embeddings = np.vstack([entry["embedding"] for entry in self.embeddings])
        ids = [entry["id"] for entry in self.embeddings]
        texts = [entry["text"] for entry in self.embeddings]

        np.save(os.path.join(self.saved_vectors_path, f'embeddings_{self.location}.npy'), embeddings)
        with open(os.path.join(self.saved_vectors_path, f'metadata_{self.location}.json'), 'w') as f:
            json.dump({"ids": ids, "texts": texts}, f)

    def load_embeddings(self):
        """Load cached embeddings and metadata into ``self.embeddings``.

        Returns:
            True when both cache files exist and are mutually consistent,
            False otherwise (the caller then rebuilds the embeddings).
        """
        embeddings_path = os.path.join(self.saved_vectors_path, f'embeddings_{self.location}.npy')
        metadata_path = os.path.join(self.saved_vectors_path, f'metadata_{self.location}.json')

        if not (os.path.exists(embeddings_path) and os.path.exists(metadata_path)):
            return False

        embeddings = np.load(embeddings_path)
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)

        texts, ids = metadata["texts"], metadata["ids"]
        if not (len(embeddings) == len(texts) == len(ids)):
            # The two cache files disagree; report "no cache" instead of
            # silently truncating to the shortest of the three sequences.
            return False

        self.embeddings = [
            {"embedding": emb, "text": text, "id": entry_id}
            for emb, text, entry_id in zip(embeddings, texts, ids)
        ]
        return True

    def create_faiss_index(self, dimension=None):
        """Create an empty exact-L2 FAISS index.

        Args:
            dimension: Vector size of the index. When omitted it is taken from
                the loaded model, falling back to 384 (the size the original
                code assumed for MiniLM) if the model does not report one.
        """
        if dimension is None:
            dimension = self.model.get_sentence_embedding_dimension() or 384
        self.index = faiss.IndexFlatL2(int(dimension))

    def insert_data(self):
        """Build a fresh index sized to the stored vectors and add them all to it."""
        # FAISS expects a C-contiguous float32 matrix.
        embeddings = np.ascontiguousarray(
            np.vstack([entry["embedding"] for entry in self.embeddings]),
            dtype=np.float32,
        )
        self.create_faiss_index(embeddings.shape[1])
        self.index.add(embeddings)

    def run(self):
        """Load cached embeddings or build and cache them, then index them.

        Raises:
            ValueError: If the dataset yields no record to embed.
        """
        if not self.load_embeddings():
            print("Embeddings not found. Preprocessing and creating new embeddings.")
            self.embeddings = self.preprocess_and_embed()
            if not self.embeddings:
                raise ValueError(
                    f"No entries with an 'Id' found in {self.json_file_path!r}; nothing to index."
                )
            self.save_embeddings()
        else:
            print("Embeddings loaded from saved files.")
        self.insert_data()

    def _load_entries_by_id(self):
        """Read the dataset once and return a mapping of ``Id`` to the full record."""
        with open(self.json_file_path, 'r') as file:
            data = json.load(file)
        # For duplicate ids the later record wins, like the linear scan it replaces.
        return {entry["Id"]: entry for entry in data if entry.get("Id") is not None}

    def search_faiss(self, query, k=3):
        """Return up to ``k`` nearest records for ``query``.

        Args:
            query: Free-text query string.
            k: Maximum number of neighbours to return.

        Returns:
            A list of ``{"id": ..., "text": ...}`` dicts ordered as returned by
            the index. ``text`` is the record's full ``text data`` from the
            dataset, or the embedded text when the id is not in the dataset.
        """
        query_embedding = np.ascontiguousarray(
            self.embedding(query).reshape(1, -1), dtype=np.float32
        )
        _distances, indices = self.index.search(query_embedding, k)

        # Parse the dataset once and reuse it, instead of re-reading the JSON
        # file on every query.
        if not self.id_to_entry:
            self.id_to_entry = self._load_entries_by_id()

        results = []
        for idx in indices[0]:
            idx = int(idx)
            if idx < 0:
                # FAISS pads with -1 when fewer than k vectors are available;
                # indexing a list with -1 would wrongly return the last entry.
                continue
            stored = self.embeddings[idx]
            record = self.id_to_entry.get(stored["id"])
            text = record["text data"] if record is not None else stored["text"]
            results.append({"id": stored["id"], "text": text})

        return results


In [8]:
# Build the search wrapper; "default" selects the default-named cache files
# under saved_vectors/ (embeddings_default.npy / metadata_default.json).
vector_search = VectorSearchWrapper(location="default")

Embeddings loaded from saved files.


In [9]:

# Smoke test: run a single free-text query and show the three closest records.
query = "Schools in secunderabad"
results = vector_search.search_faiss(query, k=3)
print(results)


[{'id': 671, 'text': 'Name: Hi Tech Modern High School~Category: Public Schools~Location: Plot No. 114 & 115, Opp. Military Dairy Farm, Swarnadhama Nagar, Old Bowenpally, Secunderabad, Hyderabad, Telangana 500015, India~Faculty: Aishwarya pokuri Director / PG Diploma in Hospital Management Swathi Pokuri Director / PG~Sports: Athletics, Basketball, Karate, Kho-Kho, Skating, Swimming, Throwball, Volleyball, Yoga~Amenities: Transport, Medical Facility, Laboratory, Smart Classrooms, Computers Facility, Library~Board: CBSE, State Board~Years: -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10~Fee: 25000, 37000~Since: 1999~Strength: 450~'}, {'id': 1000, 'text': 'Name: Johnson Grammar School IBDP~Category: Day Schools~Location: Senior Wing, Plot No-A, 16, Nacharam - Mallapur Rd, Baba Nagar, Nacharam, Hyderabad, Secunderabad, Telangana 500076, India~Faculty: ~Sports: ~Amenities: Transport~Board: International Baccalaureate, International Baccalaureate~Years: 11, 12~Fee: 400000~Since: 2008~Strength: Not Avai

In [10]:

# Load the evaluation set; the scoring cell below reads a 'query' string and
# the acceptable record ids under 'ans' from each case.
with open('testcases/two variable testcases.json', 'r') as testcase_file:
    test_data = json.load(testcase_file)
print(len(test_data))


1131


In [11]:

# p_count    - retrieved ids that appear in the expected answers
# n_count    - retrieved ids that do not
# crct_count - queries with at least one expected id among the retrieved ones
p_count = 0
n_count = 0
crct_count = 0

for case in test_data:
    retrieved_ids = [hit['id'] for hit in vector_search.search_faiss(case['query'], k=3)]
    hits = sum(1 for retrieved_id in retrieved_ids if retrieved_id in case['ans'])

    p_count += hits
    n_count += len(retrieved_ids) - hits
    if hits > 0:
        crct_count += 1

print(p_count, n_count, crct_count)


1566 1827 774
