<a href="https://colab.research.google.com/github/Suhail372/files_for_chatbot/blob/master/Vector_search_FAISS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/Suhail372/files_for_chatbot

Cloning into 'files_for_chatbot'...
remote: Enumerating objects: 194, done.[K
remote: Counting objects: 100% (194/194), done.[K
remote: Compressing objects: 100% (82/82), done.[K
remote: Total 194 (delta 131), reused 160 (delta 109), pack-reused 0 (from 0)[K
Receiving objects: 100% (194/194), 12.38 MiB | 13.31 MiB/s, done.
Resolving deltas: 100% (131/131), done.


In [3]:
!pip install sentence-transformers faiss-gpu


Collecting sentence-transformers
  Using cached sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu, sentence-transformers
Successfully installed faiss-gpu-1.7.2 sentence-transformers-3.1.1


In [4]:
import os
# Work from inside the cloned repository so the relative paths used later in
# this notebook ('combined files/...', 'testcases/...') resolve.
# NOTE(review): '/content' is presumably the Colab working directory the clone
# landed in — confirm when running outside Colab.
os.chdir('/content/files_for_chatbot')

In [5]:
import os
import json
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss


  from tqdm.autonotebook import tqdm, trange


In [6]:
class VectorSearchWrapper:
    """Semantic search over the Hyderabad and Bangalore listing files using FAISS.

    On construction the wrapper loads a sentence-transformers model, loads (or
    computes and saves) one L2-normalised embedding per listing for both
    cities, and builds one flat L2 FAISS index per city. ``search_faiss``
    queries whichever city ``location_is_hyd`` currently selects.
    """

    def __init__(self, location_is_hyd=False):
        """Load the model and build both indexes.

        Args:
            location_is_hyd: True to search the Hyderabad index, False to
                search the Bangalore index. May be reassigned between queries.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.EMBED_MODEL = 'sentence-transformers/paraphrase-MiniLM-L3-v2'
        self.location_is_hyd = location_is_hyd

        # Paths for Hyderabad and Bangalore JSON files
        self.hyd_json_file_path = 'combined files/cleaned_and_combined_hyd.json'
        self.blore_json_file_path = 'combined files/cleaned_and_combined_blore.json'

        self.model = SentenceTransformer(self.EMBED_MODEL, device=self.device)
        self.saved_vectors_path = 'saved_vectors'
        self.index_hyd = None
        self.index_blore = None
        self.embeddings_hyd = []
        self.embeddings_blore = []
        # Lazy cache filled by search_faiss:
        # source JSON path -> {entry "Id" -> original "text data"}.
        self.id_to_entry = {}
        self.run()

    def embedding(self, text_data):
        """Encode ``text_data`` with the model and return the L2-normalised result as a NumPy array."""
        embedding = self.model.encode(text_data, convert_to_tensor=True, device=self.device)
        normalized_embedding = torch.nn.functional.normalize(embedding, p=2, dim=-1)
        return normalized_embedding.cpu().numpy()

    def preprocess_and_embed(self, json_file_path):
        """Read a listings JSON file and embed every entry that has an "Id".

        Before embedding, the entry's address is shortened to its last four
        comma-separated parts (when it has more than four) and the
        ``Name: <name>`` fragment is stripped from the text.

        Returns:
            A list of dicts with keys "embedding", "text" (the text that was
            embedded) and "id".
        """
        embedded_list = []
        with open(json_file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        for entry in json_data:
            address = entry['Location']
            terms = [term.strip() for term in address.split(',')]
            shortened = ', '.join(terms[-4:]) if len(terms) > 4 else address

            entry['text data'] = entry['text data'].replace(address, shortened)
            text_data = entry["text data"].replace(f'Name: {entry["Name"]}', '')
            entry_id = entry.get("Id")

            # Entries without an Id cannot be mapped back to a listing, so skip them.
            if entry_id is not None:
                embedded_list.append({
                    "embedding": self.embedding(text_data),
                    "text": text_data,
                    "id": entry_id
                })

        return embedded_list

    def save_embeddings(self, embeddings, location_name):
        """Persist embeddings (.npy) and their ids/texts (.json) under ``saved_vectors_path``."""
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.saved_vectors_path, exist_ok=True)

        embeddings_array = np.vstack([entry["embedding"] for entry in embeddings])
        ids = [entry["id"] for entry in embeddings]
        texts = [entry["text"] for entry in embeddings]

        np.save(os.path.join(self.saved_vectors_path, f'embeddings_{location_name}.npy'), embeddings_array)
        metadata_path = os.path.join(self.saved_vectors_path, f'metadata_{location_name}.json')
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump({"ids": ids, "texts": texts}, f)

    def load_embeddings(self, location_name):
        """Load previously saved embeddings for ``location_name``.

        Returns:
            The same list-of-dicts structure ``preprocess_and_embed`` produces,
            or None when either saved file is missing.
        """
        embeddings_path = os.path.join(self.saved_vectors_path, f'embeddings_{location_name}.npy')
        metadata_path = os.path.join(self.saved_vectors_path, f'metadata_{location_name}.json')

        if not (os.path.exists(embeddings_path) and os.path.exists(metadata_path)):
            return None

        embeddings_array = np.load(embeddings_path)
        with open(metadata_path, 'r', encoding='utf-8') as f:
            metadata = json.load(f)

        return [
            {"embedding": emb, "text": text, "id": entry_id}
            for emb, text, entry_id in zip(embeddings_array, metadata["texts"], metadata["ids"])
        ]

    def create_faiss_index(self, dimension=None):
        """Create an empty flat L2 index.

        Args:
            dimension: Vector size. Defaults to the loaded model's embedding
                size, falling back to 384 (the MiniLM embedding size) when the
                model does not report one.
        """
        if dimension is None:
            dimension = self.model.get_sentence_embedding_dimension() or 384
        return faiss.IndexFlatL2(dimension)

    def insert_data(self, index, embeddings):
        """Add every entry's embedding to ``index`` in list order."""
        # FAISS expects a contiguous float32 matrix; enforce that explicitly
        # rather than relying on what was saved to disk.
        embeddings_array = np.ascontiguousarray(
            np.vstack([entry["embedding"] for entry in embeddings]), dtype=np.float32
        )
        index.add(embeddings_array)

    def run(self):
        """Load or build the embeddings for both cities, then fill both FAISS indexes."""
        self.embeddings_hyd = self.load_embeddings('hyd')
        self.embeddings_blore = self.load_embeddings('blore')

        if self.embeddings_hyd is None:
            print("Hyderabad embeddings not found. Preprocessing and creating new embeddings.")
            self.embeddings_hyd = self.preprocess_and_embed(self.hyd_json_file_path)
            self.save_embeddings(self.embeddings_hyd, 'hyd')
        if self.embeddings_blore is None:
            print("Bangalore embeddings not found. Preprocessing and creating new embeddings.")
            self.embeddings_blore = self.preprocess_and_embed(self.blore_json_file_path)
            self.save_embeddings(self.embeddings_blore, 'blore')

        self.index_hyd = self.create_faiss_index()
        self.index_blore = self.create_faiss_index()

        self.insert_data(self.index_hyd, self.embeddings_hyd)
        self.insert_data(self.index_blore, self.embeddings_blore)

    def _full_text_by_id(self, json_file_path):
        """Return {Id -> original "text data"} for a source file, reading it at most once."""
        lookup = self.id_to_entry.get(json_file_path)
        if lookup is None:
            with open(json_file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            # Later duplicates of an Id overwrite earlier ones.
            lookup = {
                entry['Id']: entry['text data']
                for entry in data
                if 'Id' in entry and 'text data' in entry
            }
            self.id_to_entry[json_file_path] = lookup
        return lookup

    def search_faiss(self, query, k=3):
        """Return up to ``k`` nearest listings for ``query`` in the selected city.

        Args:
            query: Free-text query to embed.
            k: Maximum number of neighbours to return.

        Returns:
            A list of ``{"id": ..., "text": ...}`` dicts, nearest first. "text"
            is the listing's original "text data" from the source JSON when the
            id is found there, otherwise the text that was embedded. Fewer than
            ``k`` items are returned when the index holds fewer than ``k`` vectors.
        """
        query_embedding = self.embedding(query).reshape(1, -1)
        if self.location_is_hyd:
            index, embeddings, json_file_path = self.index_hyd, self.embeddings_hyd, self.hyd_json_file_path
        else:
            index, embeddings, json_file_path = self.index_blore, self.embeddings_blore, self.blore_json_file_path

        _distances, indices = index.search(query_embedding, k)
        full_text = self._full_text_by_id(json_file_path)

        results = []
        for idx in indices[0]:
            idx = int(idx)
            # FAISS pads missing neighbours with -1; indexing with it would
            # silently return the last entry as a bogus hit.
            if idx < 0:
                continue
            hit = embeddings[idx]
            results.append({"id": hit["id"], "text": full_text.get(hit["id"], hit["text"])})

        return results

# Example usage:
# vector_search = VectorSearchWrapper(location_is_hyd=True)
# results = vector_search.search_faiss("query text")

In [7]:
# Build the search wrapper. The constructor calls run(), which loads saved
# embeddings (or computes and saves them) and fills both FAISS indexes.
vector_search = VectorSearchWrapper(location_is_hyd=True)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Hyderabad embeddings not found. Preprocessing and creating new embeddings.
Bangalore embeddings not found. Preprocessing and creating new embeddings.


In [9]:

# Smoke test: run a single query against the Bangalore index.
# The flag is flipped on the shared wrapper, so it stays on Bangalore for any
# later cell until it is set again.
vector_search.location_is_hyd = False

query = "Schools in Whitefield"
results = vector_search.search_faiss(query)
print(results)


[{'id': 1059, 'text': 'Name: Kidzee~Category: Play Schools~Location: No. 6 Green Park Opposite DTDC courier/Food World., Whitefield Main Rd, Whitefield, Bengaluru, Karnataka 560066~Faculty: Dr. Subhash Chandra Chairman / N/A~Sports: Yoga~Amenities: ~Board: Other Boards~Years: Nursery, LKG~Fee: 50000~Since: 2003~Strength: 85~'}, {'id': 2081, 'text': 'Name: Holy Cross School and PU College~Category: Public Schools~Location: Abhayadhama Road, Pattandur Agrahara, Whitefield, Whitefield, Bengaluru, Karnataka 560066~Faculty: K. JOHN JOSEPH CHAIRMAN / N/A~Sports: Athletics, Basketball, Carroms, Chess, Cricket, Football, Handball, Kabaddi, Kho-Kho, Table-tennis, Tennis, Throwball, Yoga~Amenities: Medical Facility, Laboratory, Computers Facility, Library~Board: CBSE~Years: LKG, UKG, 1, 2, 3, 4, 5, 6, 7, 8, 9~Fee: 50000~Since: 2002~Strength: Not Available~'}, {'id': 1568, 'text': 'Name: KiDeens~Category: Play Schools~Location: 7, 8 & 9,, 7, 8 & 9, Whitefield, Bengaluru, Karnataka 560066~Faculty:

In [None]:

# Load the evaluation cases; the scoring cell reads each case's 'query' and
# 'ans' fields.
with open('testcases/two variable testcases.json', 'r') as fh:
    test_data = json.loads(fh.read())
print(len(test_data))


1130


In [None]:

# Score the top-3 results of every test query.
# NOTE(review): searches whichever city `vector_search.location_is_hyd`
# currently selects — confirm that matches the test set.
p_count = 0     # retrieved ids that appear in the expected answers
n_count = 0     # retrieved ids that do not
crct_count = 0  # queries with at least one expected id among the results

for case in test_data:
    retrieved_ids = [hit['id'] for hit in vector_search.search_faiss(case['query'], k=3)]
    hits = sum(1 for retrieved_id in retrieved_ids if retrieved_id in case['ans'])

    p_count += hits
    n_count += len(retrieved_ids) - hits
    if hits > 0:
        crct_count += 1

print(p_count, n_count, crct_count)


1550 1840 765
