In [6]:
import json
from langchain_community.vectorstores import FAISS
import os

In [7]:
with open("RawJSONData.json",'r') as file:
    data=json.load(file)


In [8]:
chunks=[]
for item in data:
    text=f"Name: {item['name']}\nLanguage: {item['language']}\nid: {item['id']}\nbio: {item['bio']}\nversion: {item['version']}"
    chunks.append(text)

print(len(chunks))

15840


In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np

model= SentenceTransformer("all-MiniLM-L6-v2")
embeddings=model.encode(chunks)
embeddings_np=np.array(embeddings)


In [10]:

print(embeddings.shape)

(15840, 384)


In [11]:
import faiss

dimension=embeddings_np.shape[1]
index=faiss.IndexFlatL2(dimension)

index.add(embeddings_np)

In [12]:
chunks[0]

'Name: Adeel Solangi\nLanguage: Sindhi\nid: V59OF92YF627HFY0\nbio: Donec lobortis eleifend condimentum. Cras dictum dolor lacinia lectus vehicula rutrum. Maecenas quis nisi nunc. Nam tristique feugiat est vitae mollis. Maecenas quis nisi nunc.\nversion: 6.1'

### dont

In [None]:
# from transformers import pipeline
# from langchain.prompts import PromptTemplate
# model_id='tiiuae/falcon-7b'


In [None]:
# text_generation_pipeline=pipeline(
#     "text-generation",
#     model=model_id,
#     model_kwargs={"torch_dtype":"auto"},
#     max_new_tokens=400, 
#     device=0

# )

### using LLAma

In [8]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt
from transformers import pipeline 
import json
from langchain_community.vectorstores import FAISS
import os
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

In [11]:
with open("RawJSONData.json",'r') as file:
    data=json.load(file)
chunks=[]
for item in data:
    text=f"Name: {item['name']}\nLanguage: {item['language']}\nid: {item['id']}\nbio: {item['bio']}\nversion: {item['version']}"
    chunks.append(text)
    
model= SentenceTransformer("all-MiniLM-L6-v2")
embeddings=model.encode(chunks, batch_size=512)

embeddings_np=np.array(embeddings)
# print(embeddings.shape)

dimension=embeddings_np.shape[1]
index=faiss.IndexFlatL2(dimension)
index.add(embeddings)
faiss.write_index(index, 'bio_index.faiss')
np.save('bios.npy', np.array(chunks))
print("✅ Embedding & FAISS Index Created Successfully 🚀🔥")
print(f"✅ Total Bios Embedded: {len(chunks)}")

✅ Embedding & FAISS Index Created Successfully 🚀🔥
✅ Total Bios Embedded: 15840


In [365]:
from sklearn.metrics.pairwise import cosine_similarity
import re
import random

most_relevant_bio=[]
faiss_index = faiss.read_index('bio_index.faiss')
chunks = np.load('bios.npy', allow_pickle=True)
query = "give me the details of aamir"

if len(query) > 10:
    query_embedding = model.encode([query])
    k = min(5, len(chunks)) 
    D, I = index.search(query_embedding, k=k)
    # print(D[0][0])
    if 0.9>D[0][0]<0.99:
        context = I[0][0]
        print(chunks[context])
    else:
        pass
else:
    print("Please write your question in more detail for better Answers")

if "and" in query.split():
    if "id" not in query.split() and "version" not in query.split():
        D, I = faiss_index.search(query_embedding, k=k)
        minimum_distance=D[0][0]
        for i in range(len(D[0])):
            if D[0][i]==minimum_distance:
                most_relevant_bio.append((D[0][i], chunks[I[0][i]]))
        print(len(most_relevant_bio))
        for dist, bio in most_relevant_bio:
            print("distance:", dist)
            print("bio", bio)
            print("\n")
    else:

        match_id=re.search(r'\b(?:id is|having id|id)\s+(\w+)', query, re.IGNORECASE)
        id=match_id.group(1)
        match_ver=re.search(r'\b(?:whos version is|version is|version|v is)\s+([\d]+(?:\.\d+)?)', query, re.IGNORECASE)
        version=match_ver.group(1)
        query_embedding = model.encode([query])
        k = len(chunks)
        D, I = faiss_index.search(query_embedding, k=min(15000, k))
        relevant_bios = [chunks[i] for i in I[0]]
        filtered_bios = [bio for bio in relevant_bios if id.lower() in bio.lower() and version.lower() in bio.lower()]
        print(len(filtered_bios))
        for bio in filtered_bios:
            print(bio)
            print("\n")
        # print(id, version)

else:
    alpha=[word for word in query.split() if any(c.isdigit() for c in word) and any(c.isalpha() for c in word)]
    if alpha:
        alpha=alpha[0]
        query_embedding = model.encode([query])
        k = len(chunks)
        D, I = faiss_index.search(query_embedding, k=min(15000, k))
        relevant_bios = [chunks[i] for i in I[0]]
        filtered_bios = [bio for bio in relevant_bios if alpha.lower() in bio.lower()]
        # print(len(filtered_bios))
        for bio in filtered_bios:
            print(bio)
            print("\n")

    match=re.search(r'\b(?:language is|speaks|speaking|knows)\s+(\w+)', query, re.IGNORECASE)
    if match:
        language = match.group(1) if match else None
        if language:
            query_embedding = model.encode([query])
            k = len(chunks)
            D, I = faiss_index.search(query_embedding, k=min(15000, k))
            relevant_bios = [chunks[i] for i in I[0]]
            filtered_bios = [bio for bio in relevant_bios if language.lower() in bio.lower()]
            # print(len(filtered_bios))
            for bio in filtered_bios:
                print(bio)
                print("\n")

    if "id" in query.split():
        match=re.search(r'\b(?:id is|having id|id)\s+(\w+)', query, re.IGNORECASE)
        id=match.group(1)
        # print(id)
        query_embedding = model.encode([query])
        k = len(chunks)
        D, I = faiss_index.search(query_embedding, k=min(15000, k))
        relevant_bios = [chunks[i] for i in I[0]]
        filtered_bios = [bio for bio in relevant_bios if id in bio]
        # print(len(filtered_bios))
        for bio in filtered_bios:
            print(bio)
            print("\n")

    if "version" in query.split():
        match=re.search(r'\b(?:whos version is|version is|version|v is)\s+([\d]+(?:\.\d+)?)', query, re.IGNORECASE)
        version=match.group(1)
        # print(version)
        query_embedding = model.encode([query])
        k = len(chunks)
        D, I = faiss_index.search(query_embedding, k=min(15000, k))
        relevant_bios = [chunks[i] for i in I[0]]
        filtered_bios = [bio for bio in relevant_bios if version in bio]
        # print(len(filtered_bios))
        for bio in filtered_bios:
            print(bio)
            print("\n")


# corrected version 2.0

fully working correctly

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import re
import spacy
from rapidfuzz import process
import faiss
import numpy as np
import json
import json
import os
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss


model= SentenceTransformer("all-MiniLM-L6-v2")

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

most_relevant_bio=[]
faiss_index = faiss.read_index('bio_index.faiss')
chunks = np.load('bios.npy', allow_pickle=True)

with open("RawJSONData.json", "r") as file:
    data_base = json.load(file)

name_list = [entry["name"] for entry in data_base]
language_list=list(set([entry["language"] for entry in data_base]))
bio_list=[entry["bio"] for entry in data_base]

def find_relevance_name(filtered_query):
    highest_match = None
    highest_score = 0
    for name in filtered_query.split():
        best_match_name = process.extractOne(name, name_list, score_cutoff=70)
        best_match_language = process.extractOne(name, language_list, score_cutoff=70)

        # print(best_match_name, best_match_language)
        if best_match_name:
            match_name, score, _ = best_match_name
            if score > highest_score:
                highest_score = score
                highest_match = match_name
        elif best_match_language:
            match_name, score, _ = best_match_language
            if score > highest_score:
                highest_score = score
                highest_match = match_name 
        else:
            pass
        
    if highest_match:
        query_embedding = model.encode([query])
        k = len(chunks)
        D, I = faiss_index.search(query_embedding, k=min(len(chunks), k))
        relevant_bios = [chunks[i] for i in I[0]]
        filtered_bios = [bio for bio in relevant_bios if highest_match in bio]
        for bio in filtered_bios:
            print(bio)
            print("\n")
    else:
        print("Please write the question properly")


def find_relevance_id(id_number):
    for id in id_number:
        query_embedding = model.encode([id])
        k = len(chunks)
        D, I = faiss_index.search(query_embedding, k=min(len(chunks), k))
        relevant_bios = [chunks[i] for i in I[0]]
        filtered_bios = [bio for bio in relevant_bios if id in bio]
        for bio in filtered_bios:
            print(bio)
            print("\n")
        if len(filtered_bios)==0:
            print("No Such ID")

def remove_stop_words(query):
    doc = nlp(query)
    grammar_words = [token.text.lower() for token in doc if token.is_stop]
    return grammar_words



query = "Sindhi"

best_match_bio = process.extractOne(query, bio_list, score_cutoff=90)

if best_match_bio:
    query_embedding = model.encode([query])
    k = len(chunks)
    D, I = faiss_index.search(query_embedding, k=min(len(chunks), k))
    relevant_bios = [chunks[i] for i in I[0]]
    filtered_bios = [bio for bio in relevant_bios if query in bio]
    for bio in filtered_bios:
        print(bio)
        print("\n")
else:
    grammer_words=remove_stop_words(query=query)
    filter_query=[words.lower() for words in query.split()]
    filter_query=[word.capitalize() for word in filter_query if word not in grammer_words]
    filter_query=" ".join(filter_query)
    # print(filter_query)
    id=False
    id_number=[]
    for words in filter_query.split():
        if not words.isalpha():
            id_number.append(words.upper())
            id=True
    if id:
        find_relevance_id(id_number)
    else:
        find_relevance_name(filter_query)
    


Name: Rashid Rajput
Language: Sindhi
id: UNBGUGDUATATCLS4
bio: Donec congue sapien vel euismod interdum. Maecenas quis nisi nunc.
version: 8.51


Name: Rashid Rajput
Language: Sindhi
id: UNBGUGDUATATCLS4
bio: Donec congue sapien vel euismod interdum. Maecenas quis nisi nunc.
version: 8.51


Name: Rashid Rajput
Language: Sindhi
id: UNBGUGDUATATCLS4
bio: Donec congue sapien vel euismod interdum. Maecenas quis nisi nunc.
version: 8.51


Name: Rashid Rajput
Language: Sindhi
id: UNBGUGDUATATCLS4
bio: Donec congue sapien vel euismod interdum. Maecenas quis nisi nunc.
version: 8.51


Name: Rashid Rajput
Language: Sindhi
id: UNBGUGDUATATCLS4
bio: Donec congue sapien vel euismod interdum. Maecenas quis nisi nunc.
version: 8.51


Name: Rashid Rajput
Language: Sindhi
id: UNBGUGDUATATCLS4
bio: Donec congue sapien vel euismod interdum. Maecenas quis nisi nunc.
version: 8.51


Name: Rashid Rajput
Language: Sindhi
id: UNBGUGDUATATCLS4
bio: Donec congue sapien vel euismod interdum. Maecenas quis nisi 

# deciding the dataset or routing the query to datasets

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import re
import spacy
from rapidfuzz import process
import faiss
import numpy as np
import json
import os
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

key_value_store = {}


with open("archive/US_recipes.json",'r') as file:
    data_set=json.load(file)



  from .autonotebook import tqdm as notebook_tqdm


# ALL IN ONE

### Rag system of all in one

In [10]:
import re

def remove_stop_words(query):
    doc = nlp(query)
    grammar_words = [token.text.lower() for token in doc if token.is_stop]
    return grammar_words


def removing_unecessary_words(query):
    query = query.lower().strip().replace('"', '')
    grammer_words=remove_stop_words(query=query)
    filter_query=[words.lower() for words in query.split()]
    filter_query=[word.capitalize() for word in filter_query if word not in grammer_words]
    filter_query=" ".join(filter_query)
    # print(filter_query)
    return filter_query


def is_id(query):
    return bool(re.fullmatch(r"(?=.*[A-Za-z])(?=.*\d)[A-Za-z0-9]{16}", query))

def customers_data(query):

    print("in customer data")

    # most_relevant_bio=[]
    # faiss_index = faiss.read_index('bio_index.faiss')
    # chunks = np.load('bios.npy', allow_pickle=True)

    # with open("RawJSONData.json", "r") as file:
    #     data_base = json.load(file)

    # name_list = [entry["name"] for entry in data_base]
    # language_list=list(set([entry["language"] for entry in data_base]))
    # bio_list=[entry["bio"] for entry in data_base]

    # def find_relevance_name(filtered_query):
    #     highest_match = None
    #     highest_score = 0
    #     for name in filtered_query.split():
    #         best_match_name = process.extractOne(name, name_list, score_cutoff=70)
    #         best_match_language = process.extractOne(name, language_list, score_cutoff=70)

    #         # print(best_match_name, best_match_language)
    #         if best_match_name:
    #             match_name, score, _ = best_match_name
    #             if score > highest_score:
    #                 highest_score = score
    #                 highest_match = match_name
    #         elif best_match_language:
    #             match_name, score, _ = best_match_language
    #             if score > highest_score:
    #                 highest_score = score
    #                 highest_match = match_name 
    #         else:
    #             pass

    #     if highest_match:
    #         query_embedding = model.encode([query])
    #         k = len(chunks)
    #         D, I = faiss_index.search(query_embedding, k=min(len(chunks), k))
    #         relevant_bios = [chunks[i] for i in I[0]]
    #         filtered_bios = [bio for bio in relevant_bios if highest_match in bio]
    #         for bio in filtered_bios:
    #             print(bio)
    #             print("\n")
    #     else:
    #         print("Please write the question properly")
    
    # def find_relevance_id(id_number):
    #     for id in id_number:
    #         query_embedding = model.encode([id])
    #         k = len(chunks)
    #         D, I = faiss_index.search(query_embedding, k=min(len(chunks), k))
    #         relevant_bios = [chunks[i] for i in I[0]]
    #         filtered_bios = [bio for bio in relevant_bios if id in bio]
    #         for bio in filtered_bios:
    #             print(bio)
    #             print("\n")
    #         if len(filtered_bios)==0:
    #             pass

    # best_match_bio = process.extractOne(query, bio_list, score_cutoff=80)
    # # print(best_match_bio)

    # if best_match_bio:
    #     query_embedding = model.encode([query])
    #     k = len(chunks)
    #     D, I = faiss_index.search(query_embedding, k=min(len(chunks), k))
    #     relevant_bios = [chunks[i] for i in I[0]]
    #     # print(query)
    #     filtered_bios = [bio for bio in relevant_bios if best_match_bio[0] in bio]
    #     for bio in filtered_bios:
    #         print(bio)
    #         print("\n")
    # else:
    #     grammer_words=remove_stop_words(query=query)
    #     filter_query=[words.lower() for words in query.split()]
    #     filter_query=[word.capitalize() for word in filter_query if word not in grammer_words]
    #     filter_query=" ".join(filter_query)
    #     # print(filter_query)
    #     id=False
    #     id_number=[]
    #     for words in filter_query.split():
    #         if not words.isalpha():
    #             id_number.append(words.upper())
    #             id=True
    #     if id:
    #         find_relevance_id(id_number)
    #     else:
    #         find_relevance_name(filter_query)
#<----------------------------------------end of customer data------------------------------------------------------>

def us_data(query):
    print("in US data")

    # cuisine_list = [data.get("cuisine", 0) for data in data_base.values()]
    # Continent_list = [data.get("Contient", 0) for data in data_base.values()]
    # # Country_State_list = [data.get("Country_State", 0) for data in data_base.values()] because of US
    # Title_list = [data.get("title", 0) for data in data_base.values()]
    # ingrediants_list=[data.get("ingredients", 0) for data in data_base.values()] 


    # def remove_stop_words(query):
    #     doc = nlp(query)
    #     comparison_pattern = re.compile(r'\b(?:more|greater|less|below|above|over|under|equal|equals|and|or|made|contain|has|have|>=|<=|>|<)\b')
    #     grammar_words = []
    #     for token in doc:
    #         if token.is_stop and not (comparison_pattern.search(token.text.lower()) or token.like_num):
    #             grammar_words.append(token.text.lower())
        
    #     # print(grammar_words)
    #     return grammar_words

    # def find_time(query):
    #     # print("inside time")
    #     time_mapping = {
    #     "total_time": ["total duration", "overall time", "complete time", "total cook time"],
    #     "prep_time": ["preparation time", "prep duration", "time to prepare", "prep period"],
    #     "cook_time": ["cooking time", "cook duration", "time to cook", "cook period"],
    #     }

    #     best_match, best_score, best_key = None, 0, None
    #     for time, synonyms in time_mapping.items():
    #         match = process.extractOne(query, synonyms)  # Fuzzy match
    #         if match and match[1] > best_score:
    #             best_match, best_score, best_key = match[0], match[1], time
    #     if best_key:
    #         best_key=best_key
            
    #     print(best_key)
    #     match = re.search(r'\d+(\.\d+)?', query)
    #     if match:
    #         time=float(match.group())
    #         if time:
    #             for key, bio in data_base.items():
    #                 time_stramps=bio.get(best_key, 0)
    #                 if re.search(r'\b(less|atmost|max|maximum)\b', query):
    #                     if time<time_stramps:
    #                         for key, value in bio.items():
    #                             print(f"{key}: {value}")
    #                         print("\n")
    #                 elif re.search(r'\b(atmost|max|maximum)\b', query):
    #                     if time<=time_stramps:
    #                         for key, value in bio.items():
    #                             print(f"{key}: {value}")
    #                         print("\n")
    #                 elif re.search(r'\b(more)\b', query):
    #                     if time>time_stramps:
    #                         for key, value in bio.items():
    #                             print(f"{key}: {value}")
    #                         print("\n")
    #                 elif re.search(r'\b(atleast|min|minimum)\b', query):
    #                     if time>time_stramps:
    #                         for key, value in bio.items():
    #                             print(f"{key}: {value}")
    #                         print("\n")
    #                 else:
    #                     if time==time_stramps:
    #                         for key, value in bio.items():
    #                             print(f"{key}: {value}")
    #                         print("\n")
    # def find_serve(query):
    #     print(query)
    #     print("inside serve")
    #     match=re.search(r'\d+(\.\d+)?', query)
    #     if match:
    #         serve=int(match.group())
    #     for key, bio in data_base.items():
    #         serves=bio.get("serves", 0)
    #         if f"{serve} servings"==serves:
    #             for key, value in bio.items():
    #                 print(f"{key}: {value}")
    #             print("\n")

    # def find_ratting(query, data_base):
    #     print(query)
    #     match = re.search(r'\d+(\.\d+)?', query)
    #     if match:
    #         ratting=float(match.group())
    #     for key, bio in data_base.items():
    #         rating=bio.get("rating", 0)
    #         if rating==ratting:
    #             for key, value in bio.items():
    #                 print(f"{key}: {value}")
    #             print("\n")
    #     return
    
    # def find_relevance(query):
    #     # print(query)
    #     match = None
    #     best_match_cuisines= process.extractOne(query, cuisine_list, score_cutoff=70)
    #     best_match_Continents = process.extractOne(query, Continent_list, score_cutoff=70)
    #     # best_match_serves = process.extractOne(query, serve_list, score_cutoff=70)
    #     best_match_Title = process.extractOne(query, Title_list, score_cutoff=70)
    #     # print(best_match_cuisines,best_match_Continents, best_match_serves ,best_match_Title)
    #     matches = [match for match in [best_match_cuisines, best_match_Continents, best_match_Title] if match is not None]
    #     # print(matches)
    #     if best_match_cuisines:
    #         match, _ , _ = best_match_cuisines
    #     elif best_match_Continents:
    #         match, _ , _ = best_match_Continents
    #     # elif best_match_Country_State:
    #     #     match, _ , _ = best_match_Country_State
    #     elif best_match_Title:
    #         match, _ , _ = best_match_Title
    #     else:
    #         pass

    #     if match:
    #         query_embedding=model.encode([query])
    #         k=len(chunks)
    #         D,I = faiss_index.search(query_embedding, k=min(len(chunks), k))
    #         relevant_bios=[chunks[i] for i in I[0]]
    #         filtered_bios=[bio for bio in relevant_bios if match in bio]
    #         for bio in filtered_bios:
    #             print(bio)
    #             print("\n")
    #     else:
    #         query_embedding=model.encode([query])
    #         k=len(chunks)
    #         D,I = faiss_index.search(query_embedding, k=min(len(chunks), 5))
    #         relevant_bios=[chunks[i] for i in I[0]]
    #         filtered_bios=[bio for bio in relevant_bios]
    #         for bio in filtered_bios:
    #             print(bio)
    #             print("\n")
        
    # def find_nutrients(query):

    #     nutrient_mapping = {
    #     "calories": ["calorie", "cal", "kcal", "calor", "cals", "kal"],
    #     "carbohydrateContent": ["carbs", "carbohydrate", "carb", "carbo", "crabs", "cabro", "cabrohydrate"],
    #     "cholesterolContent": ["cholesterol", "cholestrol", "cholesteral", "chol", "cholestral"],
    #     "fiberContent": ["fiber", "fibre", "fibber", "fyber", "fibar", "roughage", "crude fiber"],
    #     "proteinContent": ["protein", "protien", "proten", "proetin", "protine", "proteine", "protane"],
    #     "saturatedFatContent": ["saturated fat", "sat fat", "satfat", "saturated", "sat_fat"],
    #     "sodiumContent": ["sodium", "salt", "sodim", "sodum", "na", "sod", "NA"],
    #     "sugarContent": ["sugar", "sugr", "suggar", "sug", "sugr"],
    #     "fatContent": ["fat", "fats", "fatt", "fett", "fatty"],
    #     "unsaturatedFatContent": ["unsaturated fat", "unsat fat", "unsatfat", "unsaturated", "unsat_fat"]
    # }

    #     best_match, best_score, best_key = None, 0, None
    #     for nutrient, synonyms in nutrient_mapping.items():
    #         match = process.extractOne(query, synonyms)  # Fuzzy match
    #         if match and match[1] > best_score:
    #             best_match, best_score, best_key = match[0], match[1], nutrient
    #     if best_key:
    #         best_key=best_key

    #     match = re.search(r'\d+(\.\d+)?', query)
    #     if match:
    #         nutrient=float(match.group())
    #         if nutrient:
    #             if re.search(r'\b(less|atmost|max|maximum)\b', query):
    #                 # print(query)
    #                 # print("inside less")
    #                 for key, recipe in data_base.items():
    #                     nutrients = recipe.get("nutrients", {})
    #                     if isinstance(nutrients, dict):
    #                         for nutrient_key, nutrient_value in nutrients.items():
    #                             if nutrient_key==best_key:
    #                                 match_value = re.search(r'\d+', str(nutrient_value))  # Convert to string to handle any type
    #                                 if match_value and int(match_value.group()) <= nutrient:
    #                                     print(f"Found match in recipe: {recipe.get('title', 'Unknown Recipe')}")
    #                                     print(f"Nutrient: {nutrient_key} - {nutrient_value}")
    #                                     for key, value in recipe.items():
    #                                         print(f"{key}: {value}")
    #                                     print("\n")
    #             elif re.search(r'\b(more|atleast|min|minimum)\b', query):
    #                 for key, recipe in data_base.items():
    #                     nutrients = recipe.get("nutrients", {})
    #                     if isinstance(nutrients, dict):
    #                         for nutrient_key, nutrient_value in nutrients.items():
    #                             if nutrient_key==best_key:
    #                                 match_value = re.search(r'\d+', str(nutrient_value))  # Convert to string to handle any type
    #                                 if match_value and int(match_value.group()) >= nutrient:
    #                                     print(f"Found match in recipe: {recipe.get('title', 'Unknown Recipe')}")
    #                                     print(f"Nutrient: {nutrient_key} - {nutrient_value}")
    #                                     for key, value in recipe.items():
    #                                         print(f"{key}: {value}")
    #                                     print("\n")
    #             else:
    #                 print("no such recepie")
    #         else:
    #             if re.search(r"\b(no|doesn't|dont|doesn’t|does'nt|does not|not|avoid|avoids|none|never|without|lack|lacks|free of|excluding|excluded|omit|omits|nothing|neither|nor|zero)\b", query):
    #                 for key, recipe in data_base.items():
    #                     nutrients = recipe.get("nutrients", {})
    #                     if isinstance(nutrients, dict):
    #                         for nutrient_key, nutrient_value in nutrients.items():
    #                             if nutrient_key==best_key:
    #                                 match_value = re.search(r'\d+', str(nutrient_value))  # Convert to string to handle any type
    #                                 if match_value and int(match_value.group()) == 0:
    #                                     print(f"Found match in recipe: {recipe.get('title', 'Unknown Recipe')}")
    #                                     print(f"Nutrient: {nutrient_key} - {nutrient_value}")
    #                                     for key, value in recipe.items():
    #                                         print(f"{key}: {value}")
    #                                     print("\n")
    #             else:
    #                 # print("no such recepie")
    #                 pass
    
    # def find_ingrediants(query):
    #     best_match, best_score, best_key = None, 0, None
        
    #     matched_recepies=[]
    #     for ingridant in ingrediants_list:
    #         match = process.extractOne(query, ingridant)  # Fuzzy match
    #         if match and match[1] > best_score:
    #             best_match, best_score, best_key = match[0], match[1], ingridant
    #     if best_key:
    #         best_key=best_key
    #     # print(best_key)

    #     flattened_ingredients = " ".join(best_key).lower()
    #     keywords=[i for i in query.split()]
    #     matched_ingrediant=[keyword for keyword in keywords if keyword.lower() in flattened_ingredients]
    #     # print(matched_ingrediant)
    #     matched_ingrediant_prompt=" ".join(matched_ingrediant)
    #     print(matched_ingrediant_prompt)
    #     for key, value in data_base.items():
    #         ingredients = value.get("ingredients", {})
    #         ingrediants_lower=[ingridiant.lower() for ingridiant in ingredients]
    #         if any(matched_ingrediant_prompt in ingrediant for ingrediant in ingrediants_lower):
    #             # print(value)
    #             # matched_recepies.append((key, value))
    #             for key, value in value.items():
    #                 print(f"{key}: {value}")
    #             print("\n")
                
    # # query="give me the recepies which made with 2 eggs"
    # query = query.lower().strip().replace('"', '')
    # grammer_words=remove_stop_words(query=query)
    # # print(grammer_words)
    # query=[words for words in query.split() if words not in grammer_words]
    # query=" ".join(query)
    # print(query)

    # nutrients_pattern = r'\b\d+(\.\d+)?\s*(kcal|mg|g)\b'
    # time_pattern = r'\b(time|mins|min)\b'
    # ratting_pattern = r'\b\d+\.\d+\b'
    # serves_pattern=r'\b(sreves|people|serve|group of|individuals|portions|feed)\b'
    # ingrediant_pattern=r'\b(contains|has|have|made|contain)\b'

    # if re.search(nutrients_pattern, query):
    #     find_nutrients(query)
    # elif re.search(time_pattern, query, re.IGNORECASE):
    #     find_time(query)
    # elif re.search(ratting_pattern, query):
    #     find_ratting(query, data_base)
    # elif re.search(serves_pattern, query):
    #     find_serve(query)
    # elif re.search(ingrediant_pattern,query):
    #     find_ingrediants(query)
    # else:
    #     find_relevance(query)


#<--------------------------------------------end if US data-------------------------------------------------------->
    


def prompt_router(query):
    for i in query.split():
        if is_id(i):
            # print(i)
            # print(f"customer data")
            data_label="customer data"
            customers_data(query)
        
    query_embedding = model.encode(query).reshape(1, -1) 
    # print(query_embedding.shape,cust_embeddings.shape)
    if cust_embeddings.shape != query_embedding.shape:  
        num_vectors = cust_embeddings.shape[1] // query_embedding.shape[1]  
        edited_cust_embeddings = cust_embeddings.reshape(num_vectors, query_embedding.shape[1])

    # if us_embeddings.shape != query_embedding.shape:  
    #     num_vectors = cust_embeddings.shape[1] // query_embedding.shape[1]  
    #     edited_us_embeddings = us_embeddings.reshape(num_vectors, query_embedding.shape[1])  

    similarity_cust = max(cosine_similarity(query_embedding, edited_cust_embeddings)[0])
    similarity_US = max(cosine_similarity(query_embedding, us_embeddings)[0])
    print(similarity_cust, similarity_US)
    if similarity_cust>similarity_US:
        data_label="customer data"
    else:
        data_label="us data"
    if data_label=="customer data":
        customers_data(query)
    else:
        us_data(query)
    
        
query ="give me the recipie which takes 5 mins"
prompt_router(query)

0.18097597 0.08303637
in customer data


making encoded files

In [None]:
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

def load_json(file_path):
    """Loads JSON data from a given file path."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def flatten_dict(d, parent_key="", sep=" "):
    """Flattens a nested dictionary into a single string with key-value pairs."""
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}".strip()
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            items.append((new_key, ", ".join(map(str, v))))
        else:
            items.append((new_key, str(v)))
    return dict(items)

def create_text_chunks(data):
    """Creates text chunks from nested JSON data."""
    chunks = []
    for item in data.values():  # Adjusting for numeric keys
        flat_item = flatten_dict(item)
        text = "\n".join([f"{key.capitalize()}: {value}" for key, value in flat_item.items()])
        chunks.append(text)
    return chunks

def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2", batch_size=512):
    """Generates embeddings using the SentenceTransformer model."""
    model = SentenceTransformer(model_name)
    return np.array(model.encode(chunks, batch_size=batch_size))

def create_faiss_index(embeddings, index_path):
    """Creates and saves a FAISS index."""
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, index_path)

def save_text_chunks(chunks, output_path):
    """Saves text chunks as a NumPy file."""
    np.save(output_path, np.array(chunks))

def main(json_path, index_path="bio_index_us.faiss", chunks_path="bios_us.npy"):
    """Main function to process data, create embeddings, and store FAISS index."""
    data = load_json(json_path)
    chunks = create_text_chunks(data)
    embeddings = generate_embeddings(chunks)
    create_faiss_index(embeddings, index_path)
    save_text_chunks(chunks, chunks_path)
    
    print("✅ Embedding & FAISS Index Created Successfully 🚀🔥")
    print(f"✅ Total Records Embedded: {len(chunks)}")

# Example Usage
if __name__ == "__main__":
    main("archive/US_recipes.json")


  from .autonotebook import tqdm as notebook_tqdm


✅ Embedding & FAISS Index Created Successfully 🚀🔥
✅ Total Records Embedded: 8451


Rag system for us main code

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import re
import spacy
from rapidfuzz import process
import faiss
import numpy as np
import json
import os
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

model= SentenceTransformer("all-MiniLM-L6-v2")

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

most_relevant_bio=[]
faiss_index = faiss.read_index('bio_index_us.faiss')
chunks = np.load('bios_us.npy', allow_pickle=True)

with open("archive/US_recipes.json", "r") as file:
    data_base = json.load(file)


  from .autonotebook import tqdm as notebook_tqdm


In [52]:
cuisine_list = [data.get("cuisine", 0) for data in data_base.values()]
Continent_list = [data.get("Contient", 0) for data in data_base.values()]
Country_State_list = [data.get("Country_State", 0) for data in data_base.values()]
Title_list = [data.get("title", 0) for data in data_base.values()]
ingrediants_list=[data.get("ingredients", 0) for data in data_base.values()] 

In [53]:
ingrediants_list=[ingrediants for ingrediants in ingrediants_list ]
new_ingrediant=[]

In [55]:
def remove_numbers(ingredient):
    # Removes digits, decimals, and fractions like 1/2
    return re.sub(r'[\d\.\/]+(?:\s*[\w\(\)]+)?', '', ingredient).strip()



cleaned_ingredients = []
for ingredient_group in ingrediants_list:
    for ingredient in ingredient_group:
        cleaned_ingredients.append(remove_numbers(ingredient))
        
        



In [56]:
cleaned_ingredients

['pound) sweet potato, with skin',
 'butter, softened',
 'white sugar',
 'milk',
 'eggs',
 'ground nutmeg',
 'ground cinnamon',
 'vanilla extract',
 'inch) unbaked pie crust',
 'peaches - peeled, pitted and sliced into thin wedges',
 'white sugar',
 'brown sugar',
 'ground cinnamon',
 'ground nutmeg',
 'fresh lemon juice',
 'cornstarch',
 'all-purpose flour',
 'white sugar',
 'brown sugar',
 'baking powder',
 'salt',
 'unsalted butter, chilled and cut into small pieces',
 'boiling water',
 'white sugar',
 'ground cinnamon',
 'green tomatoes',
 '',
 'milk',
 'all-purpose flour',
 'cornmeal',
 'bread crumbs',
 'coarse kosher salt',
 'ground black pepper',
 'vegetable oil for frying',
 'peanut oil, divided',
 'Cajun seasoning',
 'andouille sausage, sliced into rounds',
 'boneless skinless chicken breasts, cut into  pieces',
 ', diced',
 'green bell pepper, diced',
 'celery, diced',
 'garlic, minced',
 'ounce) can crushed Italian tomatoes',
 'red pepper flakes',
 'ground black pepper',
 's

In [59]:
print(process.extractOne("almnd", cleaned_ingredients, score_cutoff=70))

('almonds', 83.33333333333334, 35632)


In [None]:
def remove_stop_words(query):
    doc = nlp(query)
    comparison_pattern = re.compile(r'\b(?:more|greater|less|below|above|over|under|equal|equals|and|or|made|contain|has|have|>=|<=|>|<)\b')
    grammar_words = []
    for token in doc:
        if token.is_stop and not (comparison_pattern.search(token.text.lower()) or token.like_num):
            grammar_words.append(token.text.lower())
    
    # print(grammar_words)
    return grammar_words

def find_time(query):
    # print("inside time")
    time_mapping = {
    "total_time": ["total duration", "overall time", "complete time", "total cook time"],
    "prep_time": ["preparation time", "prep duration", "time to prepare", "prep period"],
    "cook_time": ["cooking time", "cook duration", "time to cook", "cook period"],
    }

    best_match, best_score, best_key = None, 0, None
    for time, synonyms in time_mapping.items():
        match = process.extractOne(query, synonyms)  # Fuzzy match
        if match and match[1] > best_score:
            best_match, best_score, best_key = match[0], match[1], time
    if best_key:
        best_key=best_key
        
    print(best_key)

    match = re.search(r'\d+(\.\d+)?', query)
    if match:
        time=float(match.group())
        if time:
            for key, bio in data_base.items():
                time_stramps=bio.get(best_key, 0)
                if re.search(r'\b(less|atmost|max|maximum)\b', query):
                    if time<time_stramps:
                        for key, value in bio.items():
                            print(f"{key}: {value}")
                        print("\n")
                elif re.search(r'\b(atmost|max|maximum)\b', query):
                    if time<=time_stramps:
                        for key, value in bio.items():
                            print(f"{key}: {value}")
                        print("\n")
                elif re.search(r'\b(more)\b', query):
                    if time>time_stramps:
                        for key, value in bio.items():
                            print(f"{key}: {value}")
                        print("\n")
                elif re.search(r'\b(atleast|min|minimum)\b', query):
                    if time>time_stramps:
                        for key, value in bio.items():
                            print(f"{key}: {value}")
                        print("\n")
                else:
                    if time==time_stramps:
                        for key, value in bio.items():
                            print(f"{key}: {value}")
                        print("\n")

def find_serve(query):
    print(query)
    print("inside serve")
    match=re.search(r'\d+(\.\d+)?', query)
    if match:
        serve=int(match.group())
    for key, bio in data_base.items():
        serves=bio.get("serves", 0)
        if f"{serve} servings"==serves:
            for key, value in bio.items():
                print(f"{key}: {value}")
            print("\n")

def find_ratting(query, data_base):
    print(query)
    match = re.search(r'\d+(\.\d+)?', query)
    if match:
        ratting=float(match.group())
    for key, bio in data_base.items():
        rating=bio.get("rating", 0)
        if rating==ratting:
            for key, value in bio.items():
                print(f"{key}: {value}")
            print("\n")
    return

def find_relevance(query):
    # print(query)
    match = None
    best_match_cuisines= process.extractOne(query, cuisine_list, score_cutoff=70)
    best_match_Continents = process.extractOne(query, Continent_list, score_cutoff=70)
    # best_match_serves = process.extractOne(query, serve_list, score_cutoff=70)
    best_match_Title = process.extractOne(query, Title_list, score_cutoff=70)
    # print(best_match_cuisines,best_match_Continents, best_match_serves ,best_match_Title)
    matches = [match for match in [best_match_cuisines, best_match_Continents, best_match_Title] if match is not None]
    # print(matches)
    if best_match_cuisines:
        match, _ , _ = best_match_cuisines
    elif best_match_Continents:
        match, _ , _ = best_match_Continents
    # elif best_match_Country_State:
    #     match, _ , _ = best_match_Country_State
    elif best_match_Title:
        match, _ , _ = best_match_Title
    else:
        pass

    if match:
        query_embedding=model.encode([query])
        k=len(chunks)
        D,I = faiss_index.search(query_embedding, k=min(len(chunks), k))
        relevant_bios=[chunks[i] for i in I[0]]
        filtered_bios=[bio for bio in relevant_bios if match in bio]
        for bio in filtered_bios:
            print(bio)
            print("\n")
    else:
        query_embedding=model.encode([query])
        k=len(chunks)
        D,I = faiss_index.search(query_embedding, k=min(len(chunks), 5))
        relevant_bios=[chunks[i] for i in I[0]]
        filtered_bios=[bio for bio in relevant_bios]
        for bio in filtered_bios:
            print(bio)
            print("\n")

""" fuctions for nutrients search  """


def find_nutrients(query):

    nutrient_mapping = {
    "calories": ["calorie", "cal", "kcal", "calor", "cals", "kal"],
    "carbohydrateContent": ["carbs", "carbohydrate", "carb", "carbo", "crabs", "cabro", "cabrohydrate"],
    "cholesterolContent": ["cholesterol", "cholestrol", "cholesteral", "chol", "cholestral"],
    "fiberContent": ["fiber", "fibre", "fibber", "fyber", "fibar", "roughage", "crude fiber"],
    "proteinContent": ["protein", "protien", "proten", "proetin", "protine", "proteine", "protane"],
    "saturatedFatContent": ["saturated fat", "sat fat", "satfat", "saturated", "sat_fat"],
    "sodiumContent": ["sodium", "salt", "sodim", "sodum", "na", "sod", "NA"],
    "sugarContent": ["sugar", "sugr", "suggar", "sug", "sugr"],
    "fatContent": ["fat", "fats", "fatt", "fett", "fatty"],
    "unsaturatedFatContent": ["unsaturated fat", "unsat fat", "unsatfat", "unsaturated", "unsat_fat"]
}

    best_match, best_score, best_key = None, 0, None
    for nutrient, synonyms in nutrient_mapping.items():
        match = process.extractOne(query, synonyms)  # Fuzzy match
        if match and match[1] > best_score:
            best_match, best_score, best_key = match[0], match[1], nutrient
    if best_key:
        best_key=best_key

    match = re.search(r'\d+(\.\d+)?', query)
    if match:
        nutrient=float(match.group())
        if nutrient:
            if re.search(r'\b(less|atmost|max|maximum)\b', query):
                # print(query)
                # print("inside less")
                for key, recipe in data_base.items():
                    nutrients = recipe.get("nutrients", {})
                    if isinstance(nutrients, dict):
                        for nutrient_key, nutrient_value in nutrients.items():
                            if nutrient_key==best_key:
                                match_value = re.search(r'\d+', str(nutrient_value))  # Convert to string to handle any type
                                if match_value and int(match_value.group()) <= nutrient:
                                    print(f"Found match in recipe: {recipe.get('title', 'Unknown Recipe')}")
                                    print(f"Nutrient: {nutrient_key} - {nutrient_value}")
                                    for key, value in recipe.items():
                                        print(f"{key}: {value}")
                                    print("\n")
            elif re.search(r'\b(more|atleast|min|minimum)\b', query):
                # print(query)

                for key, recipe in data_base.items():
                    nutrients = recipe.get("nutrients", {})
                    if isinstance(nutrients, dict):
                        for nutrient_key, nutrient_value in nutrients.items():
                            if nutrient_key==best_key:
                                match_value = re.search(r'\d+', str(nutrient_value))  # Convert to string to handle any type
                                if match_value and int(match_value.group()) >= nutrient:
                                    print(f"Found match in recipe: {recipe.get('title', 'Unknown Recipe')}")
                                    print(f"Nutrient: {nutrient_key} - {nutrient_value}")
                                    for key, value in recipe.items():
                                        print(f"{key}: {value}")
                                    print("\n")
            else:
                print("no such recepie")
        else:
            if re.search(r"\b(no|doesn't|dont|doesn’t|does'nt|does not|not|avoid|avoids|none|never|without|lack|lacks|free of|excluding|excluded|omit|omits|nothing|neither|nor|zero)\b", query):
                for key, recipe in data_base.items():
                    nutrients = recipe.get("nutrients", {})
                    if isinstance(nutrients, dict):
                        for nutrient_key, nutrient_value in nutrients.items():
                            if nutrient_key==best_key:
                                match_value = re.search(r'\d+', str(nutrient_value))  # Convert to string to handle any type
                                if match_value and int(match_value.group()) == 0:
                                    print(f"Found match in recipe: {recipe.get('title', 'Unknown Recipe')}")
                                    print(f"Nutrient: {nutrient_key} - {nutrient_value}")
                                    for key, value in recipe.items():
                                        print(f"{key}: {value}")
                                    print("\n")
            else:
                # print("no such recepie")
                pass

    
""" End of the Nutrients functions  """

def find_ingrediants(query):
    best_match, best_score, best_key = None, 0, None
    
    matched_recepies=[]
    for ingridant in ingrediants_list:
        match = process.extractOne(query, ingridant)  # Fuzzy match
        if match and match[1] > best_score:
            best_match, best_score, best_key = match[0], match[1], ingridant
    if best_key:
        best_key=best_key
    # print(best_key)

    flattened_ingredients = " ".join(best_key).lower()
    keywords=[i for i in query.split()]
    matched_ingrediant=[keyword for keyword in keywords if keyword.lower() in flattened_ingredients]
    # print(matched_ingrediant)
    matched_ingrediant_prompt=" ".join(matched_ingrediant)
    print(matched_ingrediant_prompt)
    for key, value in data_base.items():
        ingredients = value.get("ingredients", {})
        ingrediants_lower=[ingridiant.lower() for ingridiant in ingredients]
        if any(matched_ingrediant_prompt in ingrediant for ingrediant in ingrediants_lower):
            # print(value)
            # matched_recepies.append((key, value))
            for key, value in value.items():
                 print(f"{key}: {value}")
            print("\n")
            

query="give me the recepies which made with 2 eggs"
query = query.lower().strip().replace('"', '')
grammer_words=remove_stop_words(query=query)
# print(grammer_words)
query=[words for words in query.split() if words not in grammer_words]
query=" ".join(query)
print(query)

nutrients_pattern = r'\b\d+(\.\d+)?\s*(kcal|mg|g)\b'
time_pattern = r'\b(time|mins|min)\b'
ratting_pattern = r'\b\d+\.\d+\b'
serves_pattern=r'\b(sreves|people|serve|group of|individuals|portions|feed)\b'
ingrediant_pattern=r'\b(contains|has|have|made|contain)\b'



if re.search(nutrients_pattern, query):
    find_nutrients(query)
elif re.search(time_pattern, query, re.IGNORECASE):
    find_time(query)
elif re.search(ratting_pattern, query):
    find_ratting(query, data_base)
elif re.search(serves_pattern, query):
    find_serve(query)
elif re.search(ingrediant_pattern,query):
    find_ingrediants(query)
else:
    find_relevance(query)

recepies made 2 eggs
2 eggs
Contient: North America
Country_State: US
cuisine: Southern Recipes
title: Best Fried Green Tomatoes
URL: https://www.allrecipes.com/recipe/16760/best-fried-green-tomatoes/
rating: 4.7
total_time: 20
prep_time: 5
cook_time: 15
description: Fried green tomatoes are a quick and easy way to use up green tomatoes and make a wonderful late summer treat.
ingredients: ['4 large green tomatoes', '2 eggs', '0.5 cup milk', '1 cup all-purpose flour', '0.5 cup cornmeal', '0.5 cup bread crumbs', '2 teaspoons coarse kosher salt', '0.25 teaspoon ground black pepper', '1 quart vegetable oil for frying']
instructions: ['Slice tomatoes 1/2 inch thick. Discard the ends.', 'Whisk eggs and milk together in a medium-size bowl. Scoop flour onto a plate. Mix cornmeal, bread crumbs and salt and pepper on another plate. Dip tomatoes into flour to coat. Then dip the tomatoes into milk and egg mixture. Dredge in breadcrumbs to completely coat.', 'In a large skillet, pour vegetable oil 

In [7]:
import re

query = "give me the recepies which takes 5 mins"

time_pattern = r'\b\d+(\.\d+)?\s*(time|mins|min)\b|\b(time|mins|min)\s*\d+(\.\d+)?\b'

if re.search(time_pattern, query, re.IGNORECASE):
    print("Time pattern matched!")
else:
    print("No match found.")

Time pattern matched!


In [None]:
items = ["hello", "123kcal", "https://www.example.com", "google", "45"]

url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'

for item in items:
    # Check if item is a URL
    if re.match(url_pattern, item):
        print(f"URL detected: {item}")
    elif item.isalpha():
        print(f"Alphabetic: {item}")
    elif item.isnumeric():
        print(f"Numeric: {item}")
    else:
        print(f"Other: {item}")


# Negation detection

# AND OR COMPREHENSIONS

In [None]:
from rapidfuzz import process

filtered_query = "Adil Eli John"
name_list = ["Adam Lee", "Adil Eli", "John Doe", "Michael Smith"]

highest_match = None
highest_score = 0

for name in filtered_query.split():
    best_match = process.extractOne(name, name_list, score_cutoff=70)
    if best_match:
        match_name, score, _ = best_match  # Extract match name and score
        if score > highest_score:
            highest_score = score
            highest_match = match_name

print("Highest Match:", highest_match)



Highest Match: Adil Eli


In [278]:
query="Give me"
query_embedding=model.encode([query])
D, I = index.search(query_embedding, k=k)
context=I[0][0]
print(chunks[context])


Name: Adil Eli
Language: Uyghur
id: 6VTI8X6LL0MMPJCC
bio: Vivamus id faucibus velit, id posuere leo. Morbi vitae nisi lacinia, laoreet lorem nec, egestas orci. Suspendisse potenti.
version: 6.49


### to remove the irelevant data like give me only

In [None]:
query="Give me"
query_embedding=model.encode([query])
D, I = index.search(query_embedding, k=k)
context=I[0][0]
print(chunks[context])

Name: Adil Eli
Language: Uyghur
id: 6VTI8X6LL0MMPJCC
bio: Vivamus id faucibus velit, id posuere leo. Morbi vitae nisi lacinia, laoreet lorem nec, egestas orci. Suspendisse potenti.
version: 6.49


In [221]:
query = "Give me 6VTI8X6LL0MMPJCC"

# Filter out words that contain both letters and numbers
id_candidates = [word for word in query.split() if any(c.isdigit() for c in word) and any(c.isalpha() for c in word)]

print("Detected ID:", id_candidates)



Detected ID: ['6VTI8X6LL0MMPJCC']


In [216]:
text = "V59OF92YF627HFY0"
print(text.isalnum())

True


### to spell check

In [282]:
import json
from rapidfuzz import process
from symspellpy import SymSpell, Verbosity
from autocorrect import Speller


sym_spell=SymSpell()
# Load JSON data
with open("RawJSONData.json", "r") as file:
    data_base = json.load(file)

# Extract all names from the JSON
name_list = [entry["name"] for entry in data_base]
# flatten_name_list=[name.split() for name in name_list]
# flatten_name_list=[item for sublist in flatten_name_list for item in sublist]

def correct_name(name):
    best_match = process.extractOne(name, name_list, score_cutoff=70)  # 70% similarity threshold
    # print(best_match)
    return best_match[0] if best_match else name  # Return corrected name if found

def correct_spell(word):
    pass
    

# def remove_duplicate_phrases(text, phrase):
#     pattern = rf'\b{re.escape(phrase)}\b'  # Ensure exact match using word boundaries
#     matches = re.finditer(pattern, text)  

#     first_occurrence = None
#     result = text

#     for match in matches:
#         if first_occurrence is None:
#             first_occurrence = match.start()  # Store the first occurrence
#         else:
#             result = result[:match.start()] + result[match.end():]  # Remove duplicate

#     return result



query = "Give me the detals on Adil Eli and having id INDKF28KRH34"
match = re.search(r'\b(?:whos name is|about|Details of|details on|name|named|on)\s+(\w+)', query, re.IGNORECASE)
if match:
    name = match.group(1).lower() 
    corrected_name = correct_name(name.capitalize())  
    corrected_query = re.sub(name, corrected_name, query, flags=re.IGNORECASE)
    print(corrected_query)
else:
    print("No match found")


Give me the detals on Adil Eli Eli and having id INDKF28KRH34


words without names

In [179]:
import nltk
from nltk.metrics.distance import jaccard_distance 
from nltk.util import ngrams


nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [196]:
from nltk.corpus import words
correct_word=words.words()
incorrect_words=['dtals', 'azmaing', 'intelliengt', 'preeti'] 

In [197]:
for word in incorrect_words:
    temp = [(jaccard_distance(set(ngrams(word, 2)), 
                              set(ngrams(w, 2))),w) 
            for w in correct_word if w[0]==word[0]] 
    print(sorted(temp, key = lambda val:val[0])[0][1])

dital
amazing
intelligent
pretire


In [213]:
from textblob import TextBlob
text=TextBlob("Give me the details on Pree Rajh and having id INDKF28KRH34")
print(text.correct())

Give me the details on Free Each and having id INDKF28KRH34


In [None]:
from autocorrect import Speller ###################################nice###################################
spell = Speller(lang='en')
spell("Give me the details on Pree Rajh and having id INDKF28KRH34")

'Give me the details on Free Raja and having id IND28KR34'

In [219]:
from spellchecker import SpellChecker
text = "Give me the detils on Pree Rajh "
spell = SpellChecker()
' '.join([spell.correction(word) for word in text.split()])

'Give me the details on free rash'

In [275]:
import spacy

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

def detect_grammar_words_and_names(text):
    """Detects grammar words (stop words) and names (PERSON entities) in text."""
    doc = nlp(text)

    # Extract names using Named Entity Recognition (NER)
    names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

    # Extract grammar-related words (stop words)
    grammar_words = [token.text for token in doc if token.is_stop]

    return {"Names": names, "Grammar Words": grammar_words}

# Example query
query = "details, preeti,  Afzal, Ghaffar, check, John, Doe, . Masala, Dosa, Food, andy, apple who likes"

# Detect grammar words and names
result = detect_grammar_words_and_names(query)

# Print results
print("Names detected:", result["Names"])
print("Grammar words detected:", result["Grammar Words"])


Names detected: ['John', 'andy']
Grammar words detected: ['who']


### here the extraction is done now to structure the output we will use llm

In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

model_id='meta-llama/Llama-3.1-405B-FP8'
# text_generation_pipeline=pipeline(
#     "text-generation",
#     model=model_id,
#     model_kwargs={"torch_dtype":"auto"},
#     max_new_tokens=400, 
#     device=0
# )

# prompt = f"Answer the question based on the following context:\n\n{chunks[context]}\n\nQ: {query}"
# output = text_generation_pipeline(prompt, max_length=200)
# print(output[0]['generated_text'])

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

prompt = f"Answer the question based on the following context:\n\n{context}\n\nQ: {query}"
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(output[0], skip_special_tokens=True))

  from .autonotebook import tqdm as notebook_tqdm


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.1-405B-FP8.
401 Client Error. (Request ID: Root=1-67cfe67e-3e7bce112d53e26f674b6bef;5271cb9a-d48d-4ee1-8b94-c7060c49d2d3)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-405B-FP8/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-405B-FP8 is restricted. You must have access to it and be authenticated to access it. Please log in.