In [1]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pymilvus import Collection, connections, FieldSchema, CollectionSchema, DataType
from dotenv import load_dotenv
import os
from bs4 import BeautifulSoup
import random
import json
from typing import List
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from metaphone import doublemetaphone


load_dotenv()

# Connect to Zilliz Cloud. Both settings are read from the environment
# (load_dotenv above may have populated it), so no credential is hard-coded.
zilliz_uri = os.getenv("ZILLIZ_URI")
zilliz_token = os.getenv("ZILLIZ_TOKEN")

# Fail fast with a readable message instead of handing None to pymilvus.
missing_settings = [
    setting
    for setting, value in (("ZILLIZ_URI", zilliz_uri), ("ZILLIZ_TOKEN", zilliz_token))
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

connections.connect(
    alias="default",
    uri=zilliz_uri,
    token=zilliz_token,
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Open the "Phonetic_2" collection and load it so it can be searched.
collection_name = "Phonetic_2"
try:
    collection = Collection(name=collection_name)
    collection.load()
    print("Collection exists.")
except Exception as load_error:
    # Creating the collection needs a schema, but nothing visible in this
    # notebook defines `schema`. Without this guard the fallback raised a
    # NameError that hid the real reason the open/load failed.
    if "schema" not in globals():
        raise RuntimeError(
            f"Could not open collection {collection_name!r}, and no `schema` "
            "is defined to create it."
        ) from load_error
    collection = Collection(name=collection_name, schema=schema)
    print("Collection created.")


# Report whether the "default" connection alias is registered.
if connections.has_connection("default"):
    print("Connection successful!")
else:
    print("Failed to connect.")

Collection exists.
Connection successful!


In [3]:
def calculate_similarity(query_vector, stored_vector):
    """Return the cosine similarity of two vectors.

    Computed as one minus the cosine distance reported by
    ``scipy.spatial.distance.cosine``.
    """
    cosine_distance = cosine(query_vector, stored_vector)
    return 1 - cosine_distance

def get_metaphone(name):
    """Return the first element of ``doublemetaphone(name)``."""
    codes = doublemetaphone(name)
    return codes[0]

In [4]:
# Sentence-embedding model used by the search cells to encode query text.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [65]:
import pandas as pd

In [137]:
# Title-embedding search: encode the query title itself and page through the
# nearest stored vectors with a search iterator.
# NOTE(review): the original note says a "Simple_Embeddings" collection was
# created for this approach, but this cell searches whichever `collection`
# was loaded last — confirm the intended collection.

results=[]
# NOTE(review): these two bounds are never passed to the search; the request
# below sends only a hard-coded range_filter of 0.7 and no radius. Confirm the
# intended range before wiring them in.
lower_bound=0.6 #radius
upper_bound=0.8 #range_filter
query_vector = model.encode("ANUSUCHIT TIMES").tolist()
iterator=collection.search_iterator(
    data=[query_vector],
    anns_field="vector",
    param={"metric_type": "COSINE", "params": {"nprobe": 384,'range_filter':0.7}},
    limit=500,
    output_fields=["Metaphone_Name","Title_Code","Title_Name"]
)
print(iterator)

# Drain the iterator page by page. The finally clause guarantees the iterator
# is closed even when fetching a page raises.
try:
    while True:
        page = iterator.next()
        if not page:
            break
        results.extend(hit.to_dict() for hit in page)
finally:
    iterator.close()

print(len(results))

# NOTE(review): the step of 2 keeps only every other hit (500 hits -> 250
# rows). This is preserved from the original; confirm the subsampling is
# intentional, otherwise iterate over all of `results`.
all_data = [
    {
        "Title_Code": hit_record['entity']['Title_Code'],
        "Title_Name": hit_record['entity']['Title_Name'],
        "Metaphone_Name": hit_record['entity']['Metaphone_Name'],
        "distance": hit_record['distance'],
    }
    for hit_record in results[::2]
]
df=pd.DataFrame(all_data)


<pymilvus.orm.iterator.SearchIterator object at 0x0000017682A43310>
500


In [138]:
# Display the `df` DataFrame assembled from the iterator search hits.
df

Unnamed: 0,Title_Code,Title_Name,Metaphone_Name,distance
0,UPHIN30203,ANUSUCHIT JATI TIMES,ANSXTJTTMS,0.820419
1,CHHHIN16004,ANUMODAN TIMES,ANMTNTMS,0.737099
2,BIHHIN11744,ANIWESH TIMES,ANXTMS,0.714466
3,UPHIN45726,ANURAG TIMES,ANRKTMS,0.712113
4,BIHHIN00188,ANURADHA TIMES,ANRTTMS,0.706303
...,...,...,...,...
245,MPHIN25234,INAAM TIMES,ANMTMS,0.509286
246,UPHIN13490,SAMUH TIMES,SMTMS,0.508974
247,UPHIN29818,SAMUH TIMES,SMTMS,0.508974
248,UPHIN00816,ANSAR TIMES,ANSRTMS,0.508708


In [110]:
# Show only the Title_Name column of `df`.
df['Title_Name']

0      ANUSUCHIT JATI TIMES
1            ANUMODAN TIMES
2             ANIWESH TIMES
3              ANURAG TIMES
4            ANURADHA TIMES
               ...         
245             INAAM TIMES
246             SAMUH TIMES
247             SAMUH TIMES
248             ANSAR TIMES
249            BHELSA TIMES
Name: Title_Name, Length: 250, dtype: object

In [63]:
results = []

# Drain every remaining page of `iterator` (created in another cell),
# flattening each hit into a plain dict.
try:
    while True:
        page = iterator.next()
        if not page:
            break
        results.extend(hit.to_dict() for hit in page)
finally:
    # Close the iterator even when fetching a page raises.
    iterator.close()

print(results)

[{'id': 454083262685460103, 'distance': 0.9999986290931702, 'entity': {'Metaphone_Name': 'SMPRNJKRN', 'Title_Code': 'BIHHIN04876', 'Title_Name': 'SAMPURNA JAGRAN'}}, {'id': 454083262685460227, 'distance': 0.9999986290931702, 'entity': {'Metaphone_Name': 'SMPRNJKRN', 'Title_Code': 'SIKHIN00007', 'Title_Name': 'SAMPURNA JAGRAN'}}, {'id': 454083262685460098, 'distance': 0.9999986290931702, 'entity': {'Metaphone_Name': 'SMPRNJKRN', 'Title_Code': 'BIHHIN05227', 'Title_Name': 'SAMPURNA JAGRAN'}}, {'id': 454083262685460102, 'distance': 0.9999986290931702, 'entity': {'Metaphone_Name': 'SMPRNJKRN', 'Title_Code': 'BIHHIN05150', 'Title_Name': 'SAMPURNA JAGRAN'}}, {'id': 454083262685460163, 'distance': 0.8406250476837158, 'entity': {'Metaphone_Name': 'SMJTJKRN', 'Title_Code': 'MAHMAR18580', 'Title_Name': 'SAMAJWADI JAGRAN'}}, {'id': 454083262685460254, 'distance': 0.7880282402038574, 'entity': {'Metaphone_Name': 'JNPRJKRN', 'Title_Code': 'UPHIN05010', 'Title_Name': 'JAUNPUR JAGRAN'}}, {'id': 45408

In [60]:
# Dump the current contents of `results`.
print(results)

[{'id': 454083262683687856, 'distance': 0.7012470364570618, 'entity': {'Metaphone_Name': 'ANTNSMKR', 'Title_Code': 'DELURD00579', 'Title_Name': 'INDIAN SAMACHAR'}}, {'id': 454083262683690036, 'distance': 0.69939124584198, 'entity': {'Metaphone_Name': 'SMKRNT', 'Title_Code': 'UPHIN35849', 'Title_Name': 'SAMACHAR INDIA'}}, {'id': 454083262683689733, 'distance': 0.6977645754814148, 'entity': {'Metaphone_Name': 'SMRNT', 'Title_Code': 'UPHIN45242', 'Title_Name': 'SAMAR INDIA'}}, {'id': 454083262683689732, 'distance': 0.6977645754814148, 'entity': {'Metaphone_Name': 'SMRNT', 'Title_Code': 'UPHIN48348', 'Title_Name': 'SAMAR INDIA'}}, {'id': 454083262683690096, 'distance': 0.695911169052124, 'entity': {'Metaphone_Name': 'SMNJSNT', 'Title_Code': 'UPHIN45165', 'Title_Name': 'SAMANJASYA INDIA'}}, {'id': 454083262683689229, 'distance': 0.6795110106468201, 'entity': {'Metaphone_Name': 'SMTNNT', 'Title_Code': 'RAJHIN17513', 'Title_Name': 'SAMADHAN INDIA'}}, {'id': 454083262683689004, 'distance': 0.6

In [7]:
# Write up to the first 200 hits of the first query in `results` to a text
# file, one hit per line. The bound is clamped to the number of hits actually
# returned so a shorter result no longer raises IndexError.
# NOTE(review): `results[0][i]` expects the output of `collection.search(...)`,
# not the list of dicts built by the iterator cells — confirm which cell must
# run before this one.
with open("vectorEmbeddings_Based_on_title.txt", "w", encoding="utf-8") as f:
    for i in range(min(200, len(results[0]))):
        f.write(f"{results[0][i]}\n")

In [16]:
# Open the "Two_Vectors" collection and load it so it can be searched.
collection_name = "Two_Vectors"
try:
    collection = Collection(name=collection_name)
    collection.load()
    print("Collection exists.")
except Exception as load_error:
    # Creating the collection needs a schema, but nothing visible in this
    # notebook defines `schema`. Without this guard the fallback raised a
    # NameError that hid the real reason the open/load failed.
    if "schema" not in globals():
        raise RuntimeError(
            f"Could not open collection {collection_name!r}, and no `schema` "
            "is defined to create it."
        ) from load_error
    collection = Collection(name=collection_name, schema=schema)
    print("Collection created.")


# Report whether the "default" connection alias is registered.
if connections.has_connection("default"):
    print("Connection successful!")
else:
    print("Failed to connect.")

Collection exists.
Connection successful!


In [14]:
# Phonetic search: embed the metaphone code of the title (not the title text)
# and fetch the 200 nearest stored vectors.
# NOTE(review): the original note says a "Phonetic_Data" collection was created
# for this approach; the cell searches whichever `collection` is currently
# loaded — confirm.
query_metaphone = get_metaphone("BRAJ KI AAWAZ JAGRAN")
query_vector = model.encode(query_metaphone).tolist()
results = collection.search(
    data=[query_vector],
    anns_field="vector",
    limit=200,
    output_fields=["Title_Name", "NYSIIS_Name"],
    param={
        "metric_type": "COSINE",
        "params": {"nprobe": 384},
    },
)
print(results)

data: ['["id: 453901917307669182, distance: 0.9227463006973267, entity: {\'Title_Name\': \'PRAVASI JAGRAN\', \'NYSIIS_Name\': \'PRAVASI JAGRAN\'}", "id: 453901917307669186, distance: 0.9167972207069397, entity: {\'Title_Name\': \'PRAKHAR JAGRAN\', \'NYSIIS_Name\': \'PRACHAR JAGRAN\'}", "id: 453901917307669185, distance: 0.9167972207069397, entity: {\'Title_Name\': \'PRAKHAR JAGRAN\', \'NYSIIS_Name\': \'PRACHAR JAGRAN\'}", "id: 453901917307669150, distance: 0.9167199730873108, entity: {\'Title_Name\': \'BHARAT JAGRAN\', \'NYSIIS_Name\': \'BHARAT JAGRAN\'}", "id: 453901917307669295, distance: 0.9167199730873108, entity: {\'Title_Name\': \'PROD JAGRAN\', \'NYSIIS_Name\': \'PROD JAGRAN\'}", "id: 453901917307669235, distance: 0.9056485295295715, entity: {\'Title_Name\': \'BRAHMAN JAGRAN\', \'NYSIIS_Name\': \'BRAHMAN JAGRAN\'}", "id: 453901917307669201, distance: 0.8963201642036438, entity: {\'Title_Name\': \'PRADESH JAGRAN\', \'NYSIIS_Name\': \'PRADESH JAGRAN\'}", "id: 453901917307683437, d

In [38]:
# Compare two retrieval signals for one title: the embedding of its metaphone
# code and the embedding of the raw title text.
name="GRAMIN PARTIDIN"
# Computed once and reused for both the query vector and the printout below.
name_metaphone = get_metaphone(name)
query_metaphone_vector = model.encode(name_metaphone).tolist()
query_title_vector = model.encode(name).tolist()

# Search on Metaphone vector
results_metaphone = collection.search(
    data=[query_metaphone_vector],
    anns_field="vector_of_metaphone",
    param={"metric_type": "COSINE", "params": {"nprobe": 384}},
    limit=200,
    output_fields=["Title_Name", "Metaphone_Name"]
)

# Search on Title_Name vector
results_title = collection.search(
    data=[query_title_vector],
    anns_field="vector_of_name",
    param={"metric_type": "COSINE", "params": {"nprobe": 384}},
    limit=200,
    output_fields=["Title_Name", "Metaphone_Name"]
)

# Each search was issued with a single query vector, so index 0 holds its hits.
metaphone_hits = results_metaphone[0]
title_hits = results_title[0]

# Side-by-side dump of the top-ranked hits of each search. The bound is clamped
# to the shorter hit list so fewer than 50 hits cannot raise IndexError.
for i in range(min(50, len(metaphone_hits), len(title_hits))):
    print(f"Name->{name}\n")
    print(f"Metaphone->{name_metaphone}\n")
    print(f"{metaphone_hits[i]}\n")
    print(f"{title_hits[i]}\n\n\n")

# Combine and rank results: equal-weight sum of the two scores, joined on the
# entity id. Pairing hits by rank position would mix unrelated titles, because
# rank i of one search is generally a different entity than rank i of the other.
# An entity returned by only one search contributes 0 for the missing score.
combined_by_id = {}
for weight, hits in ((0.5, metaphone_hits), (0.5, title_hits)):
    for hit in hits:
        record = hit.to_dict()
        entity = record["entity"]
        entry = combined_by_id.setdefault(
            record["id"],
            {
                "Title_Name": entity.get("Title_Name"),
                "Metaphone_Name": entity.get("Metaphone_Name"),
                "Combined_Score": 0.0,
            },
        )
        entry["Combined_Score"] += weight * record["distance"]

# Sort by combined score
combined_results = sorted(
    combined_by_id.values(), key=lambda x: x["Combined_Score"], reverse=True
)

print("Combined Results:")
# Show only the best ten; the full ranking stays available in `combined_results`.
print(combined_results[:10])


Name->GRAMIN PARTIDIN

Metaphone->KRMNPRTTN

id: 454083262673553673, distance: 1.0000001192092896, entity: {'Title_Name': 'GRAMIN PRATIDIN', 'Metaphone_Name': 'KRMNPRTTN'}

id: 454083262671854693, distance: 0.6880871653556824, entity: {'Title_Name': 'GRAMIN TODAY', 'Metaphone_Name': 'KRMNTT'}



Name->GRAMIN PARTIDIN

Metaphone->KRMNPRTTN

id: 454083262671857444, distance: 0.8382698893547058, entity: {'Title_Name': 'GREAT INDIA YOUNGISTAN', 'Metaphone_Name': 'KRTNTNJSTN'}

id: 454083262673553673, distance: 0.635102391242981, entity: {'Title_Name': 'GRAMIN PRATIDIN', 'Metaphone_Name': 'KRMNPRTTN'}



Name->GRAMIN PARTIDIN

Metaphone->KRMNPRTTN

id: 454083262671855696, distance: 0.8344125151634216, entity: {'Title_Name': 'CRIME REPORTER TODAY', 'Metaphone_Name': 'KRMRPRTRTT'}

id: 454083262671849163, distance: 0.6165292263031006, entity: {'Title_Name': 'GRAMIN TIMES', 'Metaphone_Name': 'KRMNTMS'}



Name->GRAMIN PARTIDIN

Metaphone->KRMNPRTTN

id: 454083262671854693, distance: 0.81191456

In [39]:
# Scalar query (no vector search): fetch Title_Name for up to 1500 entities,
# with an empty filter expression.
results = collection.query(
    expr="",
    output_fields=["Title_Name"],
    limit=1500,
)

In [37]:
# Spot-check one row of `results`: the Title_Name of the entry at index 7.
results[7]['Title_Name']

'SAMPURNA JAGRAN'

In [15]:
# Write up to the first 200 hits of the first query in `results` to a text
# file, one hit per line. The bound is clamped to the number of hits actually
# returned so a shorter result no longer raises IndexError.
# NOTE(review): `results[0][i]` expects the output of `collection.search(...)`;
# the `collection.query(...)` cell above rebinds `results` to rows that are
# indexed by field name instead — confirm which cell must run before this one.
with open("vector_Embedings_Based_on_Metaphone.txt", "w", encoding="utf-8") as f:
    for i in range(min(200, len(results[0]))):
        f.write(f"{results[0][i]}\n")

In [30]:
# Inspect the type and available methods of `results` (exploratory; not
# needed by any other cell).
help(results)

Help on SearchResult in module pymilvus.client.abstract object:

class SearchResult(builtins.list)
 |  SearchResult(res: schema_pb2.SearchResultData, round_decimal: Optional[int] = None, status: Optional[common_pb2.Status] = None)
 |  
 |  nq results: List[Hits]
 |  
 |  Method resolution order:
 |      SearchResult
 |      builtins.list
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, res: schema_pb2.SearchResultData, round_decimal: Optional[int] = None, status: Optional[common_pb2.Status] = None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __iter__(self) -> pymilvus.client.abstract.SequenceIterator
 |      Implement iter(self).
 |  
 |  __repr__ = __str__(self) -> str
 |  
 |  __str__(self) -> str
 |      Only print at most 10 query results
 |  
 |  get_fields_by_range(self, start: int, end: int, all_fields_data: List[schema_pb2.FieldData]) -> Dict[str, Tuple[List[Any], schema_pb2.FieldData]]
 |  
 |  -----------------

In [1]:
!pip install pyphonetics



In [2]:
# Quick check: RefinedSoundex distance between two similar-looking titles.
from pyphonetics import RefinedSoundex
rs=RefinedSoundex()
# NOTE(review): "JAaGRAN" in the first title looks like a typo for "JAGRAN" —
# confirm before relying on this result.
rs.distance("VIDHYA JAaGRAN","VIDISHA JAGRAN")

2

In [None]:
def load_dictionary(file_path, encoding="utf-8"):
    """Read a text file and return its lines with surrounding whitespace stripped.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8,
            matching how this notebook writes its result files, instead of
            the platform-dependent default.

    Returns:
        A list with one stripped string per line; blank lines are kept as
        empty strings.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return [line.strip() for line in file]

def wagner_fischer(s1, s2):
    len_s1, len_s2 = len(s1), len(s2)
    if len_s1 > len_s2:
        s1, s2 = s2, s1
        len_s1, len_s2 = len_s2, len_s1

    current_row = range(len_s1 + 1)
    for i in range(1, len_s2 + 1):
        previous_row, current_row = current_row, [i] + [0] * len_s1
        for j in range(1, len_s1 + 1):
            add, delete, change = previous_row[j] + 1, current_row[j-1] + 1, previous_row[j-1]
            if s1[j-1] != s2[i-1]:
                change += 1
            current_row[j] = min(add, delete, change)

    return current_row[len_s1]

def spell_check(word, dictionary, max_suggestions=100):
    """Rank dictionary entries by edit distance to ``word``.

    Args:
        word: The string to find close matches for.
        dictionary: Iterable of candidate strings.
        max_suggestions: Maximum number of suggestions to return. Defaults to
            100; pass ``None`` to return every candidate.

    Returns:
        A list of ``(candidate, distance)`` tuples sorted by ascending
        distance. Candidates with equal distance keep their dictionary order
        because the sort is stable.
    """
    scored = [
        (correct_word, wagner_fischer(word, correct_word))
        for correct_word in dictionary
    ]
    scored.sort(key=lambda pair: pair[1])
    return scored[:max_suggestions]

# Example Usage
# NOTE(review): `dictionary` is loaded but not used below; the candidates are
# taken from df['Title_Name'] instead. Confirm which source is intended.
dictionary = load_dictionary("vector_Embedings_Based_on_Metaphone.txt")
misspelled_word = "ANUSUCHIT JATI TIMES"
suggestions = spell_check(misspelled_word, df['Title_Name'])
# The header promises ten suggestions, so print only the ten closest matches
# rather than everything spell_check returned.
top_suggestions = suggestions[:10]
print(f"Top 10 suggestions for '{misspelled_word}':")
for word, distance in top_suggestions:
    print(f"{word} (Distance: {distance})")

Top 10 suggestions for 'SAMPURNA JAGRAN':
id: 454083262670883499, distance: 0.5867906808853149, entity: {'Title_Name': 'CRN INDIA', 'NYSIIS_Name': 'CRNAND'} (Distance: 107)
id: 453901917307669206, distance: 0.8294041752815247, entity: {'Title_Name': 'PAL JAGRAN', 'NYSIIS_Name': 'PAL JAGRAN'} (Distance: 108)
id: 453901917307669159, distance: 0.7826033234596252, entity: {'Title_Name': 'JAN JAGRAN', 'NYSIIS_Name': 'JAN JAGRAN'} (Distance: 108)
id: 453901917307669123, distance: 0.7826033234596252, entity: {'Title_Name': 'JAN JAGRAN', 'NYSIIS_Name': 'JAN JAGRAN'} (Distance: 108)
id: 453901917307669224, distance: 0.7826033234596252, entity: {'Title_Name': 'JAN JAGRAN', 'NYSIIS_Name': 'JAN JAGRAN'} (Distance: 108)
id: 453901917307669272, distance: 0.786681592464447, entity: {'Title_Name': 'KHAS JAGRAN', 'NYSIIS_Name': 'CHAS JAGRAN'} (Distance: 109)
id: 453901917307669278, distance: 0.786681592464447, entity: {'Title_Name': 'KHAS JAGRAN', 'NYSIIS_Name': 'CHAS JAGRAN'} (Distance: 109)
id: 45408

In [5]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-win_amd64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.10.1-cp310-cp310-win_amd64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-win_amd64.whl (98 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.6 MB 3.4 MB/s eta 0:00:01
   ---------------------------------------- 1.6/1.6 MB 3.9 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.26.1 python-Levenshtein-0.26.1 rapidfuzz-3.10.1


In [6]:
# Try fuzzywuzzy's plain ratio on two strings that differ by one trailing
# character.
from fuzzywuzzy import fuzz
fuzz.ratio("Book","Books")

89

In [12]:
# Token-sort ratio for two titles that differ only in their first word.
Str1 = "VIDHYA JAGRAN"
Str2 = "VISHWA JAGRAN"
print(fuzz.token_sort_ratio(Str1,Str2))

85
