In [1]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pymilvus import Collection, connections, FieldSchema, CollectionSchema, DataType
from dotenv import load_dotenv
import os
from bs4 import BeautifulSoup
import random
import json
from typing import List
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from metaphone import doublemetaphone


# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

zilliz_uri = os.getenv("ZILLIZ_URI")
zilliz_token = os.getenv("ZILLIZ_TOKEN")

# Fail fast with a readable message: os.getenv returns None for unset variables,
# and handing None to connect() would only surface later as a confusing
# connection/authentication error.
missing_settings = [
    name
    for name, value in (("ZILLIZ_URI", zilliz_uri), ("ZILLIZ_TOKEN", zilliz_token))
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Connect to Zilliz Cloud
connections.connect(
    alias="default",
    uri=zilliz_uri,
    token=zilliz_token,
)


  from .autonotebook import tqdm as notebook_tqdm


In [30]:
# Open the collection that stores the embeddings built from the title text.
collection_name = "Simple_Embeddings"

try:
    collection = Collection(name=collection_name)
    # Load the collection so that it can be searched.
    collection.load()
    print("Collection exists.")
except Exception as e:
    # The earlier fallback called Collection(..., schema=schema), but no `schema`
    # is defined anywhere in this notebook, so every failure ended in a NameError
    # that hid the real cause. Surface the original error instead.
    raise RuntimeError(
        f"Could not open or load collection '{collection_name}'"
    ) from e


if connections.has_connection("default"):
    print("Connection successful!")
else:
    print("Failed to connect.")

Collection exists.
Connection successful!


In [31]:
def calculate_similarity(query_vector, stored_vector):
    """Return the cosine similarity between two vectors.

    SciPy's ``cosine`` computes a cosine *distance*, so the value is subtracted
    from 1 to turn it back into a similarity score.

    Args:
        query_vector: First vector (any 1-D sequence SciPy accepts).
        stored_vector: Second vector, same length as ``query_vector``.

    Returns:
        ``1 - cosine(query_vector, stored_vector)``.
    """
    cosine_distance = cosine(query_vector, stored_vector)
    similarity = 1 - cosine_distance
    return similarity

def get_metaphone(name):
    """Return the phonetic code of ``name``.

    Only the first element of the ``doublemetaphone`` result is kept.

    Args:
        name: Text to encode phonetically.

    Returns:
        The first code produced by ``doublemetaphone(name)``.
    """
    codes = doublemetaphone(name)
    primary_code = codes[0]
    return primary_code

In [None]:
# Sentence-embedding model used below to turn query text into vectors.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Approach 1: embed the title text itself and look for the nearest vectors in
# the Simple_Embeddings collection, which was created for title-based vectors.
#
# `model` is the SentenceTransformer loaded in the previous cell; it is not
# instantiated again here because reloading the same model only costs time.
query_vector = model.encode("SAMPURNA JAGRAN").tolist()
results = collection.search(
    data=[query_vector],
    anns_field="vector",
    param={"metric_type": "COSINE", "params": {"nprobe": 384}},
    limit=10,
    output_fields=["Metaphone_Name", "Title_Code", "Title_Name"],
)
print(results)


data: ['["id: 454083262608239335, distance: 0.9999986886978149, entity: {\'Title_Code\': \'BIHHIN05150\', \'Title_Name\': \'SAMPURNA JAGRAN\', \'Metaphone_Name\': \'SMPRNJKRN\'}", "id: 454083262608239336, distance: 0.9999986886978149, entity: {\'Title_Code\': \'BIHHIN04876\', \'Title_Name\': \'SAMPURNA JAGRAN\', \'Metaphone_Name\': \'SMPRNJKRN\'}", "id: 454083262608239460, distance: 0.9999986886978149, entity: {\'Title_Code\': \'SIKHIN00007\', \'Title_Name\': \'SAMPURNA JAGRAN\', \'Metaphone_Name\': \'SMPRNJKRN\'}", "id: 454083262608239331, distance: 0.9999986886978149, entity: {\'Title_Code\': \'BIHHIN05227\', \'Title_Name\': \'SAMPURNA JAGRAN\', \'Metaphone_Name\': \'SMPRNJKRN\'}", "id: 454083262608239396, distance: 0.8406250476837158, entity: {\'Title_Code\': \'MAHMAR18580\', \'Title_Name\': \'SAMAJWADI JAGRAN\', \'Metaphone_Name\': \'SMJTJKRN\'}", "id: 454083262608239487, distance: 0.7880282402038574, entity: {\'Title_Code\': \'UPHIN05010\', \'Title_Name\': \'JAUNPUR JAGRAN\', \'Me

In [None]:
# Write every hit of the first query to a text file, one hit per line.
# Iterate over the hits themselves instead of a fixed range(200): the search
# may return fewer than 200 hits (it currently requests limit=10), and
# indexing past the last hit raises IndexError.
with open("vectorEmbeddings_Based_on_title.txt", "w", encoding="utf-8") as f:
    for hit in results[0]:
        f.write(f"{hit}\n")

In [None]:
# Switch to the collection that stores the embeddings built from the phonetic codes.
collection_name = "Phonetic_Data"

try:
    collection = Collection(name=collection_name)
    # Load the collection so that it can be searched.
    collection.load()
    print("Collection exists.")
except Exception as e:
    # The earlier fallback called Collection(..., schema=schema), but no `schema`
    # is defined anywhere in this notebook, so every failure ended in a NameError
    # that hid the real cause. Surface the original error instead.
    raise RuntimeError(
        f"Could not open or load collection '{collection_name}'"
    ) from e


if connections.has_connection("default"):
    print("Connection successful!")
else:
    print("Failed to connect.")

In [None]:
# Approach 2: search by sound rather than spelling. The title is reduced to its
# Metaphone code, that code is embedded, and the resulting vector is searched
# in the Phonetic_Data collection, which was created for this purpose.
query_metaphone = get_metaphone("SAMPURNA JAGRAN")
query_vector = model.encode(query_metaphone).tolist()

search_params = {"metric_type": "COSINE", "params": {"nprobe": 384}}
returned_fields = ["Metaphone_Name", "Title_Code", "Title_Name"]

results = collection.search(
    data=[query_vector],
    anns_field="vector",
    param=search_params,
    limit=10,
    output_fields=returned_fields,
)
print(results)

In [None]:
# Write every hit of the first query to a text file, one hit per line.
# Iterate over the hits themselves instead of a fixed range(200): the search
# may return fewer than 200 hits (it currently requests limit=10), and
# indexing past the last hit raises IndexError.
with open("vector_Embedings_Based_on_Metaphone.txt", "w", encoding="utf-8") as f:
    for hit in results[0]:
        f.write(f"{hit}\n")

id: 453901917307669224, distance: 0.7793334722518921, entity: {'Metaphone_Name': 'JNJKRN', 'Title_Code': 'PUNPUN03786', 'Title_Name': 'JAN JAGRAN'}

In [11]:
# Exploratory aid: print the built-in documentation (signature and argument
# descriptions) of the collection's search method.
help(collection.search)

Help on method search in module pymilvus.orm.collection:

search(data: Union[List, Iterable[Union[Dict[int, float], Iterable[Tuple[int, float]]]], ForwardRef('csc_array'), ForwardRef('coo_array'), ForwardRef('bsr_array'), ForwardRef('dia_array'), ForwardRef('dok_array'), ForwardRef('lil_array'), ForwardRef('csr_array'), ForwardRef('spmatrix')], anns_field: str, param: Dict, limit: int, expr: Optional[str] = None, partition_names: Optional[List[str]] = None, output_fields: Optional[List[str]] = None, timeout: Optional[float] = None, round_decimal: int = -1, **kwargs) method of pymilvus.orm.collection.Collection instance
    Conducts a vector similarity search with an optional boolean expression as filter.
    
    Args:
        data (``List[List[float]]/sparse types``): The vectors of search data.
            the length of data is number of query (nq),
            and the dim of every vector in data must be equal to the vector field of collection.
        anns_field (``str``): The name 

In [1]:
!pip install pyphonetics



In [2]:
from pyphonetics import RefinedSoundex

# Compare two similar-sounding titles via the RefinedSoundex distance; the
# bare last expression is what the cell displays.
rs = RefinedSoundex()
first_title = "VIDHYA JAaGRAN"
second_title = "VIDISHA JAGRAN"
rs.distance(first_title, second_title)

2