In [87]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pymilvus import Collection, connections, FieldSchema, CollectionSchema, DataType
from dotenv import load_dotenv
import os
from bs4 import BeautifulSoup
import random
import json
from typing import List
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from metaphone import doublemetaphone
from pymilvus import AnnSearchRequest
from pymilvus import WeightedRanker
import pandas as pd

# Load variables from a .env file (if present) before the settings are read below.
load_dotenv()

# Register the "default" pymilvus connection against Zilliz Cloud. The endpoint and
# token are taken from the environment so no credentials live in the notebook; an
# unset variable is passed through as None.
connections.connect(
    alias="default",
    uri=os.environ.get("ZILLIZ_URI"),
    token=os.environ.get("ZILLIZ_TOKEN"),
)


In [88]:
collection_name = "All_Words_Count_List"

# Check the connection first: every Collection call below depends on it, so a
# check placed after them can never report a failure before the cell has crashed.
if not connections.has_connection("default"):
    raise RuntimeError('Failed to connect: no "default" Milvus connection is registered.')
print("Connection successful!")

try:
    # Open the existing collection and load it so it can be searched.
    collection = Collection(name=collection_name)
    collection.load()
    print("Collection exists.")
except Exception as exc:
    # Fallback: create the collection. That requires a CollectionSchema bound to
    # the notebook-level name `schema`; when it is missing, fail with the original
    # error attached instead of an unrelated NameError.
    fallback_schema = globals().get("schema")
    if fallback_schema is None:
        raise RuntimeError(
            f"Could not open or load collection {collection_name!r}, and no `schema` "
            "is defined in this notebook to create it."
        ) from exc
    print(f"Could not open collection {collection_name!r} ({exc}); creating it.")
    collection = Collection(name=collection_name, schema=fallback_schema)
    print("Collection created.")

Collection exists.
Connection successful!


In [89]:
# Sentence-embedding model shared by the whole notebook: it encodes both the
# query strings and the rows written to the JSON export.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [31]:
def get_metaphone(name):
    """Return the first code that ``doublemetaphone`` produces for ``name``.

    ``doublemetaphone`` returns a sequence of codes; only its first element is
    used as the phonetic key throughout this notebook.
    """
    codes = doublemetaphone(name)
    return codes[0]

In [None]:
# Title to look up. Both query vectors are derived from this one value so the
# name vector and the metaphone vector cannot drift apart.
query_title = "NEELI GAGAN SAMACHAR TIMES"

# ANN request 1: embedding of the raw title, searched against "vector_of_name".
nameVector = [model.encode(query_title).tolist()]
search_param_1 = {
    "data": nameVector,  # query vector
    "anns_field": "vector_of_name",  # vector field name
    "param": {
        "metric_type": "COSINE",  # must be identical to the one used in the collection schema
        "params": {"nprobe": 384},
    },
    "limit": 200,  # number of search results to return in this AnnSearchRequest
}
request_1 = AnnSearchRequest(**search_param_1)

# ANN request 2: embedding of the title's metaphone code, searched against
# "vector_of_metaphone".
metaphoneVector = [model.encode(get_metaphone(query_title)).tolist()]
search_param_2 = {
    "data": metaphoneVector,  # query vector
    "anns_field": "vector_of_metaphone",  # vector field name
    "param": {
        "metric_type": "COSINE",  # must be identical to the one used in the collection schema
        "params": {"nprobe": 384},
    },
    "limit": 200,  # number of search results to return in this AnnSearchRequest
}
request_2 = AnnSearchRequest(**search_param_2)
print(request_1)
print(request_2)

# Keep the two requests in a list; its order is the order the reranker weights apply in.
reqs = [request_1, request_2]

{'anns_field': 'vector_of_name', 'param': {'metric_type': 'COSINE', 'params': {'nprobe': 384}}, 'limit': 200, 'expr': None}
{'anns_field': 'vector_of_metaphone', 'param': {'metric_type': 'COSINE', 'params': {'nprobe': 384}}, 'limit': 200, 'expr': None}


In [33]:
# Weighted reranker for the hybrid search. Presumably one weight per request in
# `reqs` order (0.8 for the name request, 0.2 for the metaphone request) —
# confirm against the pymilvus WeightedRanker documentation.
rerank = WeightedRanker(0.8, 0.2)

In [41]:
# Run both ANN requests in a single call and merge their hits with the reranker.
# NOTE(review): "Title_Code" is requested here, but the records built for the
# JSON export further down have no such field — confirm which collection this
# cell was meant to run against.
results = collection.hybrid_search(
    reqs, # List of AnnSearchRequest objects built in the request cell
    rerank, # Reranking strategy (the WeightedRanker defined in the previous cell)
    limit=200, # Number of final search results to return
    output_fields=["Metaphone_Name","Title_Code","Title_Name"]
)

In [43]:
# Flatten the hits of the first query (results[0]) into plain dicts, keeping the
# reported distance next to the requested entity fields.
processed_results = [
    {
        "distance": hit.distance,
        "Metaphone_Name": hit.entity.get("Metaphone_Name"),
        "Title_Code": hit.entity.get("Title_Code"),
        "Title_Name": hit.entity.get("Title_Name"),
    }
    for hit in results[0]
]

In [47]:
# Tabulate the processed hits: one row per hit, one column per dict key.
hybrid_search_results=pd.DataFrame(processed_results)

In [48]:
# Show the hit table as the cell output (rich DataFrame display).
hybrid_search_results

Unnamed: 0,distance,Metaphone_Name,Title_Code,Title_Name
0,0.999999,ANSXTJTTMS,UPHIN30203,ANUSUCHIT JATI TIMES
1,0.866217,JKRTTMS,JKENG00746,JAGRATI TIMES
2,0.833722,ANXTMS,BIHHIN11744,ANIWESH TIMES
3,0.830987,JSRTTMS,DELURD02896,JASARAT TIMES
4,0.830987,JSRTTMS,DELBIL06518,JASARAT TIMES
...,...,...,...,...
195,0.165786,TSTJRTMS,MPHIN34395,DASTGEER TIMES
196,0.165721,JTXRTMS,RAJHIN27392,JODHESHWAR TIMES
197,0.165661,MJTTMS,UPHIN47530,MAJEET TIMES
198,0.165532,RXTRLKMTTMS,UPHIN49259,RASHTRIYA LOKMAT TIMES


In [69]:
def create_hybrid_results(name=0.8, meta=0.2, title="INDIAN", *, limit=200, top_k=50, min_count=100):
    """Hybrid-search the global ``collection`` for entries similar to ``title``.

    Two ANN requests are issued: one with the embedding of ``title`` against
    ``vector_of_name`` and one with the embedding of its metaphone code against
    ``vector_of_metaphone``. Their hits are merged by ``WeightedRanker(name, meta)``.

    Parameters
    ----------
    name : float
        Reranker weight for the name-vector request.
    meta : float
        Reranker weight for the metaphone-vector request.
    title : str
        Text to search for.
    limit : int, keyword-only
        Hits requested from each ANN request and from the merged search.
    top_k : int, keyword-only
        Rows kept after sorting by ``distance`` then ``Count``, both descending.
    min_count : float, keyword-only
        Only rows whose ``Count`` is strictly greater than this are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``distance``, ``Metaphone_Name``, ``Title_Name`` and ``Count``.
        When the search yields no hits an empty frame with these columns is
        returned, so callers can filter on them without a KeyError.
    """
    columns = ["distance", "Metaphone_Name", "Title_Name", "Count"]
    output_fields = columns[1:]

    def build_request(text, anns_field):
        # One ANN request: embed `text` and search it against `anns_field`.
        return AnnSearchRequest(
            data=[model.encode(text).tolist()],
            anns_field=anns_field,
            param={"metric_type": "COSINE", "params": {"nprobe": 384}},
            limit=limit,
        )

    # Request order must match the weight order handed to WeightedRanker below.
    reqs = [
        build_request(title, "vector_of_name"),
        build_request(get_metaphone(title), "vector_of_metaphone"),
    ]
    results = collection.hybrid_search(
        reqs,
        WeightedRanker(name, meta),
        limit=limit,
        output_fields=output_fields,
    )

    records = [
        {"distance": hit.distance, **{field: hit.entity.get(field) for field in output_fields}}
        for hit in results[0]
    ]
    if not records:
        # A frame built from an empty list has no columns, which would make the
        # sort and the Count filter below raise KeyError.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(records, columns=columns)
    df = df.sort_values(by=["distance", "Count"], ascending=False).head(top_k)
    return df.loc[df["Count"] > min_count]
    

In [None]:
# Hybrid search for the word 'AND' with weights 0.8 (name) and 0.2 (metaphone).
# NOTE: this rebinds `hybrid_search_results`, which an earlier cell set to a frame
# with a different column set (Title_Code there, Count here).
hybrid_search_results=create_hybrid_results(0.8,0.2,'AND')

In [86]:
# For every word of the entered title, look up similar indexed words and report
# the combined count of the close matches (distance above 0.80), scaled by 100/10000.
entered_name = "PERIN THE INDIAN"
for name in entered_name.split():
    candidates = create_hybrid_results(0.8, 0.2, name)
    result = candidates.loc[candidates['distance'] > 0.80]
    if result.shape[0] == 0:
        # Nothing close enough for this word: print nothing.
        continue
    print(f"the word {name} or similar to it matches with {((sum(result['Count'])*100)/10000)} of total names")

the word THE or similar to it matches with 10.78 of total names
the word INDIAN or similar to it matches with 39.63 of total names


In [80]:
# NOTE(review): `merged_df` is not assigned in any visible cell of this notebook;
# on a fresh kernel this cell would raise NameError unless it is defined
# elsewhere — confirm its origin or remove the cell.
merged_df

Unnamed: 0,distance,Metaphone_Name,Title_Name,Count
0,0.999999,0,THE,1078.0
1,0.698457,N,NEW,132.0
2,0.594116,AL,ALL,128.0


In [23]:
# Load the per-word counts; later cells read its 'Title_Name' and 'Word_Count'
# columns. The path is relative to the notebook's working directory.
# NOTE(review): pandas treats tokens such as "NA" or "NULL" as missing values by
# default — confirm no word in the file is affected.
word_counts=pd.read_csv('../dataFiles/word_counts.csv')

In [24]:
# Records destined for the JSON export; starts out empty.
word_list=[]

In [25]:
# Build one record per row of `word_counts`: the word, its count, its metaphone
# code, and the embeddings of both strings. The list is rebuilt from scratch on
# every run, so re-executing this cell cannot append duplicate rows.
word_list = []
for name, count in zip(word_counts['Title_Name'], word_counts['Word_Count']):
    # Computed once and reused for both the stored code and its embedding.
    metaphone_name = doublemetaphone(name)[0]
    word_list.append({
        'Title_Name': name,
        'Count': float(count),
        'Metaphone_Name': metaphone_name,
        'vector_of_name': model.encode(name).tolist(),
        'vector_of_metaphone': model.encode(metaphone_name).tolist()
    })

In [26]:
# Wrap the records under a single top-level "rows" key and write them as
# pretty-printed UTF-8 JSON (non-ASCII characters are written as-is, not escaped).
output_json = dict(rows=word_list)
with open("word_count_two_vector.json", mode="w", encoding="utf-8") as json_file:
    json.dump(output_json, json_file, indent=4, ensure_ascii=False)