## Fuzzy Name conversion of Names in Police Records


#### 1.  Advanced Fuzzy Matching and Phonetic Search for Accurate Name Retrieval
#### 2. Standardized Transliteration and Error Correction Mechanisms
#### 3. Script Interoperability and Best Practices for Data Entry


In [None]:
!pip install python-Levenshtein
!pip install fuzzywuzzy

In [48]:
import os ,sys
import numpy as np

In [49]:
import nltk

In [50]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [87]:
import Levenshtein

In [51]:
df  = pd.read_csv('dd.csv')

In [52]:
df.head()

Unnamed: 0,Name
0,Anjal
1,Anrali
2,Anjali
3,jrjun
4,Anjali


In [53]:
df.drop_duplicates(inplace=True)

In [54]:
df['Name'].unique()

array(['Anjal', 'Anrali', 'Anjali', ..., 'Anfali', 'zavita', 'Rajegsh'],
      dtype=object)

In [56]:
TfidfVectorizer

sklearn.feature_extraction.text.TfidfVectorizer

In [57]:
def find_similar_names(names_list, target_name, threshold=0.2):
    vectorizer = TfidfVectorizer()
    
    vectorizer.fit(names_list)
    
    vectors = vectorizer.transform(names_list).toarray()

    target_vector = vectorizer.transform([target_name]).toarray()

    cosine_similarities = cosine_similarity(target_vector, vectors).flatten()

    similar_names = [name for name, similarity in zip(names_list, cosine_similarities) if similarity >= threshold]

    return similar_names


In [58]:
df.shape

(1930, 1)

In [59]:
names = list(df['Name'])

In [60]:
names

['Anjal',
 'Anrali',
 'Anjali',
 'jrjun',
 'Surj',
 'Arjun',
 'Kavita',
 'Rajesh',
 'Ramesh',
 'Anjalhi',
 'Suraj',
 'Arjrun',
 'Suesh',
 'Ramsh',
 'Amit',
 'Suresh',
 'Rjesh',
 'Arjum',
 'kNeha',
 'Priya',
 'Awit',
 'Anjai',
 'Arjn',
 'Rajesn',
 'vmit',
 'Rajosh',
 'Ahrjun',
 'Suressh',
 'Sura',
 'Rajeh',
 'Sburesh',
 'Suresr',
 'Arun',
 'Ravesh',
 'Armit',
 'Neha',
 'Nea',
 'Rqajesh',
 'Priyra',
 'Ajali',
 'Sburaj',
 'Ragesh',
 'Priyga',
 'Amik',
 'eha',
 'Kavitg',
 'Kavitp',
 'samesh',
 'Sursh',
 'Kavta',
 'Prlya',
 'Suryaj',
 'Arjkn',
 'Amt',
 'Anjaoli',
 'Sureuh',
 'Rajrsh',
 'Suwesh',
 'Nha',
 'Ami',
 'Rajessh',
 'Kavqita',
 'nNeha',
 'mit',
 'Sukaj',
 'Kavith',
 'Priyya',
 'Aujali',
 'ravita',
 'Kavit',
 'Anjalo',
 'Rajevsh',
 'Ait',
 'uresh',
 'Priy',
 'Prriya',
 'Rjajesh',
 'Rmesh',
 'Priyo',
 'Anali',
 'sSuraj',
 'Kaita',
 'Artjun',
 'Ngeha',
 'Agit',
 'rjun',
 'Surxsh',
 'Ngha',
 'Rajesth',
 'tAmit',
 'Amidt',
 'Sraj',
 'Kmvita',
 'Rramesh',
 'Kavia',
 'Ramedh',
 'cKavita',


In [61]:

target = "suraj"
similar_names = find_similar_names(names, target, threshold=0.5)

print(f"Names similar to '{target}': {similar_names}")

Names similar to 'suraj': ['Suraj', 'suraj']


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [67]:
from fuzzywuzzy import process

def find_similar_names(names_list, target_name, threshold=10):
    similar_names = process.extract(target_name, names_list, limit=None, scorer=process.fuzz.token_set_ratio)
    
    return [name for name, score in similar_names if score >= threshold]

names = names
target = 'sooraj' 
similar_names = find_similar_names(names, target, threshold=10)

print(f"Names similar to '{target}': {similar_names}")

Names similar to 'sooraj': ['sooraj', 'Soraj', 'Souraj', 'Sraj', 'Suraj', 'Straj', 'Sfraj', 'ouraj', 'Sqraj', 'Syraj', 'Sxraj', 'Spraj', 'Seraj', 'Sjraj', 'Scraj', 'Sgraj', 'Ssraj', 'Skraj', 'Szraj', 'suraj', 'Sburaj', 'Suryaj', 'sSuraj', 'qSuraj', 'nSuraj', 'Suwraj', 'Surafj', 'Siuraj', 'Surapj', 'eSuraj', 'Sturaj', 'Sujraj', 'iSuraj', 'Suxraj', 'Surlaj', 'Surazj', 'tSuraj', 'Suiraj', 'Suuraj', 'Surkaj', 'Suvraj', 'Surahj', 'Suraqj', 'Suralj', 'vSuraj', 'Suraij', 'Surqaj', 'lSuraj', 'Sufraj', 'bSuraj', 'Surayj', 'Surauj', 'Svuraj', 'Surakj', 'Suhraj', 'Surbaj', 'Swuraj', 'wSuraj', 'fSuraj', 'Surasj', 'Sugraj', 'Surmaj', 'Surhaj', 'Suragj', 'cSuraj', 'Seuraj', 'Suriaj', 'Suracj', 'Sfuraj', 'aSuraj', 'Surarj', 'Susraj', 'ySuraj', 'Sudraj', 'hSuraj', 'Smuraj', 'Sukraj', 'Surpaj', 'Surgaj', 'Surawj', 'Suraoj', 'Squraj', 'gSuraj', 'Supraj', 'Suraaj', 'Surfaj', 'Sueraj', 'Surabj', 'Surnaj', 'Suravj', 'Spuraj', 'Suzraj', 'Sjuraj', 'Sauraj', 'Sruraj', 'Ssuraj', 'Surwaj', 'Sucraj', 'Suyraj', '

In [82]:


def find_top_similar_names_levenshtein(names_list, target_name, top_n=10):
    name_distances = [(name, Levenshtein.distance(name, target_name)) for name in names_list]
    
    name_distances.sort(key=lambda x: x[1])
    
    top_matches = [name for name, distance in name_distances[:top_n]]
    
    return top_matches

In [85]:
names = names
target = 'Svuraj'  
similar_names = find_top_similar_names_levenshtein(names, target)

print(f"Names similar to '{target}': {similar_names}")

Names similar to 'Svuraj': ['Svuraj', 'Suraj', 'Sburaj', 'Siuraj', 'Sturaj', 'Suuraj', 'Swuraj', 'vuraj', 'Seuraj', 'Sfuraj']


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.26.0 (from python-Levenshtein)
  Downloading levenshtein-0.26.0-cp311-cp311-win_amd64.whl (98 kB)
                                              0.0/98.5 kB ? eta -:--:--
     ----                                     10.2/98.5 kB ? eta -:--:--
     ---------------                        41.0/98.5 kB 487.6 kB/s eta 0:00:01
     ---------------                        41.0/98.5 kB 487.6 kB/s eta 0:00:01
     ---------------                        41.0/98.5 kB 487.6 kB/s eta 0:00:01
     ---------------                        41.0/98.5 kB 487.6 kB/s eta 0:00:01
     ---------------                        41.0/98.5 kB 487.6 kB/s eta 0:00:01
     -------------------------------------- 98.5/98.5 kB 314.1 kB/s eta 0:00:00
Installing collected packages: Levenshtein, python-Levenshtein
Successfully instal

In [90]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting sentencepiece (from transformers)
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
                                              0.0/991.5 kB ? eta -:--:--
                                              10.2/991.5 kB ? eta -:--:--
                                              10.2/991.5 kB ? eta -:--:--
     -                                     30.7/991.5 kB 220.2 kB/s eta 0:00:05
     -                                     41.0/991.5 kB 196.9 kB/s eta 0:00:05
     -                                     41.0/991.5 kB 196.9 kB/s eta 0:00:05
     --                                    61.4/991.5 kB 218.8 kB/s eta 0:00:05
     --                                    61.4/991.5 kB 218.8 kB/s eta 0:00:05
     --                                    71.7/991.5 kB 187.3 kB/s eta 0:00:05
     ---                                   92.2/991.5 kB 201.8 kB/s eta 0:00:05
     ---                        

In [None]:
# Function to translate text from English to Hindi
def translate(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Perform translation and decode the output
    translated = model.generate(**inputs)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text

# Example usage
english_text = "Lets start about SIH, guys"
hindi_translation = translate(english_text)
print("Hindi Translation:", hindi_translation)