# In [10]:
import csv
import json
from fuzzywuzzy import fuzz

# Characters ignored when comparing names: spaces and dots.
_IGNORED_NAME_CHARS = str.maketrans("", "", " .")


def _normalize_name(text):
    """Return *text* lower-cased with all spaces and dots removed."""
    return text.lower().translate(_IGNORED_NAME_CHARS)


def _distance_from_score(score, len_sum):
    """Turn a 0-100 similarity score into a distance on the length scale.

    Computes ``len_sum - score * len_sum / 100``: the part of the combined
    length of both strings that the score does not account for. A score of
    100 gives 0; a score of 0 gives ``len_sum``.
    NOTE(review): treating this as a Levenshtein-style distance assumes
    ``fuzz.ratio`` is a matched-characters-over-total-length percentage --
    confirm against the installed fuzzywuzzy backend.
    """
    return len_sum - score * len_sum / 100


def fuzzy_match_name_with_tolerance(name, name_list, tolerance=5, *,
                                    sub_tolerance=3):
    """Find the entry of *name_list* that best matches *name*.

    Matching runs in two stages, both on normalised strings (lower-cased,
    spaces and dots removed):

    1. Whole-name match: every candidate is scored with ``fuzz.ratio``
       against the full name. Candidates whose derived distance is at most
       *tolerance* are eligible; the highest score wins (the later candidate
       wins a tie).
    2. Name-part fallback, used only when stage 1 finds nothing: the name is
       split on whitespace and parts longer than two characters are tried
       from longest to shortest. Each part is compared with every
       same-length substring of every candidate. For the first part that has
       any candidate within *sub_tolerance*, the candidate with the smallest
       distance is returned (the later candidate wins a tie).

    Args:
        name: The name to look up. ``None``, an empty string, or a string
            made only of spaces/dots yields ``None``.
        name_list: Iterable of candidate name strings. It is consumed once,
            so a generator is acceptable.
        tolerance: Maximum distance accepted in the whole-name stage.
        sub_tolerance: Maximum distance accepted in the name-part stage.

    Returns:
        The matching candidate exactly as it appears in *name_list*, or
        ``None`` when nothing is within tolerance.
    """
    if not name:
        return None
    cleaned_name = _normalize_name(name)
    if not cleaned_name:
        # Nothing to compare: with an empty query a zero score would give a
        # distance equal to the candidate length, so short candidates would
        # pass the tolerance test by accident.
        return None

    # Normalise each candidate once; both stages reuse these pairs.
    candidates = [(candidate, _normalize_name(candidate))
                  for candidate in name_list]

    # --- Stage 1: compare the whole name against each whole candidate. ---
    # NOTE(review): the tolerance is absolute, so two short, unrelated names
    # whose combined length is <= tolerance pass even with a score of 0 --
    # confirm this is intended.
    matched_name = None
    max_score = -1
    for candidate_name, cleaned_candidate in candidates:
        if not cleaned_candidate:
            # An empty candidate carries no information to match on.
            continue
        score = fuzz.ratio(cleaned_name, cleaned_candidate)
        len_sum = len(cleaned_name) + len(cleaned_candidate)
        distance = _distance_from_score(score, len_sum)
        if distance <= tolerance and score >= max_score:
            max_score = score
            matched_name = candidate_name
    if matched_name is not None:
        return matched_name

    # --- Stage 2: look for individual name parts inside the candidates. ---
    name_parts = sorted(name.replace(".", "").split(), key=len, reverse=True)
    for part in name_parts:
        if len(part) <= 2:
            # Parts are sorted longest first, so every remaining part is
            # also too short to be a meaningful match.
            break
        part_cleaned = _normalize_name(part)
        window = len(part_cleaned)

        best_candidate = None
        best_distance = float("inf")
        for candidate_name, cleaned_candidate in candidates:
            # Best score over every substring of the candidate that has the
            # same length as the part; None when the candidate is shorter
            # than the part and therefore has no such substring.
            best_score = max(
                (fuzz.ratio(part_cleaned, cleaned_candidate[start:start + window])
                 for start in range(len(cleaned_candidate) - window + 1)),
                default=None,
            )
            if best_score is None:
                continue
            # Both compared strings have length `window`, so the distance
            # depends on the score alone and the top score is also the
            # smallest distance.
            distance = _distance_from_score(best_score, 2 * window)
            if distance <= sub_tolerance and distance <= best_distance:
                best_distance = distance
                best_candidate = candidate_name
        if best_candidate is not None:
            return best_candidate

    return None

# In [13]:
# Candidate names that the query below is matched against.
name_list = [
    "आदित्य राज (डेविड भैया)",
    "रमेश सिंह वकील",
    "श्याम बिहारी जायसवाल",
    "अयोध्या प्रसाद",
    "अरुणा पनिका",
    "ओमप्रकाश अहिरवार",
    "महेश प्रसाद",
]

# Query: a single name part. An alternative query to try is the truncated
# full name 'रमेश सिंह वकी'.
name = "जायसवाल"

match = fuzzy_match_name_with_tolerance(name, name_list)
print(match)

# Output: श्याम बिहारी जायसवाल
