### Fuzzy Matching

In [2]:
import pandas as pd
import numpy as np
from rapidfuzz import fuzz, process
from tqdm.auto import tqdm
import time
from collections import defaultdict
import re

def preprocess_text(text):
    """Enhanced text preprocessing"""
    if pd.isna(text):
        return ""
    # Convert to string, lowercase, remove special chars, and extra whitespace
    text = str(text).lower().strip()
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove special characters
    return ' '.join(text.split())  # Normalize whitespace

def get_blocks(text, n=3):
    """Generate n-grams for blocking"""
    words = text.split()
    blocks = set()
    for i in range(len(words) - n + 1):
        blocks.add(' '.join(words[i:i+n]))
    return blocks

def find_fuzzy_matches_optimized(df, name_threshold=90, address_threshold=90, batch_size=1000):
    print("\nFinding fuzzy matches (optimized version)...")

    # Preprocess the data
    df = df.copy()
    df['name_clean'] = df['name'].apply(preprocess_text)
    df['address_clean'] = df['address'].apply(preprocess_text)

    # Create blocking keys
    print("Creating blocking keys...")
    df['name_blocks'] = df['name_clean'].apply(lambda x: get_blocks(x) if x else set())

    # Build block index
    block_index = defaultdict(list)
    for idx, blocks in enumerate(df['name_blocks']):
        for block in blocks:
            block_index[block].append(idx)

    # Find potential matches using blocking
    potential_matches = set()
    for block, indices in tqdm(block_index.items(), desc="Blocking"):
        if len(indices) > 1:  # Only consider blocks with potential matches
            for i in range(len(indices)):
                for j in range(i+1, len(indices)):
                    id1, id2 = indices[i], indices[j]
                    if id1 != id2:
                        potential_matches.add((min(id1, id2), max(id1, id2)))

    print(f"Found {len(potential_matches)} potential matches to evaluate")

    # Evaluate potential matches
    matches = []
    for id1, id2 in tqdm(potential_matches, desc="Evaluating matches"):
        name1 = df.at[id1, 'name_clean']
        name2 = df.at[id2, 'name_clean']
        addr1 = df.at[id1, 'address_clean']
        addr2 = df.at[id2, 'address_clean']

        # Skip if either name or address is empty
        if not name1 or not name2 or not addr1 or not addr2:
            continue

        # Calculate similarity scores using rapidfuzz
        name_sim = fuzz.token_sort_ratio(name1, name2)
        addr_sim = fuzz.token_sort_ratio(addr1, addr2)
        combined_score = (name_sim + addr_sim) / 2

        # Check if either similarity meets the threshold
        if name_sim >= name_threshold or addr_sim >= address_threshold:
            matches.append({
                'id1': id1,
                'id2': id2,
                'name1': df.at[id1, 'name'],
                'name2': df.at[id2, 'name'],
                'address1': df.at[id1, 'address'],
                'address2': df.at[id2, 'address'],
                'name_similarity': name_sim,
                'address_similarity': addr_sim,
                'combined_similarity': combined_score
            })

    # Convert to DataFrame
    if matches:
        matches_df = pd.DataFrame(matches)
        matches_df = matches_df.sort_values('combined_similarity', ascending=False)
        return matches_df
    return pd.DataFrame()

def main_optimized():
    # Input and output file paths
    input_file = 'cleaned_companies.csv'
    output_file = 'fuzzy_matches_optimized.csv'

    # Load the data
    try:
        print("Loading data...")
        df = pd.read_csv(input_file)
        print(f"Loaded {len(df)} records")
    except FileNotFoundError:
        print(f"Error: Could not find the input file: {input_file}")
        print("Please make sure the file exists in the current directory.")
        return

    # Set similarity thresholds (0-100)
    NAME_SIMILARITY_THRESHOLD = 50
    ADDRESS_SIMILARITY_THRESHOLD = 50

    # Find fuzzy matches
    start_time = time.time()
    matches_df = find_fuzzy_matches_optimized(
        df,
        name_threshold=NAME_SIMILARITY_THRESHOLD,
        address_threshold=ADDRESS_SIMILARITY_THRESHOLD
    )
    end_time = time.time()

    # Save results to CSV
    if not matches_df.empty:
        matches_df.to_csv(output_file, index=False)
        print(f"\nFound {len(matches_df)} potential matches")
        print(f"Results saved to: {output_file}")

        # Show some sample matches
        print("\nSample of top matches:")
        print(matches_df.head())
    else:
        print("\nNo matches found with the given thresholds.")

    print(f"\nTotal execution time: {end_time - start_time:.2f} seconds")

if __name__ == "__main__":
    main_optimized()

  from .autonotebook import tqdm as notebook_tqdm


Loading data...
Loaded 31829 records

Finding fuzzy matches (optimized version)...
Creating blocking keys...


Blocking: 100%|██████████| 25973/25973 [00:00<00:00, 936591.11it/s]


Found 40173 potential matches to evaluate


Evaluating matches: 100%|██████████| 40173/40173 [00:02<00:00, 18841.33it/s]



Found 37272 potential matches
Results saved to: fuzzy_matches_optimized.csv

Sample of top matches:
         id1    id2                               name1  \
32582  25506  31574          J. W. Wilson International   
27312   3755   6445                     Fine Line Media   
6908    1817   4478  Nytrog Investments Private Limited   
11126   3911   5956           Studioflex Media Concepts   
17998   3483   3487            Pest Control Company The   

                                    name2  \
32582            J W Wilson International   
27312                     Fine Line Media   
6908   Nytrog Investments Private Limited   
11126           Studioflex Media Concepts   
17998            The Pest Control Company   

                                                address1  \
32582               145a Galloway Road, Norton, Zimbabwe   
27312  Address: 1 Adylinn Road Marlborough , Harare, ...   
6908   Address: 3 Hermes Road Southerton Harare, Zimb...   
11126  Address: Morgan House, 27 