In [0]:
%pip install datasketch python-Levenshtein

In [0]:
import dlt
from datasketch import MinHash, MinHashLSH
from Levenshtein import ratio as levenshtein_ratio
import pandas as pd
import re
from collections import defaultdict

In [0]:
def preprocess_text(text):
    """Normalize a free-text value for fuzzy comparison.

    The value is stringified, lower-cased and accent-folded, every character
    that is not an ASCII letter, digit or whitespace is deleted, and finally
    runs of whitespace are collapsed to one space with no leading or
    trailing blanks.

    Args:
        text: Any scalar value. ``None`` and float NaN are treated as missing.

    Returns:
        The normalized string, or ``""`` for missing input.
    """
    import unicodedata  # local import keeps this block self-contained

    # Missing values must not become the literal tokens "none" / "nan",
    # which would make unrelated records share a word.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # NFKD splits an accented letter into base letter + combining mark, so
    # the filter below drops only the mark ("zürich" -> "zurich").
    text = unicodedata.normalize("NFKD", text)
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove special characters
    # Collapse whitespace last: deleting a character can leave a double
    # space in the middle or a dangling space at either end.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Maps lower-case country aliases (two-letter codes and local-language
# spellings) to one canonical English country name. Aliases are grouped by
# the canonical name they resolve to.
COUNTRY_MAP = {
    alias: canonical
    for canonical, aliases in (
        ("switzerland", ("ch", "suisse", "schweiz")),
        ("liechtenstein", ("li",)),
        ("united states", ("us",)),
        ("united kingdom", ("gb",)),
        ("germany", ("de",)),
    )
    for alias in aliases
}

def preprocess_country_names(country):
    """Return the canonical country name for a raw country value.

    The value is first cleaned with ``preprocess_text``. If the cleaned form
    is a key of ``COUNTRY_MAP`` the mapped name is returned; otherwise the
    cleaned form itself is returned.
    """
    cleaned = preprocess_text(country)
    if cleaned in COUNTRY_MAP:
        return COUNTRY_MAP[cleaned]
    return cleaned

def create_weighted_minhash(row, columns, weights):
    """Build a 128-permutation MinHash signature with per-column word weights.

    MinHash sketches a *set*: feeding it the same token several times leaves
    the signature unchanged, so merely repeating a column's text does not
    weight it. To make the weights count, each word is expanded into
    ``count`` distinct replica tokens, where ``count`` is the sum of the
    weights of the columns the word occurs in (once per occurrence). The
    Jaccard similarity of two signatures then approximates the weighted
    overlap of the two records.

    Args:
        row: Mapping-like record (e.g. a pandas Series) keyed by column name.
        columns: Column names to include; names missing from ``row`` are
            skipped.
        weights: One integer weight per entry of ``columns``. Non-positive
            weights contribute nothing.

    Returns:
        A ``MinHash`` with ``num_perm=128``.
    """
    # How many times each word would appear if every column's text were
    # repeated ``weight`` times.
    word_counts = defaultdict(int)
    for col, weight in zip(columns, weights):
        if col not in row or weight <= 0:
            continue
        for word in str(row[col]).split():
            word_counts[word] += weight

    minhash = MinHash(num_perm=128)
    for word, count in word_counts.items():
        for replica in range(count):
            # Words come from a whitespace split, so a space is a separator
            # that can never collide with another word's token.
            minhash.update(f"{word} {replica}".encode("utf8"))
    return minhash

def weighted_clustering(df, columns, weights, threshold):
    """Group similar rows of ``df`` into clusters using MinHash LSH.

    Rows are visited in order. Each row that is not yet in a cluster becomes
    the "driver" of a new cluster and claims every still-unassigned row that
    the LSH index returns for the driver's signature. A row already claimed
    by an earlier cluster stays there, so every recorded cluster size equals
    the number of rows that actually carry that cluster id.

    Args:
        df: pandas DataFrame containing an ``id`` column and every name in
            ``columns``. Any index is accepted; rows are addressed by
            position.
        columns: Column names fed into the MinHash signature. ``columns[0]``
            is also the field compared for the link score.
        weights: One integer weight per column, forwarded to
            ``create_weighted_minhash``.
        threshold: Similarity threshold in percent; it is divided by 100
            before being handed to ``MinHashLSH``.

    Returns:
        A 4-tuple ``(cluster_assignments, driver_ids, cluster_sizes,
        link_scores)``. The three lists are aligned with the row order of
        ``df`` and hold, per row, the cluster id (starting at 1), the ``id``
        of the cluster's driver row, and the Levenshtein ratio (0-100)
        between the row's and the driver's ``columns[0]`` value.
        ``cluster_sizes`` maps cluster id to member count.
    """
    lsh = MinHashLSH(threshold=threshold / 100, num_perm=128)

    # Key the LSH index by row position rather than index label: the result
    # lists are positional, and labels need not be 0..n-1 (e.g. after a
    # filter or sort).
    minhashes = []
    for position, (_, row) in enumerate(df.iterrows()):
        minhash = create_weighted_minhash(row, columns, weights)
        minhashes.append(minhash)
        lsh.insert(position, minhash)

    if not minhashes:
        return [], [], {}, []

    # Pull the two columns needed below out once instead of materializing a
    # row Series via ``df.iloc`` for every comparison.
    record_ids = df["id"].tolist()
    main_values = [str(value) for value in df[columns[0]].tolist()]

    row_count = len(minhashes)
    cluster_assignments = [None] * row_count
    driver_ids = [None] * row_count
    link_scores = [None] * row_count
    cluster_sizes = {}
    cluster_id = 1

    for driver, signature in enumerate(minhashes):
        if cluster_assignments[driver] is not None:
            continue
        size = 0
        # The driver is listed explicitly so it always joins its own cluster,
        # which guarantees that every row ends up assigned.
        for member in [driver, *lsh.query(signature)]:
            if cluster_assignments[member] is not None:
                # Already claimed (by an earlier cluster, or the driver seen
                # a second time in the query result): leave it untouched.
                continue
            cluster_assignments[member] = cluster_id
            driver_ids[member] = record_ids[driver]
            # LinkScore: Levenshtein similarity between main field and driver
            link_scores[member] = levenshtein_ratio(
                main_values[driver],
                main_values[member],
            ) * 100
            size += 1
        cluster_sizes[cluster_id] = size
        cluster_id += 1

    return cluster_assignments, driver_ids, cluster_sizes, link_scores

# ------------------- DLT TABLE FUNCTION -------------------

@dlt.table(
  name="dedup_output_Python_DLT"
)
def deduplication_fuzzy():
    # Reads the input table, normalizes the text columns in pandas, clusters
    # the rows with weighted_clustering, and returns the rows plus the
    # Cluster_Id / Driver_Id / Cluster_Size / LinkScore columns as a Spark
    # DataFrame.

    # Replace below with your input table reference
    source = spark.table("my_database.my_schema.dedup_input")

    # Source column -> lower-case name used on the pandas side.
    column_aliases = {
        "FUSION_CUSTOMER_NAME": "fusion_customer_name",
        "ADDRESS_LINE_1": "address_line_1",
        "POSTAL_CODE": "postal_code",
        "CITY": "city",
        "COUNTRY": "country",
        "ID": "id",
        "SOURCE_SYSTEM": "source_system",
    }
    projected = source.select(
        *[source[name].alias(alias) for name, alias in column_aliases.items()]
    )

    # The whole table is collected to the driver (original note: safe for
    # ~1000 rows).
    pdf = projected.toPandas()

    # Normalization: countries go through the alias map, the remaining text
    # columns through the plain text cleaner.
    pdf["country"] = pdf["country"].apply(preprocess_country_names)
    for text_column in ("fusion_customer_name", "address_line_1", "city"):
        pdf[text_column] = pdf[text_column].apply(preprocess_text)
    pdf["full_address"] = pdf["address_line_1"]

    # Fuzzy deduplication: columns and weights are paired by position;
    # the threshold is in percent.
    cluster_ids, driver_ids, cluster_sizes, link_scores = weighted_clustering(
        pdf,
        ["fusion_customer_name", "full_address", "city", "country"],
        [5, 5, 2, 1],
        80,
    )
    pdf["Cluster_Id"] = cluster_ids
    pdf["Driver_Id"] = driver_ids
    pdf["Cluster_Size"] = list(map(cluster_sizes.__getitem__, cluster_ids))
    pdf["LinkScore"] = link_scores

    # DLT expects a Spark DataFrame back.
    return spark.createDataFrame(pdf)