STEP 1: OUTPUT A CSV CONTAINING ALL USERS SUSPECTED OF ABUSE, USING RAPIDFUZZ FUZZY-MATCHING LOGIC

In [None]:
import pandas as pd
from rapidfuzz import fuzz
from collections import defaultdict
import itertools
import re

# Read the raw customer records. Point INPUT_CSV at your own file.
# Later code in this cell reads the columns "customer", "customer_name",
# "full_address" and "locality" from this frame.
INPUT_CSV = "example_input_data.csv"
df = pd.read_csv(INPUT_CSV)

# --- Address Normalization ---
def normalize_address(addr):
    """Return a canonical, comparison-friendly form of an address.

    The value is lower-cased, common spelling variants are mapped onto a
    single token (e.g. "flat no" -> "flat", "block" -> "tower",
    "sec" -> "sector", "flr" -> "floor", "apt" -> "apartment"), every
    character other than a-z, 0-9 and whitespace is turned into a space,
    and runs of whitespace are collapsed.

    Args:
        addr: Raw address value. Non-string values are converted with
            ``str``; ``None`` and float NaN are treated as "no address".

    Returns:
        The normalized address string, or ``""`` when ``addr`` is missing.
    """
    # A missing cell would otherwise be stringified to "none"/"nan" and then be
    # compared as if it were a real address.
    if addr is None or (isinstance(addr, float) and addr != addr):
        return ""

    addr = str(addr).lower()

    # Map spelling variants onto one canonical token. This runs before
    # punctuation is stripped, so "sec-5" and "apt." are still matched via \b.
    addr = re.sub(r"\bflat\s*no\b|\bflat\b", "flat", addr)
    addr = re.sub(r"\btower\b|\bblock\b", "tower", addr)
    addr = re.sub(r"\bsec\b|\bsector\b", "sector", addr)
    # The original pattern only replaced "floor" with itself (a no-op);
    # include the "flr" abbreviation so the rule actually canonicalises something.
    addr = re.sub(r"\bflr\b|\bfloor\b", "floor", addr)
    addr = re.sub(r"\bapt\b|\bapartment\b", "apartment", addr)

    # Remove all punctuation except numbers/letters/space
    addr = re.sub(r"[^a-z0-9\s]", " ", addr)
    addr = re.sub(r"\s+", " ", addr).strip()  # collapse multiple spaces
    return addr

# --- Name Normalization ---
def normalize_name(name):
    """Return a lower-case, letters-and-spaces-only form of a customer name.

    Every character other than a-z and whitespace is deleted (not replaced by
    a space, so "o'brien" becomes "obrien"), then whitespace is collapsed.

    Note: Step 2 of this notebook defines another ``normalize_name`` that
    keeps only the first token; on a top-to-bottom run that later definition
    replaces this one.

    Args:
        name: Raw name value. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as "no name".

    Returns:
        The normalized name, or ``""`` when ``name`` is missing.
    """
    # A missing cell would otherwise become the literal "none"/"nan", which a
    # substring-style scorer can match against real names containing it.
    if name is None or (isinstance(name, float) and name != name):
        return ""

    name = str(name).lower()
    name = re.sub(r"[^a-z\s]", "", name)  # remove punctuation/numbers
    name = re.sub(r"\s+", " ", name).strip()
    return name

# Add the cleaned columns that the pairwise comparison reads.
df["normalized_address"] = df["full_address"].apply(normalize_address)
df["normalized_name"] = df["customer_name"].apply(normalize_name)

# Minimum similarity scores a pair must reach on address and on name.
ADDRESS_SIMILARITY_THRESHOLD = 88
NAME_SIMILARITY_THRESHOLD = 70  # a bit relaxed due to improved preprocessing

# Candidate pairs are only formed between records of the same locality.
grouped = df.groupby(by="locality")

# Accumulators filled by the main loop: emitted rows and the id pairs already reported.
results = list()
already_seen_pairs = set()

# Comparison function
def is_potential_duplicate(addr1, addr2, name1, name2):
    """Decide whether two records look like the same customer.

    A pair qualifies when the token-sorted address similarity reaches
    ``ADDRESS_SIMILARITY_THRESHOLD`` AND the partial name similarity reaches
    ``NAME_SIMILARITY_THRESHOLD``.

    Args:
        addr1, addr2: Normalized address strings of the two records.
        name1, name2: Normalized name strings of the two records.

    Returns:
        bool: True when both thresholds are met, False otherwise. Always False
        when any of the four fields is empty.
    """
    # A blank field carries no evidence of a shared identity. Without this
    # guard, two records whose fields were both cleaned down to "" are compared
    # as if they agreed (NOTE(review): rapidfuzz appears to score two empty
    # strings as a perfect match - confirm for the installed version).
    if not (addr1 and addr2 and name1 and name2):
        return False

    # Check the address first and skip the name scoring entirely when the
    # address already rules the pair out; the outcome is the same as scoring both.
    if fuzz.token_sort_ratio(addr1, addr2) < ADDRESS_SIMILARITY_THRESHOLD:
        return False
    return fuzz.partial_ratio(name1, name2) >= NAME_SIMILARITY_THRESHOLD

# Main loop
# Compare every pair of records inside each locality and collect the pairs that
# look like the same customer. The work grows quadratically with the number of
# records in a locality because every 2-combination is scored.
for locality, group in grouped:
    records = group.to_dict(orient="records")

    for a, b in itertools.combinations(records, 2):
        id_a, id_b = str(a["customer"]), str(b["customer"])

        # Two rows of the same customer are not a duplicate *pair*; without this
        # check a customer with several rows is reported as matching themselves.
        if id_a == id_b:
            continue

        # Order the ids so (x, y) and (y, x) share one key. The ids are compared
        # as strings, so the order is lexicographic ("10" sorts before "9").
        primary_id, secondary_id = sorted([id_a, id_b])

        pair_key = (primary_id, secondary_id)
        if pair_key in already_seen_pairs:
            continue

        if is_potential_duplicate(a["normalized_address"], b["normalized_address"],
                                  a["normalized_name"], b["normalized_name"]):
            # Line the records up with the sorted ids.
            if id_a == primary_id:
                primary_rec, secondary_rec = a, b
            else:
                primary_rec, secondary_rec = b, a

            results.append({
                "primary_id": primary_rec["customer"],
                "primary_name": primary_rec["customer_name"],
                "primary_address": primary_rec["full_address"],
                "secondary_id": secondary_rec["customer"],
                "secondary_name": secondary_rec["customer_name"],
                "secondary_address": secondary_rec["full_address"]
            })

            # Only matched pairs are remembered; an unmatched pair may still be
            # re-checked through other rows of the same two customers.
            already_seen_pairs.add(pair_key)

# Save results
# Pin the output columns explicitly: with zero matches, pd.DataFrame([]) has no
# columns at all and the CSV would be written without a header row, which
# breaks any reader that expects these columns.
RESULT_COLUMNS = [
    "primary_id", "primary_name", "primary_address",
    "secondary_id", "secondary_name", "secondary_address",
]
result_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
result_df.to_csv("example_suspected_offenders.csv", index=False)
print("Finished. Matches found:", len(result_df))

Finished. Matches found: 2847


STEP 2: USE A SENTENCE TRANSFORMER TO CLASSIFY SUSPECTS AS "NO" (not an offender), "PSEUDO" (the addresses match but the names differ, potentially indicating that another family member lives at the same address) OR "COMPLETE" (both the names and the addresses match).

In [None]:
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Read the suspect pairs to classify. Code further down this cell reads the
# primary_*/secondary_* id, name and address columns from this frame.
# NOTE(review): Step 1 writes "example_suspected_offenders.csv", not this file.
# Nothing in the notebook produces this file, so a fresh Run All cannot get
# past this line - confirm which file is intended.
SUSPECT_PAIRS_CSV = "HCHECK_deduplicated_offenders.csv"
df = pd.read_csv(SUSPECT_PAIRS_CSV)

# --- Helper functions ---
def normalize_text(text):
    """Reduce an address to its distinguishing tokens before embedding.

    The value is stringified and lower-cased, generic address words
    (flat, apt, tower, sector, road, ...) are dropped, every character other
    than a-z, 0-9 and whitespace becomes a space, and whitespace is collapsed.
    """
    cleaned = str(text).lower()
    # (pattern, replacement) pairs, applied in order.
    rewrite_steps = (
        (r'\b(flat|apt|apartment|tower|block|floor|no|house|h|sec|sector|road|rd|st|street|lane|ln|plot|pl)\b', ''),
        (r'[^a-z0-9\s]', ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in rewrite_steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def normalize_name(name):
    """Return the lower-cased first word of a name, or "" if no letters remain.

    Characters other than a-z and whitespace are deleted before the first
    whitespace-separated token is taken.
    """
    letters_only = re.sub(r'[^a-z\s]', '', str(name).lower())
    tokens = re.sub(r'\s+', ' ', letters_only).strip().split()
    # Use first name only
    if not tokens:
        return ""
    return tokens[0]

# Apply normalization to both sides of every suspect pair
df["primary_address_norm"] = df["primary_address"].apply(normalize_text)
df["secondary_address_norm"] = df["secondary_address"].apply(normalize_text)
df["primary_name_norm"] = df["primary_name"].apply(normalize_name)
df["secondary_name_norm"] = df["secondary_name"].apply(normalize_name)

# --- Embedding Model ---
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings
# Primary texts are stacked on top of secondary texts so that each field needs
# a single encode() call; the first n_pairs rows are the primary side and the
# remaining rows are the secondary side.
n_pairs = len(df)
address_texts = list(df["primary_address_norm"]) + list(df["secondary_address_norm"])
name_texts = list(df["primary_name_norm"]) + list(df["secondary_name_norm"])

encode_options = dict(batch_size=64, show_progress_bar=True, normalize_embeddings=True)
address_embeddings = model.encode(address_texts, **encode_options)
name_embeddings = model.encode(name_texts, **encode_options)

# Row-wise dot product of the primary and secondary vectors. The embeddings
# were requested with normalize_embeddings=True, so this is the cosine similarity.
primary_address_vecs = address_embeddings[:n_pairs]
secondary_address_vecs = address_embeddings[n_pairs:]
df["address_score"] = np.sum(primary_address_vecs * secondary_address_vecs, axis=1)

primary_name_vecs = name_embeddings[:n_pairs]
secondary_name_vecs = name_embeddings[n_pairs:]
df["name_score"] = np.sum(primary_name_vecs * secondary_name_vecs, axis=1)

# --- Classification Logic ---
def classify(row):
    """Label one suspect pair as "complete", "pseudo" or "no".

    Reads ``row["address_score"]`` and ``row["name_score"]``:

    * address >= 0.89 and name >= 0.75  -> "complete"
    * address >= 0.89 and name <  0.75  -> "pseudo"
    * address >= 0.75 and name >= 0.80  -> "pseudo"
    * anything else                     -> "no"
    """
    address_similarity = row["address_score"]

    if address_similarity >= 0.89:
        if row["name_score"] >= 0.75:
            return "complete"
        return "pseudo"

    # This is a safeguard for swapped name+address
    if address_similarity >= 0.75 and row["name_score"] >= 0.80:
        return "pseudo"

    return "no"

# Label every suspect pair, row by row.
df["offender_class"] = df.apply(classify, axis=1)

# Save final output: the original pair columns plus the label. The helper
# *_norm and *_score columns are not written.
output_columns = [
    "primary_id", "primary_name", "primary_address",
    "secondary_id", "secondary_name", "secondary_address",
    "offender_class",
]
df[output_columns].to_csv("example_final_verified_offenders.csv", index=False)

print("✓ Classification completed with improved accuracy.")
# This cell writes example_final_verified_offenders.csv, which contains every suspect pair together with its offender_class label.

Batches: 100%|██████████| 89/89 [00:05<00:00, 16.00it/s]
Batches: 100%|██████████| 89/89 [00:03<00:00, 22.92it/s]

✓ Classification completed with improved accuracy.





In [None]:
import pandas as pd

# Load the classified pairs written by the previous step
df = pd.read_csv("example_final_verified_offenders.csv")

# Stack the primary and secondary ids into one (customer_id, offender_class)
# table so that each customer is counted whichever side of a pair they are on.
per_side_frames = [
    df[[id_column, "offender_class"]].rename(columns={id_column: "customer_id"})
    for id_column in ("primary_id", "secondary_id")
]
id_label_df = pd.concat(per_side_frames).drop_duplicates()

# Distinct customers per class. A customer that appears under several classes
# is counted once in each of them.
summary = id_label_df.groupby("offender_class")["customer_id"].nunique()

print(summary)


offender_class
complete    1725
no          1043
pseudo      1025
Name: customer_id, dtype: int64
