In [3]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.




In [4]:
import pandas as pd

# Example dataset
data = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['Jon Smith', 'John Smith', 'Alice Brown', 'Alyce Brown'],
    'email': ['jon@example.com', 'john.smith@example.com', 'alice.b@example.com', 'alyce.b@example.com'],
})
from itertools import combinations

# Generate candidate pairs (all-vs-all, or use blocking for scale)
pairs = list(combinations(data.index, 2))
print(f"Total pairs: {len(pairs)}")
from rapidfuzz import fuzz
import numpy as np

def compute_features(pair):
    i, j = pair
    row_i, row_j = data.loc[i], data.loc[j]

    features = {
        'name_similarity': fuzz.token_sort_ratio(row_i['name'], row_j['name']) / 100,
        'email_similarity': fuzz.token_sort_ratio(row_i['email'], row_j['email']) / 100,
    }
    return features

# Build feature matrix
X = pd.DataFrame([compute_features(p) for p in pairs])
# Simulated labels: 1 = duplicate, 0 = non-duplicate
# For real use cases, label manually or with rules
y = [1, 0, 1, 0, 0, 0]  # e.g., [1] if names/emails are close
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict probabilities on test
probs = model.predict_proba(X_test)[:, 1]
from sklearn.metrics import classification_report

# Use a threshold to determine duplicates
y_pred = (probs > 0.5).astype(int)

print(classification_report(y_test, y_pred))


Total pairs: 6
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
