In [7]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.




In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Step 1: Create sample data with duplicates
data = {
    'id': [1, 2, 3, 4, 5, 6],
    'name': ['Alice Johnson', 'Alicia Johnson', 'Bob Smith', 'Robert Smith', 'Charlie', 'Charles'],
    'email': ['alice@example.com', 'alice.j@example.com', 'bob@example.com', 'robert.smith@example.com', 'charlie@example.com', 'charles@example.com'],
    'age': [30, 31, 40, 39, 25, 26]
}
df = pd.DataFrame(data)

# Step 2: Generate candidate pairs (all pairs for simplicity, in practice use blocking to reduce)
pairs = []
for i in range(len(df)):
    for j in range(i+1, len(df)):
        pairs.append((df.iloc[i], df.iloc[j]))

# Step 3: Feature engineering - similarity metrics for pairs
def compute_similarity_features(row1, row2):
    # Name similarity via TF-IDF cosine similarity
    vectorizer = TfidfVectorizer().fit([row1['name'], row2['name']])
    tfidf1 = vectorizer.transform([row1['name']])
    tfidf2 = vectorizer.transform([row2['name']])
    name_sim = cosine_similarity(tfidf1, tfidf2)[0][0]
    
    # Email exact match (1 if exact same, else 0)
    email_match = 1 if row1['email'].lower() == row2['email'].lower() else 0
    
    # Age difference (absolute difference)
    age_diff = abs(row1['age'] - row2['age'])
    
    return pd.Series([name_sim, email_match, age_diff])

# Step 4: Build features DataFrame
features = []
for r1, r2 in pairs:
    features.append(compute_similarity_features(r1, r2))
features_df = pd.DataFrame(features, columns=['name_sim', 'email_match', 'age_diff'])

# Step 5: Create labels for training (manually labeled here)
# Label pairs as duplicates (1) or not (0)
# Alice Johnson and Alicia Johnson are duplicates
# Bob Smith and Robert Smith duplicates
# Charlie and Charles duplicates
labels = [
    1,  # Alice Johnson & Alicia Johnson
    0,  # Alice Johnson & Bob Smith
    0,  # Alice Johnson & Robert Smith
    0,  # Alice Johnson & Charlie
    0,  # Alice Johnson & Charles
    0,  # Alicia Johnson & Bob Smith
    0,  # Alicia Johnson & Robert Smith
    0,  # Alicia Johnson & Charlie
    0,  # Alicia Johnson & Charles
    1,  # Bob Smith & Robert Smith
    0,  # Bob Smith & Charlie
    0,  # Bob Smith & Charles
    0,  # Robert Smith & Charlie
    0,  # Robert Smith & Charles
    1   # Charlie & Charles
]

# Step 6: Train/test split
X_train, X_test, y_train, y_test = train_test_split(features_df, labels, test_size=0.3, random_state=42)

# Step 7: Train classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Step 8: Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Step 9: Use classifier to predict duplicates in all pairs (for demo)
features_df['predicted_duplicate'] = clf.predict(features_df)

# Combine pair info with predictions
results = []
for idx, (r1, r2) in enumerate(pairs):
    results.append({
        'id1': r1['id'],
        'name1': r1['name'],
        'id2': r2['id'],
        'name2': r2['name'],
        'predicted_duplicate': features_df.loc[idx, 'predicted_duplicate']
    })
results_df = pd.DataFrame(results)

print("\nPredicted Duplicate Pairs:")
print(results_df[results_df['predicted_duplicate'] == 1])


              precision    recall  f1-score   support

           0       0.60      1.00      0.75         3
           1       0.00      0.00      0.00         2

    accuracy                           0.60         5
   macro avg       0.30      0.50      0.38         5
weighted avg       0.36      0.60      0.45         5


Predicted Duplicate Pairs:
Empty DataFrame
Columns: [id1, name1, id2, name2, predicted_duplicate]
Index: []


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
