In [3]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.




In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Toy records for the demo: rows 1/2 and rows 3/4 are near-duplicate people
# (slightly different spellings of the same name and email).
data = dict(
    id=[1, 2, 3, 4],
    name=['John Smith', 'Jon Smyth', 'Alice Johnson', 'Alicia Jonson'],
    email=['john@example.com', 'jon@example.com', 'alice@example.com', 'alicia@example.com'],
)

# Column order follows the key order above: id, name, email.
df = pd.DataFrame(data)

# Enumerate every unordered pair of row positions (i < j) as comparison candidates.
pairs = [
    (i, j)
    for i in range(len(df))
    for j in range(i + 1, len(df))
]

# Build one feature row and one label per candidate pair.
# Each feature row is [name score, email score], both computed with fuzz.ratio.
#
# NOTE: the labels below are a stand-in for demonstration only. They are derived
# from the very same scores that are used as features, so the classifier merely
# re-learns this rule. A real pipeline needs independently verified labels
# (1=duplicate, 0=not).
DUPLICATE_THRESHOLD = 80  # both scores must exceed this for a demo "duplicate" label

# `pairs` holds row positions, so read the columns out once as plain lists and
# index them positionally. This stays correct even when df's index is not
# 0..n-1, and avoids repeated scalar lookups inside the loop.
names = df['name'].tolist()
emails = df['email'].tolist()

features = []
labels = []

for i, j in pairs:
    # Fuzzy matching scores for names and emails
    name_ratio = fuzz.ratio(names[i], names[j])
    email_ratio = fuzz.ratio(emails[i], emails[j])

    features.append([name_ratio, email_ratio])

    is_duplicate = name_ratio > DUPLICATE_THRESHOLD and email_ratio > DUPLICATE_THRESHOLD
    labels.append(1 if is_duplicate else 0)

# Rows of X are in the same order as `pairs`.
X = pd.DataFrame(features, columns=['name_similarity', 'email_similarity'])
y = labels

# Hold out 30% of the pairs for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a logistic-regression classifier on the training pairs
# (fit() returns the fitted estimator itself).
clf = LogisticRegression().fit(X_train, y_train)

# Score the held-out pairs and print per-class metrics.
y_pred = clf.predict(X_test)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Identify duplicates among all candidate pairs.
# Predict once on the full feature frame X instead of once per pair: X carries
# the same column names the classifier was fitted with (so scikit-learn does not
# warn about missing feature names), and its rows are in the same order as `pairs`.
all_predictions = clf.predict(X)

# `pairs` stores row positions, so look the ids up positionally.
ids = df['id']
duplicates = [
    (ids.iloc[i], ids.iloc[j])
    for (i, j), pred in zip(pairs, all_predictions)
    if pred == 1
]

print("Duplicate pairs detected (by id):", duplicates)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

Duplicate pairs detected (by id): [(1, 2), (3, 4)]


