In [9]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.




In [10]:
import pandas as pd
import recordlinkage
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# ----------------------------------------
# 1. Sample Data
# ----------------------------------------
data = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['John Smith', 'Jon Smith', 'Alice Johnson', 'Alyce Jonson'],
    'email': ['john@example.com', 'jon@example.com', 'alicej@gmail.com', 'alycej@gmail.com']
})
data.set_index('id', inplace=True)

# ----------------------------------------
# 2. Generate Candidate Pairs
# ----------------------------------------
indexer = recordlinkage.Index()
indexer.full()  # All combinations
candidate_pairs = indexer.index(data)

# ----------------------------------------
# 3. Compare Features
# ----------------------------------------
compare = recordlinkage.Compare()
compare.string('name', 'name', method='jarowinkler', label='name_sim')
compare.string('email', 'email', method='jarowinkler', label='email_sim')

features = compare.compute(candidate_pairs, data)

# ----------------------------------------
# 4. Define Labeled Pairs (Known Duplicates/Non-Duplicates)
# ----------------------------------------
raw_labels = [
    ((1, 2), 1),   # John Smith vs Jon Smith = duplicate
    ((3, 4), 1),   # Alice Johnson vs Alyce Jonson = duplicate
    ((1, 3), 0)    # John Smith vs Alice Johnson = not duplicate
]

# Normalize labels to match feature index format
formatted_labels = {
    tuple(sorted(pair)): label for pair, label in raw_labels
}

labels = pd.Series(formatted_labels)

# Keep only labels that exist in features
labels = labels[labels.index.isin(features.index)]

# Raise error if no matches (defensive coding)
if labels.empty:
    raise ValueError("No labeled pairs match the feature index. Check ID formats and candidate pairs.")

X = features.loc[labels.index]
y = labels

# ----------------------------------------
# 5. Train ML Model
# ----------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

print(f"Model Accuracy: {model.score(X_test, y_test):.2f}")

# ----------------------------------------
# 6. Predict on All Candidate Pairs
# ----------------------------------------
predictions = model.predict(features)
predicted_duplicates = features[predictions == 1]

print("\nPredicted Duplicates:")
print(predicted_duplicates)




ValueError: No labeled pairs match the feature index. Check ID formats and candidate pairs.