# In [1]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.


import pandas as pd
import recordlinkage
from recordlinkage.preprocessing import clean
from sklearn.ensemble import RandomForestClassifier

# Toy records containing two likely duplicate pairs: ids 1/2 and ids 3/4
# share a phone number and differ only in how name and address are written.
data = {
    'id': [1, 2, 3, 4],
    'name': ['John Smith', 'Jon Smith', 'Jane Doe', 'J. Doe'],
    'address': ['123 Elm St', '123 Elm Street', '456 Oak St', '456 Oak Street'],
    'phone': ['555-1234', '555-1234', '555-5678', '555-5678'],
}
df = pd.DataFrame(data)
df = df.set_index('id')


# Step 1: Preprocess the free-text columns
def _normalize_text(column):
    """Return *column* lower-cased with every non-word character removed.

    ``\\W`` matches anything that is not a letter, digit or underscore, so
    spaces and punctuation are dropped ('J. Doe' becomes 'jdoe').
    """
    lowered = column.str.lower()
    return lowered.str.replace(r'\W', '', regex=True)


df['name_clean'] = _normalize_text(df['name'])
df['address_clean'] = _normalize_text(df['address'])

# Step 2: Build the candidate pairs. A "full" index pairs every record with
# every other record; no blocking is applied.
indexer = recordlinkage.Index()
indexer.full()
candidate_links = indexer.index(df)

# Step 3: Score each candidate pair on several features.
# The cleaned text columns are compared with Jaro-Winkler similarity; the
# threshold is passed so each comparison yields a match / no-match flag
# rather than a raw score (see the recordlinkage Compare.string docs).
STRING_METHOD = 'jarowinkler'
STRING_THRESHOLD = 0.85
string_comparisons = (
    ('name_clean', 'name_sim'),        # name similarity
    ('address_clean', 'address_sim'),  # address similarity
)

compare = recordlinkage.Compare()
for column, label in string_comparisons:
    # Same column on both sides: records are compared against the same frame.
    compare.string(
        column,
        column,
        method=STRING_METHOD,
        threshold=STRING_THRESHOLD,
        label=label,
    )

# Phone numbers must match exactly.
compare.exact('phone', 'phone', label='phone_exact')

# One row per candidate pair, one column per comparison label above.
features = compare.compute(candidate_links, df)

# Step 4: Attach training labels.
# Real projects need manually labelled pairs; this example uses an exact
# phone match as a stand-in for the true duplicate label.
# NOTE: the label is therefore a copy of `phone_exact`, which is also one of
# the model inputs below, and predictions are made on the same rows the model
# was fitted on. This demonstrates the workflow; it does not measure accuracy.
features['is_duplicate'] = features['phone_exact']

# Step 5: Fit the classifier on the three comparison columns.
feature_columns = ['name_sim', 'address_sim', 'phone_exact']
X = features[feature_columns]
y = features['is_duplicate']

# fit() returns the estimator itself, so construction and fitting can chain.
model = RandomForestClassifier(random_state=42).fit(X, y)

# Step 6: Score every candidate pair and keep those flagged as duplicates.
features['predicted_duplicate'] = model.predict(X)
flagged = features['predicted_duplicate'] == 1
duplicates = features[flagged]

# The index of `features` identifies the record pair behind each row.
print("Predicted duplicate pairs:")
print(duplicates.index.tolist())


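# Output of the run above:
#   ModuleNotFoundError: No module named 'recordlinkage'
# The `recordlinkage` package was not installed in this environment; install
# it (e.g. `pip install recordlinkage`) and re-run the cell.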