In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import difflib
from itertools import combinations

# Sample data with possible duplicates
data = [
    {'id': 1, 'name': 'John Smith', 'email': 'john.smith@example.com', 'age': 30},
    {'id': 2, 'name': 'Jon Smith', 'email': 'jon.smith@example.com', 'age': 31},
    {'id': 3, 'name': 'Mary Johnson', 'email': 'mary.j@example.com', 'age': 25},
    {'id': 4, 'name': 'Mary Jhonson', 'email': 'mary.johnson@example.com', 'age': 26},
    {'id': 5, 'name': 'Jake Brown', 'email': 'jake.brown@example.com', 'age': 40}
]

df = pd.DataFrame(data)

# Function to compute string similarity using difflib (ratio between 0 and 1)
def string_sim(str1, str2):
    return difflib.SequenceMatcher(None, str1, str2).ratio()

# Generate all pairs of records (by index)
pairs = list(combinations(df.index, 2))

features = []
labels = []

# Manually labeled duplicate pairs based on index
# (0,1) means df rows with index 0 and 1 are duplicates, similarly (2,3)
duplicate_pairs = {(0, 1), (2, 3)}

for i, j in pairs:
    rec1 = df.loc[i]
    rec2 = df.loc[j]
    
    name_similarity = string_sim(rec1['name'], rec2['name'])
    email_similarity = string_sim(rec1['email'], rec2['email'])
    age_difference = abs(rec1['age'] - rec2['age'])
    
    # Feature vector for the pair
    features.append([name_similarity, email_similarity, age_difference])
    
    # Label: 1 if duplicates, else 0
    labels.append(1 if (i, j) in duplicate_pairs or (j, i) in duplicate_pairs else 0)

X = np.array(features)
y = np.array(labels)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Train Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on test set
y_pred = clf.predict(X_test)

# Show results
print("Features of test pairs:")
print(X_test)
print("True labels:", y_test)
print("Predicted labels:", y_pred)

Features of test pairs:
[[ 0.36363636  0.7        15.        ]
 [ 0.36363636  0.65        5.        ]
 [ 0.28571429  0.71111111  5.        ]]
True labels: [0 0 0]
Predicted labels: [0 0 0]
