In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np

# 1. Sample Data (Replace with your actual data loading)
# One tuple per record, in (id, name, email, address) order. The rows come in
# three near-duplicate pairs (0/2, 1/3, 4/5) that differ by small spelling or
# abbreviation changes.
sample_records = [
    (1, 'Alice Smith', 'alice.smith@example.com', '123 Main St'),
    (2, 'Bob Johnson', 'bob.johnson@example.com', '456 Oak Ave'),
    (3, 'Alise Smithe', 'alise.smithe@example.com', '123 Main Street'),
    (4, 'Bob Jonson', 'bob.jonson@example.com', '456 Oak Avenue'),
    (5, 'Charlie Brown', 'charlie.brown@example.com', '789 Pine Ln'),
    (6, 'Charlie Brown', 'charlie.brown@example.com', '789 Pine Lane'),
]
data = pd.DataFrame(sample_records, columns=['id', 'name', 'email', 'address'])

# 2. Feature Engineering
# Join name, email and address into one space-separated string per record so
# that a single text vectorizer covers all three fields.
data['combined_features'] = data['name'].str.cat(
    [data['email'], data['address']],
    sep=' ',
)

# Option 1: Using TF-IDF for text similarity
# Fit the vocabulary on the combined text and encode every record as one
# row of a sparse TF-IDF matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combined_features'])

# 3. Similarity Calculation

# Option 1: Cosine Similarity on TF-IDF vectors
# Entry [a, b] is the cosine similarity between records a and b.
similarity_matrix_tfidf = cosine_similarity(tfidf_matrix)

# 4. Deduplication using Clustering (Unsupervised)
# Using Agglomerative Clustering based on TF-IDF similarity
n_clusters = 3  # Adjust based on expected number of unique entities

# Cosine distance = 1 - cosine similarity. Floating-point round-off can push
# entries marginally below zero (and leave the diagonal not exactly zero), so
# clip negatives and force self-distance to 0.
distance_matrix_tfidf = np.clip(1 - similarity_matrix_tfidf, 0.0, None)
np.fill_diagonal(distance_matrix_tfidf, 0.0)

# metric='precomputed' tells the estimator that the input already IS a
# pairwise distance matrix. Without it, every row of the matrix is treated as
# a feature vector and euclidean distances between rows are clustered instead
# of the cosine distances computed above.
# NOTE: `metric` requires scikit-learn >= 1.2; on older releases the same
# option is spelled affinity='precomputed'.
clustering_tfidf = AgglomerativeClustering(
    n_clusters=n_clusters,
    metric='precomputed',
    linkage='average',
)
cluster_labels_tfidf = clustering_tfidf.fit_predict(distance_matrix_tfidf)

# Records that share a cluster label are treated as the same entity.
data['cluster_tfidf'] = cluster_labels_tfidf

print("Deduplication using Agglomerative Clustering (TF-IDF):")
print(data[['name', 'email', 'address', 'cluster_tfidf']])

# 5. Deduplication using Classification (Supervised - requires labeled data)
# Let's create some synthetic labels for demonstration
# In a real scenario, you would have labeled pairs of duplicates/non-duplicates
n_records = len(data)

# Every unordered pair of records, identified by ROW POSITION (i < j).
pairs = [(i, j) for i in range(n_records) for j in range(i + 1, n_records)]

# First whitespace-delimited token of each name / address, extracted once.
# Plain lists make the lookups below positional, so they stay correct when
# the DataFrame index is not 0..n-1 (e.g. after filtering or loading real
# data). An empty field yields NaN here, which never compares equal, instead
# of raising IndexError.
first_name_tokens = data['name'].str.split().str[0].tolist()
first_address_tokens = data['address'].str.split().str[0].tolist()

# Simple rule-based labeling for demonstration: a pair is a duplicate (1)
# when both the first name token and the first address token match exactly,
# otherwise it is not a duplicate (0).
labels = [
    int(
        first_name_tokens[i] == first_name_tokens[j]
        and first_address_tokens[i] == first_address_tokens[j]
    )
    for i, j in pairs
]

pairs_df = pd.DataFrame(pairs, columns=['record_index_1', 'record_index_2'])
labels_df = pd.DataFrame(labels, columns=['is_duplicate'])

# Densify the TF-IDF matrix once instead of converting two sparse rows per pair.
dense_tfidf = tfidf_matrix.toarray()

# Create feature vector for each pair (you can get creative here): the two
# records' TF-IDF rows placed side by side.
# NOTE(review): concatenation does not encode how similar the two records
# are; a per-pair similarity or difference feature may work better - confirm
# against real labeled data.
pair_features = [
    np.concatenate((dense_tfidf[index1], dense_tfidf[index2]))
    for index1, index2 in pairs
]

X = np.array(pair_features)
y = labels_df['is_duplicate'].values

# Hold out 20% of the pairs for evaluation; the fixed seed keeps the split
# reproducible. With very few positive pairs the held-out set may contain a
# single class, which makes the report below uninformative.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("\nClassification Report for Duplicate Detection:")
print(classification_report(y_test, y_pred))

# Identifying duplicate groups based on the classifier
# Score every candidate pair in one batched call rather than one call per pair.
pair_predictions = model.predict(X)

# Merge predicted-duplicate pairs into connected components, so that chains
# such as (a, b) and (b, c) collapse into ONE group {a, b, c} instead of
# several overlapping groups. Every member of a group maps to the same
# shared set object.
group_members = {}
for (index1, index2), predicted in zip(pairs, pair_predictions):
    if predicted != 1:
        continue
    merged = group_members.get(index1, {index1}) | group_members.get(index2, {index2})
    for member in merged:
        group_members[member] = merged

# record position -> sorted positions of every record in its duplicate group
duplicate_groups = {
    record_index: sorted(members)
    for record_index, members in group_members.items()
}

# Keep one copy of each group, in order of first appearance.
final_duplicate_sets = []
seen = set()
for group_indices in duplicate_groups.values():
    group_tuple = tuple(group_indices)
    if group_tuple not in seen:
        seen.add(group_tuple)
        final_duplicate_sets.append(list(group_tuple))

print("\nIdentified Duplicate Groups (based on classification):")
for group in final_duplicate_sets:
    # Pair indices are row positions, so look names up positionally.
    print([data['name'].iloc[i] for i in group])

Deduplication using Agglomerative Clustering (TF-IDF):
            name                      email          address  cluster_tfidf
0    Alice Smith    alice.smith@example.com      123 Main St              0
1    Bob Johnson    bob.johnson@example.com      456 Oak Ave              2
2   Alise Smithe   alise.smithe@example.com  123 Main Street              0
3     Bob Jonson     bob.jonson@example.com   456 Oak Avenue              2
4  Charlie Brown  charlie.brown@example.com      789 Pine Ln              1
5  Charlie Brown  charlie.brown@example.com    789 Pine Lane              1

Classification Report for Duplicate Detection:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3


Identified Duplicate Groups (based on classification):
