In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np

# Sample data with possible duplicates
data = {
    'id': [1, 2, 3, 4],
    'name': ['Apple Inc.', 'Apple Incorporated', 'Google LLC', 'Gooogle'],
    'location': ['USA', 'United States', 'USA', 'U.S.']
}

df = pd.DataFrame(data)

# Step 1: Generate pairs of records for comparison
from itertools import combinations

pairs = list(combinations(df.index, 2))
records = []

for i, j in pairs:
    record1 = df.loc[i]
    record2 = df.loc[j]
    records.append({
        'id_1': record1['id'],
        'id_2': record2['id'],
        'name_1': record1['name'],
        'name_2': record2['name'],
        'location_1': record1['location'],
        'location_2': record2['location'],
    })

pairs_df = pd.DataFrame(records)

# Step 2: Feature engineering (TF-IDF cosine similarity)
def compute_similarity(col1, col2):
    combined = col1 + col2
    vectorizer = TfidfVectorizer().fit(combined)
    vec1 = vectorizer.transform(col1)
    vec2 = vectorizer.transform(col2)
    similarity = [cosine_similarity(v1, v2)[0][0] for v1, v2 in zip(vec1, vec2)]
    return similarity

pairs_df['name_similarity'] = compute_similarity(pairs_df['name_1'], pairs_df['name_2'])
pairs_df['location_similarity'] = compute_similarity(pairs_df['location_1'], pairs_df['location_2'])

# Step 3: Dummy labels for supervised training (in real use, you'd label examples)
# Let's say row 0 and 1 are duplicates; others are not
pairs_df['is_duplicate'] = [1, 0, 0, 1, 0, 0]

# Step 4: Train a model
X = pairs_df[['name_similarity', 'location_similarity']]
y = pairs_df['is_duplicate']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

# Step 5: Predict duplicates
pairs_df['predicted_duplicate'] = model.predict(X)

# Display final output
print("Pairwise similarity and duplicate prediction:")
print(pairs_df[['id_1', 'id_2', 'name_similarity', 'location_similarity', 'predicted_duplicate']])


Pairwise similarity and duplicate prediction:
   id_1  id_2  name_similarity  location_similarity  predicted_duplicate
0     1     2         0.271235                  0.0                    0
1     1     3         0.000000                  0.0                    0
2     1     4         0.000000                  0.0                    0
3     2     3         0.000000                  0.0                    0
4     2     4         0.000000                  0.0                    0
5     3     4         0.000000                  0.0                    0
