In [None]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.




In [2]:
import random
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Step 1: Load dataset
data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
docs = data.data[:500]  # Use a subset for speed

# Step 2: Simulate duplicates by modifying text
def simulate_typo(text):
    return re.sub(r'\b(\w+)\b', lambda m: m.group(0)[:-1] if len(m.group(0)) > 3 and random.random() < 0.2 else m.group(0), text)

original = docs[:250]
duplicates = [simulate_typo(doc) for doc in original]

# Positive class: duplicate pairs
pos_pairs = list(zip(original, duplicates))
pos_labels = [1] * len(pos_pairs)

# Negative class: random unrelated pairs
neg_pairs = [(original[i], original[j]) for i in range(100) for j in range(i+1, i+2)]
neg_labels = [0] * len(neg_pairs)

# Combine
all_pairs = pos_pairs + neg_pairs
all_labels = pos_labels + neg_labels

# Step 3: Feature extraction using TF-IDF + Cosine Similarity
texts1 = [a for a, b in all_pairs]
texts2 = [b for a, b in all_pairs]

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = vectorizer.fit_transform(texts1 + texts2)

tfidf_1 = tfidf_matrix[:len(texts1)]
tfidf_2 = tfidf_matrix[len(texts1):]

cosine_sims = [cosine_similarity(tfidf_1[i], tfidf_2[i])[0][0] for i in range(len(texts1))]
features = pd.DataFrame({'cosine_sim': cosine_sims})

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, all_labels, test_size=0.3, random_state=42)

# Step 5: Model training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 6: Prediction & Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.94      1.00      0.97        30
           1       1.00      0.97      0.99        75

    accuracy                           0.98       105
   macro avg       0.97      0.99      0.98       105
weighted avg       0.98      0.98      0.98       105

