# Deduplication Algorithms Testing

This notebook compares two approaches for detecting duplicate articles in our RSS feed data, so that duplicates can later be removed.

## Approaches to Test

1. **Custom Implementation** - URL matching, title similarity, content similarity, fuzzy matching
2. **all-MiniLM-L6-v2** - Sentence transformer embeddings for semantic similarity

## Goals

- Compare effectiveness of different approaches
- Measure performance (time taken, duplicates found)
- Save duplicate pairs for manual validation
- Find the best approach for our use case


In [1]:
# Import required libraries
import json
import time
import pandas as pd
import numpy as np
from datetime import datetime
from difflib import SequenceMatcher
from urllib.parse import urlparse
import re
from collections import defaultdict
import os

# For sentence transformers
try:
    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity
    SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMERS_AVAILABLE = False
    print("⚠️ sentence-transformers not available. Install with: pip install sentence-transformers scikit-learn")

print("✅ Libraries imported successfully")
print(f"📦 Sentence Transformers available: {SENTENCE_TRANSFORMERS_AVAILABLE}")


✅ Libraries imported successfully
📦 Sentence Transformers available: True


In [2]:
# Load RSS data
print("📂 Loading RSS data...")
# JSON is UTF-8 by specification and the titles contain non-ASCII characters
# (e.g. curly quotes), so the encoding is pinned instead of relying on the
# platform default.
with open('../data/rss/rss_data.json', 'r', encoding='utf-8') as f:
    rss_data = json.load(f)

articles = rss_data['articles']
print(f"✅ Loaded {len(articles)} articles")

# Convert to DataFrame for easier processing
df = pd.DataFrame(articles)
print(f"📊 DataFrame shape: {df.shape}")
print(f"📝 Columns: {list(df.columns)}")

# Show sample data (skipped for an empty feed, where df.iloc[0] would raise)
if df.empty:
    print("\n⚠️ No articles available to preview")
else:
    print("\n📋 Sample article:")
    sample = df.iloc[0]
    for col in ['title', 'url', 'feed_name', 'published']:
        if col in sample:
            # str() keeps the preview working when a field is None/NaN
            # rather than a string.
            print(f"   {col}: {str(sample[col])[:100]}...")


📂 Loading RSS data...
✅ Loaded 255 articles
📊 DataFrame shape: (255, 12)
📝 Columns: ['title', 'url', 'summary', 'content', 'published', 'author', 'feed_name', 'feed_category', 'feed_url', 'tags', 'guid', 'raw_entry']

📋 Sample article:
   title: Find out what’s new in the Gemini app in September’s Gemini Drop....
   url: https://blog.google/products/gemini/gemini-drop-september-2025/...
   feed_name: Google The Keyword...
   published: 2025-09-19T16:00:00...


In [3]:
# Helper functions for text processing
def clean_text(text):
    """Clean and normalize text for comparison.

    Lowercases the input, removes every character that is neither a word
    character nor whitespace, and collapses whitespace runs to single spaces.

    Args:
        text: Raw text. Falsy values and pandas-missing scalars (None, NaN)
            are treated as empty.

    Returns:
        The normalized string, or "" when there is nothing to clean.
    """
    if not text or pd.isna(text):
        return ""

    # Convert to lowercase
    text = str(text).lower()

    # Remove punctuation first: deleting a standalone symbol (as in "a - b")
    # leaves two adjacent spaces behind, so whitespace is collapsed afterwards.
    text = re.sub(r'[^\w\s]', '', text)

    # Collapse whitespace runs (spaces, tabs, newlines) into a single space
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def normalize_url(url):
    """Normalize URL for comparison.

    Keeps only scheme, network location and path: the query string and
    fragment are dropped, the network location is lowercased, a leading
    "www." is removed from it, and trailing slashes are stripped from the path.

    Args:
        url: URL string. Falsy values and pandas-missing scalars give "".

    Returns:
        The normalized URL, or the lowercased string form of ``url`` when it
        cannot be parsed.
    """
    if not url or pd.isna(url):
        return ""

    try:
        parsed = urlparse(url)
        # Host names are case-insensitive, so compare them in lowercase
        domain = parsed.netloc.lower()
        # Strip only a *leading* "www."; a plain replace() would also mangle
        # hosts that merely contain the substring (e.g. "awww.example.com").
        if domain.startswith('www.'):
            domain = domain[len('www.'):]
        path = parsed.path.rstrip('/')
        return f"{parsed.scheme}://{domain}{path}"
    except (ValueError, TypeError, AttributeError):
        # Malformed or non-string input: fall back to a plain lowercase form
        return str(url).lower()

def calculate_similarity(text1, text2):
    """Return the SequenceMatcher ratio of two texts.

    The result is a float between 0.0 and 1.0; it is 0.0 whenever either
    text is empty or otherwise falsy.
    """
    if text1 and text2:
        matcher = SequenceMatcher(None, text1, text2)
        return matcher.ratio()
    return 0.0

print("✅ Helper functions defined")


✅ Helper functions defined


In [4]:
# APPROACH 1: Custom Implementation
print("🔍 APPROACH 1: Custom Implementation")
print("=" * 50)

# Thresholds on SequenceMatcher ratios (range 0.0 - 1.0)
TITLE_DUPLICATE_THRESHOLD = 0.8    # titles this similar count as duplicates on their own
TITLE_CANDIDATE_THRESHOLD = 0.3    # minimum title similarity before content is compared
CONTENT_DUPLICATE_THRESHOLD = 0.7  # content this similar counts as a duplicate


def _article_ref(pos):
    """Build the summary stored for one article of a duplicate pair.

    ``pos`` is the article's row position in ``df`` (identical to its index
    label for the default RangeIndex that ``pd.DataFrame(articles)`` creates).
    """
    row = df.iloc[pos]
    return {'index': pos, 'title': row['title'], 'url': row['url'], 'feed': row['feed_name']}


def _pair_record(dup_type, first, second, similarity, **extra):
    """Build one duplicate-pair entry; ``extra`` holds optional trailing fields."""
    record = {
        'type': dup_type,
        'article1': _article_ref(first),
        'article2': _article_ref(second),
        'similarity': similarity,
    }
    record.update(extra)
    return record


start_time = time.perf_counter()

# Prepare data
df['clean_title'] = df['title'].apply(clean_text)
df['clean_content'] = df['content'].apply(clean_text)
df['normalized_url'] = df['url'].apply(normalize_url)

# Plain lists avoid a DataFrame row lookup for every pairwise comparison
clean_titles = df['clean_title'].tolist()
clean_contents = df['clean_content'].tolist()
normalized_urls = df['normalized_url'].tolist()

# Pass 1: exact URL matches. Articles without a usable URL normalize to ""
# and are skipped here; otherwise every URL-less article would be reported
# as an exact duplicate of every other URL-less article.
positions_by_url = defaultdict(list)
for pos, url in enumerate(normalized_urls):
    if url:
        positions_by_url[url].append(pos)

url_duplicates = []
found_pairs = set()  # (i, j) pairs already reported, for O(1) membership tests
for url in sorted(positions_by_url):  # sorted for a deterministic output order
    positions = positions_by_url[url]
    for offset, first in enumerate(positions):
        for second in positions[offset + 1:]:
            found_pairs.add((first, second))
            url_duplicates.append(_pair_record('exact_url', first, second, 1.0))

# Pass 2: fuzzy matching. Title similarity is computed once per pair; content
# is only compared when the titles are at least somewhat similar but not
# similar enough to be reported as title duplicates already.
title_duplicates = []
content_duplicates = []
n_articles = len(df)
for i in range(n_articles):
    for j in range(i + 1, n_articles):
        # Skip if already found as URL duplicate
        if (i, j) in found_pairs:
            continue

        title_sim = calculate_similarity(clean_titles[i], clean_titles[j])

        if title_sim > TITLE_DUPLICATE_THRESHOLD:
            title_duplicates.append(_pair_record('similar_title', i, j, title_sim))
        elif title_sim > TITLE_CANDIDATE_THRESHOLD:
            content_sim = calculate_similarity(clean_contents[i], clean_contents[j])
            if content_sim > CONTENT_DUPLICATE_THRESHOLD:
                content_duplicates.append(
                    _pair_record('similar_content', i, j, content_sim, title_similarity=title_sim)
                )

# Keep the pairs grouped by type: URL matches, then titles, then content
duplicates = url_duplicates + title_duplicates + content_duplicates

end_time = time.perf_counter()
processing_time = end_time - start_time

# Results
print(f"⏱️  Processing time: {processing_time:.2f} seconds")
print(f"🔍 Total duplicates found: {len(duplicates)}")

# Count by type
type_counts = defaultdict(int)
for dup in duplicates:
    type_counts[dup['type']] += 1

print(f"📊 Duplicates by type:")
for dup_type, count in type_counts.items():
    print(f"   {dup_type}: {count}")

# Save results
results = {
    'approach': 'custom_implementation',
    'timestamp': datetime.now().isoformat(),
    'processing_time_seconds': processing_time,
    'total_duplicates': len(duplicates),
    'duplicates_by_type': dict(type_counts),
    'duplicate_pairs': duplicates
}

os.makedirs('../data/deduplication', exist_ok=True)
with open('../data/deduplication/custom_implementation_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"💾 Results saved to ../data/deduplication/custom_implementation_results.json")
print("✅ Custom implementation completed!")


🔍 APPROACH 1: Custom Implementation
⏱️  Processing time: 12.98 seconds
🔍 Total duplicates found: 5
📊 Duplicates by type:
   exact_url: 4
   similar_title: 1
💾 Results saved to ../data/deduplication/custom_implementation_results.json
✅ Custom implementation completed!


In [None]:
# APPROACH 2: all-MiniLM-L6-v2 Sentence Transformers
print("\n🔍 APPROACH 2: all-MiniLM-L6-v2 Sentence Transformers")
print("=" * 50)

if not SENTENCE_TRANSFORMERS_AVAILABLE:
    print("❌ Sentence transformers not available. Skipping this approach.")
    print("   Install with: pip install sentence-transformers scikit-learn")
else:
    # NOTE: the timer starts before the model is constructed, so the reported
    # processing time includes model loading (which may involve a download on
    # first use - TODO confirm), not just embedding and comparison.
    start_time = time.perf_counter()

    # Load the model
    print("📥 Loading all-MiniLM-L6-v2 model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Prepare text for embedding (combine title and content)
    print("🔄 Preparing text for embedding...")
    texts = []
    for raw_title, raw_content in zip(df['title'], df['content']):
        title = str(raw_title) if pd.notna(raw_title) else ""
        content = str(raw_content) if pd.notna(raw_content) else ""
        # Combine title and content with a separator
        texts.append(f"{title} [SEP] {content}")

    # Generate embeddings
    print("🧠 Generating embeddings...")
    embeddings = model.encode(texts, show_progress_bar=True)

    # Calculate similarity matrix
    print("📊 Calculating similarity matrix...")
    similarity_matrix = cosine_similarity(embeddings)

    # Find duplicates based on similarity threshold
    threshold = 0.6  # Adjust this threshold as needed

    print(f"🔍 Finding duplicates with similarity > {threshold}...")
    # Only the strict upper triangle (k=1 skips the diagonal) is inspected so
    # each unordered pair is visited once; indices come back in row-major
    # order, i.e. the same order a nested i < j loop would produce.
    row_idx, col_idx = np.triu_indices(len(similarity_matrix), k=1)
    is_match = similarity_matrix[row_idx, col_idx] > threshold

    duplicates_st = []
    # .tolist() yields plain Python ints, which json.dump can serialize
    for i, j in zip(row_idx[is_match].tolist(), col_idx[is_match].tolist()):
        row1, row2 = df.iloc[i], df.iloc[j]
        duplicates_st.append({
            'type': 'semantic_similarity',
            'article1': {'index': i, 'title': row1['title'], 'url': row1['url'], 'feed': row1['feed_name']},
            'article2': {'index': j, 'title': row2['title'], 'url': row2['url'], 'feed': row2['feed_name']},
            'similarity': float(similarity_matrix[i, j])
        })

    end_time = time.perf_counter()
    processing_time = end_time - start_time

    # Results
    print(f"⏱️  Processing time: {processing_time:.2f} seconds")
    print(f"🔍 Total duplicates found: {len(duplicates_st)}")
    print(f"📊 Similarity threshold used: {threshold}")

    # Show some examples
    print(f"\n📋 Sample duplicates found:")
    for i, dup in enumerate(duplicates_st[:3]):
        print(f"   {i+1}. Similarity: {dup['similarity']:.3f}")
        # str() guards against missing (None/NaN) titles, which cannot be sliced
        print(f"      Article 1: {str(dup['article1']['title'])[:60]}...")
        print(f"      Article 2: {str(dup['article2']['title'])[:60]}...")
        print()

    # Save results
    results_st = {
        'approach': 'all_minilm_l6_v2',
        'timestamp': datetime.now().isoformat(),
        'processing_time_seconds': processing_time,
        'total_duplicates': len(duplicates_st),
        'similarity_threshold': threshold,
        'model_name': 'all-MiniLM-L6-v2',
        'duplicate_pairs': duplicates_st
    }

    # Create the output directory here as well so this cell does not depend
    # on another cell having created it first.
    os.makedirs('../data/deduplication', exist_ok=True)
    with open('../data/deduplication/sentence_transformers_results.json', 'w', encoding='utf-8') as f:
        json.dump(results_st, f, indent=2)

    print(f"💾 Results saved to ../data/deduplication/sentence_transformers_results.json")
    print("✅ Sentence transformers approach completed!")



🔍 APPROACH 2: all-MiniLM-L6-v2 Sentence Transformers
📥 Loading all-MiniLM-L6-v2 model...
🔄 Preparing text for embedding...
🧠 Generating embeddings...


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

📊 Calculating similarity matrix...
🔍 Finding duplicates with similarity > 0.5...
⏱️  Processing time: 4.55 seconds
🔍 Total duplicates found: 119
📊 Similarity threshold used: 0.5

📋 Sample duplicates found:
   1. Similarity: 0.885
      Article 1: Chrome: The browser you love, reimagined with AI...
      Article 2: Go behind the browser with Chrome’s new AI features...

   2. Similarity: 0.762
      Article 1: Chrome: The browser you love, reimagined with AI...
      Article 2: Google announces massive expansion of AI features in Chrome...

   3. Similarity: 0.863
      Article 1: Chrome: The browser you love, reimagined with AI...
      Article 2: Google stuffs Chrome full of AI features whether you like it...

💾 Results saved to ../data/deduplication/sentence_transformers_results.json
✅ Sentence transformers approach completed!


In [6]:
# COMPARISON AND ANALYSIS
print("\n📊 COMPARISON AND ANALYSIS")
print("=" * 50)


def _load_results(path):
    """Return the parsed JSON results stored at ``path``, or None if the file is missing."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return None


def _duplicates_per_second(results):
    """Duplicates found per second of processing; 0.0 when the recorded time is zero."""
    elapsed = results['processing_time_seconds']
    return results['total_duplicates'] / elapsed if elapsed else 0.0


# Load results from both approaches. The files may come from an earlier run
# with different settings, so the stored timestamp is printed to make stale
# results visible.
custom_results = _load_results('../data/deduplication/custom_implementation_results.json')
if custom_results is None:
    print("❌ Custom implementation results not found")
else:
    print("✅ Custom Implementation Results:")
    print(f"   ⏱️  Time: {custom_results['processing_time_seconds']:.2f}s")
    print(f"   🔍 Duplicates: {custom_results['total_duplicates']}")
    print(f"   📊 By type: {custom_results['duplicates_by_type']}")
    print(f"   🕒 Generated: {custom_results.get('timestamp', 'unknown')}")

st_results = None
if SENTENCE_TRANSFORMERS_AVAILABLE:
    st_results = _load_results('../data/deduplication/sentence_transformers_results.json')
    if st_results is None:
        print("❌ Sentence transformers results not found")
    else:
        print("\n✅ Sentence Transformers Results:")
        print(f"   ⏱️  Time: {st_results['processing_time_seconds']:.2f}s")
        print(f"   🔍 Duplicates: {st_results['total_duplicates']}")
        print(f"   📊 Threshold: {st_results['similarity_threshold']}")
        print(f"   🕒 Generated: {st_results.get('timestamp', 'unknown')}")

# Create comparison summary
comparison = {
    'timestamp': datetime.now().isoformat(),
    'total_articles_processed': len(df),
    'approaches': {}
}

if custom_results:
    comparison['approaches']['custom_implementation'] = {
        'processing_time_seconds': custom_results['processing_time_seconds'],
        'total_duplicates': custom_results['total_duplicates'],
        'duplicates_by_type': custom_results['duplicates_by_type'],
        'duplicates_per_second': _duplicates_per_second(custom_results)
    }

if st_results:
    comparison['approaches']['sentence_transformers'] = {
        'processing_time_seconds': st_results['processing_time_seconds'],
        'total_duplicates': st_results['total_duplicates'],
        'similarity_threshold': st_results['similarity_threshold'],
        'duplicates_per_second': _duplicates_per_second(st_results)
    }

# Save comparison (the directory is created here too, in case neither
# approach has written its results yet)
os.makedirs('../data/deduplication', exist_ok=True)
with open('../data/deduplication/comparison_summary.json', 'w', encoding='utf-8') as f:
    json.dump(comparison, f, indent=2)

print(f"\n💾 Comparison saved to ../data/deduplication/comparison_summary.json")
print("✅ Analysis completed!")



📊 COMPARISON AND ANALYSIS
✅ Custom Implementation Results:
   ⏱️  Time: 12.98s
   🔍 Duplicates: 5
   📊 By type: {'exact_url': 4, 'similar_title': 1}

✅ Sentence Transformers Results:
   ⏱️  Time: 4.03s
   🔍 Duplicates: 9
   📊 Threshold: 0.8

💾 Comparison saved to ../data/deduplication/comparison_summary.json
✅ Analysis completed!
