# Content Filtering and Spam Detection

This notebook implements various content filtering and processing techniques for our RSS feed data.

## Features to Test

1. **Spam Detection** - Using `mrm8488/bert-tiny-finetuned-sms-spam-detection` model
2. **Content Quality Filtering** - Length, readability, language detection
3. **Topic Classification** - Categorize articles by topic
4. **Relevance Scoring** - Score articles based on tech relevance
5. **Content Cleaning** - Remove unwanted content, normalize text

## Goals

- Filter out low-quality and spam content
- Improve content quality for newsletter generation
- Save filtered results in JSON format for analysis
- Test different filtering thresholds and approaches


In [1]:
# Import required libraries
import json
import time
import pandas as pd
import numpy as np
from datetime import datetime
import re
import os
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')

# For transformers and NLP
try:
    from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
    import torch
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("⚠️ transformers not available. Install with: pip install transformers torch")

# For text processing
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False
    print("⚠️ nltk not available. Install with: pip install nltk")

# For language detection
try:
    from langdetect import detect, LangDetectException
    LANGDETECT_AVAILABLE = True
except ImportError:
    LANGDETECT_AVAILABLE = False
    print("⚠️ langdetect not available. Install with: pip install langdetect")

print("✅ Libraries imported successfully")
print(f"📦 Transformers available: {TRANSFORMERS_AVAILABLE}")
print(f"📦 NLTK available: {NLTK_AVAILABLE}")
print(f"📦 Language detection available: {LANGDETECT_AVAILABLE}")


✅ Libraries imported successfully
📦 Transformers available: True
📦 NLTK available: True
📦 Language detection available: True


In [2]:
# Load RSS data (full dataset, no deduplication)
print("📂 Loading RSS data...")
with open('../data/rss/rss_data.json', 'r') as f:
    rss_data = json.load(f)

articles = rss_data['articles']
print(f"✅ Loaded {len(articles)} articles")

# Convert to DataFrame for easier processing
df = pd.DataFrame(articles)
print(f"📊 DataFrame shape: {df.shape}")
print(f"📝 Columns: {list(df.columns)}")

# Show data distribution
print(f"\n📊 Data distribution:")
print(f"   📰 Total articles: {len(df)}")
print(f"   🏷️ Categories: {len(df['feed_category'].unique())}")
print(f"   📡 Feeds: {len(df['feed_name'].unique())}")

# Show sample data
print(f"\n📋 Sample article:")
sample = df.iloc[0]
for col in ['title', 'feed_name', 'feed_category', 'published']:
    if col in sample:
        print(f"   {col}: {sample[col][:80]}...")


📂 Loading RSS data...
✅ Loaded 255 articles
📊 DataFrame shape: (255, 12)
📝 Columns: ['title', 'url', 'summary', 'content', 'published', 'author', 'feed_name', 'feed_category', 'feed_url', 'tags', 'guid', 'raw_entry']

📊 Data distribution:
   📰 Total articles: 255
   🏷️ Categories: 6
   📡 Feeds: 23

📋 Sample article:
   title: Find out what’s new in the Gemini app in September’s Gemini Drop....
   feed_name: Google The Keyword...
   feed_category: company_blog...
   published: 2025-09-19T16:00:00...


In [3]:
# Helper functions for text processing
def clean_text(text):
    """Clean and normalize text"""
    if not text or pd.isna(text):
        return ""
    
    # Convert to string and strip
    text = str(text).strip()
    
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    return text.strip()

def get_text_length_stats(text):
    """Get text length statistics"""
    if not text:
        return {'char_count': 0, 'word_count': 0, 'sentence_count': 0}
    
    char_count = len(text)
    word_count = len(text.split())
    sentence_count = len(re.split(r'[.!?]+', text))
    
    return {
        'char_count': char_count,
        'word_count': word_count,
        'sentence_count': sentence_count
    }

def detect_language(text):
    """Detect language of text"""
    if not LANGDETECT_AVAILABLE or not text:
        return 'unknown'
    
    try:
        # Use first 500 chars for faster detection
        sample_text = text[:500]
        return detect(sample_text)
    except LangDetectException:
        return 'unknown'

def calculate_readability_score(text):
    """Simple readability score based on word/sentence ratio"""
    if not text:
        return 0
    
    sentences = re.split(r'[.!?]+', text)
    words = text.split()
    
    if len(sentences) == 0 or len(words) == 0:
        return 0
    
    avg_words_per_sentence = len(words) / len(sentences)
    
    # Simple scoring: lower is more readable
    if avg_words_per_sentence <= 10:
        return 'high'
    elif avg_words_per_sentence <= 20:
        return 'medium'
    else:
        return 'low'

print("✅ Helper functions defined")


✅ Helper functions defined


In [None]:
# APPROACH 1: Spam Detection using BERT
print("🔍 APPROACH 1: Spam Detection using BERT")
print("=" * 50)

if not TRANSFORMERS_AVAILABLE:
    print("❌ Transformers not available. Skipping spam detection.")
    print("   Install with: pip install transformers torch")
    spam_results = []
else:
    start_time = time.time()
    
    # Load the spam detection model
    print("📥 Loading BERT spam detection model...")
    try:
        spam_classifier = pipeline(
            "text-classification",
            model="mrm8488/bert-tiny-finetuned-sms-spam-detection",
            tokenizer="mrm8488/bert-tiny-finetuned-sms-spam-detection"
        )
        print("✅ Model loaded successfully")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        spam_classifier = None
    
    if spam_classifier:
        # Prepare text for spam detection (combine title + content)
        print("🔄 Preparing text for spam detection...")
        texts_for_spam = []
        for _, row in df.iterrows():
            title = clean_text(str(row['title'])) if pd.notna(row['title']) else ""
            content = clean_text(str(row['content'])) if pd.notna(row['content']) else ""
            # Combine title and content
            combined_text = f"{title} {content}".strip()
            # Limit length for BERT (512 tokens max)
            if len(combined_text) > 500:
                combined_text = combined_text[:500] + "..."
            texts_for_spam.append(combined_text)
        
        # Run spam detection
        print("🧠 Running spam detection...")
        spam_results = []
        
        for i, text in enumerate(texts_for_spam):
            if text.strip():  # Only process non-empty text
                try:
                    result = spam_classifier(text)
                    spam_results.append({
                        'index': i,
                        'text': text[:200] + "..." if len(text) > 200 else text,
                        'prediction': result[0]['label'],
                        'confidence': result[0]['score']
                    })
                except Exception as e:
                    spam_results.append({
                        'index': i,
                        'text': text[:200] + "..." if len(text) > 200 else text,
                        'prediction': 'error',
                        'confidence': 0.0,
                        'error': str(e)
                    })
            else:
                spam_results.append({
                    'index': i,
                    'text': '',
                    'prediction': 'empty',
                    'confidence': 0.0
                })
        
        end_time = time.time()
        processing_time = end_time - start_time
        
        # Analyze results
        predictions = [r['prediction'] for r in spam_results]
        prediction_counts = Counter(predictions)
        
        print(f"⏱️  Processing time: {processing_time:.2f} seconds")
        print(f"🔍 Spam detection results:")
        for pred, count in prediction_counts.items():
            print(f"   {pred}: {count}")
        
        # Show some examples
        spam_examples = [r for r in spam_results if r['prediction'] == 'spam']
        if spam_examples:
            print(f"\n📋 Sample spam detected ({len(spam_examples)} total):")
            for i, example in enumerate(spam_examples[:3]):
                print(f"   {i+1}. Confidence: {example['confidence']:.3f}")
                print(f"      Text: {example['text']}")
                print()
        
        # Save results
        spam_analysis = {
            'approach': 'bert_spam_detection',
            'model': 'mrm8488/bert-tiny-finetuned-sms-spam-detection',
            'timestamp': datetime.now().isoformat(),
            'processing_time_seconds': processing_time,
            'total_articles': len(df),
            'prediction_counts': dict(prediction_counts),
            'spam_results': spam_results
        }
        
        os.makedirs('../data/filtering', exist_ok=True)
        with open('../data/filtering/spam_detection_results.json', 'w') as f:
            json.dump(spam_analysis, f, indent=2)
        
        print(f"💾 Results saved to ../data/filtering/spam_detection_results.json")
        print("✅ Spam detection completed!")
    else:
        spam_results = []


🔍 APPROACH 1: Spam Detection using BERT
📥 Loading BERT spam detection model...


config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/324 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use mps:0


✅ Model loaded successfully
🔄 Preparing text for spam detection...
🧠 Running spam detection...
⏱️  Processing time: 21.19 seconds
🔍 Spam detection results:
   LABEL_0: 217
   LABEL_1: 38
💾 Results saved to ../data/filtering/spam_detection_results.json
✅ Spam detection completed!


In [None]:
# APPROACH 2: Content Quality Filtering
print("\n🔍 APPROACH 2: Content Quality Filtering")
print("=" * 50)

start_time = time.time()

# Analyze content quality for each article
quality_results = []

for i, row in df.iterrows():
    title = clean_text(str(row['title'])) if pd.notna(row['title']) else ""
    content = clean_text(str(row['content'])) if pd.notna(row['content']) else ""
    summary = clean_text(str(row['summary'])) if pd.notna(row['summary']) else ""
    
    # Get text statistics
    title_stats = get_text_length_stats(title)
    content_stats = get_text_length_stats(content)
    summary_stats = get_text_length_stats(summary)
    
    # Detect language
    combined_text = f"{title} {content}".strip()
    language = detect_language(combined_text)
    
    # Calculate readability
    readability = calculate_readability_score(combined_text)
    
    # Quality scoring
    quality_score = 0
    quality_issues = []
    
    # Check minimum content length
    if content_stats['word_count'] < 50:
        quality_score -= 2
        quality_issues.append('too_short_content')
    
    # Check title quality
    if title_stats['word_count'] < 3:
        quality_score -= 2
        quality_issues.append('poor_title')
    
    # Check language
    if language != 'en':
        quality_score -= 1
        quality_issues.append('non_english')
    
    # Check readability
    if readability == 'low':
        quality_score -= 1
        quality_issues.append('poor_readability')
    
    # Check for empty content
    if not content.strip():
        quality_score -= 3
        quality_issues.append('empty_content')
    
    # Check for suspicious patterns
    if re.search(r'(click here|buy now|limited time|act now)', combined_text.lower()):
        quality_score -= 2
        quality_issues.append('promotional_language')
    
    # Check for excessive capitalization
    if len(re.findall(r'[A-Z]{3,}', combined_text)) > 3:
        quality_score -= 1
        quality_issues.append('excessive_caps')
    
    # Assign quality level
    if quality_score >= 0:
        quality_level = 'high'
    elif quality_score >= -2:
        quality_level = 'medium'
    else:
        quality_level = 'low'
    
    quality_results.append({
        'index': i,
        'title': title[:100] + "..." if len(title) > 100 else title,
        'quality_score': quality_score,
        'quality_level': quality_level,
        'quality_issues': quality_issues,
        'language': language,
        'readability': readability,
        'title_stats': title_stats,
        'content_stats': content_stats,
        'summary_stats': summary_stats,
        'feed_name': row['feed_name'],
        'feed_category': row['feed_category']
    })

end_time = time.time()
processing_time = end_time - start_time

# Analyze results
quality_levels = [r['quality_level'] for r in quality_results]
quality_counts = Counter(quality_levels)

print(f"⏱️  Processing time: {processing_time:.2f} seconds")
print(f"🔍 Content quality results:")
for level, count in quality_counts.items():
    print(f"   {level}: {count}")

# Show quality issues
all_issues = []
for r in quality_results:
    all_issues.extend(r['quality_issues'])
issue_counts = Counter(all_issues)

print(f"\n📊 Quality issues found:")
for issue, count in issue_counts.most_common():
    print(f"   {issue}: {count}")

# Show examples of low quality content
low_quality = [r for r in quality_results if r['quality_level'] == 'low']
if low_quality:
    print(f"\n📋 Sample low quality articles ({len(low_quality)} total):")
    for i, example in enumerate(low_quality[:3]):
        print(f"   {i+1}. Score: {example['quality_score']}, Issues: {example['quality_issues']}")
        print(f"      Title: {example['title']}")
        print()

# Save results
quality_analysis = {
    'approach': 'content_quality_filtering',
    'timestamp': datetime.now().isoformat(),
    'processing_time_seconds': processing_time,
    'total_articles': len(df),
    'quality_distribution': dict(quality_counts),
    'issue_distribution': dict(issue_counts),
    'quality_results': quality_results
}

with open('../data/filtering/content_quality_results.json', 'w') as f:
    json.dump(quality_analysis, f, indent=2)

print(f"💾 Results saved to ../data/filtering/content_quality_results.json")
print("✅ Content quality filtering completed!")



🔍 APPROACH 2: Content Quality Filtering
⏱️  Processing time: 11.08 seconds
🔍 Content quality results:
   low: 196
   medium: 46
   high: 13

📊 Quality issues found:
   too_short_content: 202
   empty_content: 194
   excessive_caps: 28
   poor_readability: 27
   non_english: 6
   poor_title: 1

📋 Sample low quality articles (196 total):
   1. Score: -5, Issues: ['too_short_content', 'empty_content']
      Title: Find out what’s new in the Gemini app in September’s Gemini Drop.

   2. Score: -5, Issues: ['too_short_content', 'empty_content']
      Title: DOJ's remedies go significantly beyond the Court's ruling and would harm publishers and advertisers.

   3. Score: -3, Issues: ['too_short_content', 'poor_readability']
      Title: Go behind the browser with Chrome’s new AI features

💾 Results saved to ../data/filtering/content_quality_results.json
✅ Content quality filtering completed!


In [None]:
# COMPREHENSIVE FILTERING AND ANALYSIS
print("\n📊 COMPREHENSIVE FILTERING AND ANALYSIS")
print("=" * 50)

# Combine spam detection and quality filtering results
filtered_articles = []
filter_stats = {
    'total_articles': len(df),
    'spam_filtered': 0,
    'low_quality_filtered': 0,
    'passed_filters': 0,
    'filter_reasons': defaultdict(int)
}

for i, row in df.iterrows():
    article_data = {
        'index': i,
        'title': row['title'],
        'url': row['url'],
        'feed_name': row['feed_name'],
        'feed_category': row['feed_category'],
        'published': row['published'],
        'content': row['content'],
        'summary': row['summary']
    }
    
    # Check spam detection results
    spam_result = None
    if spam_results and i < len(spam_results):
        spam_result = spam_results[i]
        if spam_result['prediction'] == 'spam' and spam_result['confidence'] > 0.7:
            filter_stats['spam_filtered'] += 1
            filter_stats['filter_reasons']['spam'] += 1
            continue
    
    # Check quality filtering results
    quality_result = None
    if i < len(quality_results):
        quality_result = quality_results[i]
        if quality_result['quality_level'] == 'low':
            filter_stats['low_quality_filtered'] += 1
            filter_stats['filter_reasons']['low_quality'] += 1
            continue
    
    # Article passed all filters
    article_data['spam_analysis'] = spam_result
    article_data['quality_analysis'] = quality_result
    filtered_articles.append(article_data)
    filter_stats['passed_filters'] += 1

print(f"📊 Filtering Results:")
print(f"   📰 Total articles: {filter_stats['total_articles']}")
print(f"   🚫 Spam filtered: {filter_stats['spam_filtered']}")
print(f"   📉 Low quality filtered: {filter_stats['low_quality_filtered']}")
print(f"   ✅ Passed filters: {filter_stats['passed_filters']}")

print(f"\n📋 Filter reasons:")
for reason, count in filter_stats['filter_reasons'].items():
    print(f"   {reason}: {count}")

# Calculate filtering efficiency
filtering_rate = (filter_stats['spam_filtered'] + filter_stats['low_quality_filtered']) / filter_stats['total_articles'] * 100
print(f"\n📈 Filtering efficiency: {filtering_rate:.1f}% of articles filtered")

# Save comprehensive results
comprehensive_results = {
    'timestamp': datetime.now().isoformat(),
    'filter_stats': dict(filter_stats),
    'filtering_rate_percent': filtering_rate,
    'filtered_articles': filtered_articles,
    'spam_detection_summary': {
        'model_used': 'mrm8488/bert-tiny-finetuned-sms-spam-detection',
        'total_analyzed': len(spam_results) if spam_results else 0,
        'spam_detected': filter_stats['spam_filtered']
    },
    'quality_filtering_summary': {
        'total_analyzed': len(quality_results),
        'low_quality_filtered': filter_stats['low_quality_filtered']
    }
}

with open('../data/filtering/comprehensive_filtering_results.json', 'w') as f:
    json.dump(comprehensive_results, f, indent=2)

print(f"\n💾 Comprehensive results saved to ../data/filtering/comprehensive_filtering_results.json")
print("✅ Comprehensive filtering analysis completed!")



📊 COMPREHENSIVE FILTERING AND ANALYSIS
📊 Filtering Results:
   📰 Total articles: 255
   🚫 Spam filtered: 0
   📉 Low quality filtered: 196
   ✅ Passed filters: 59

📋 Filter reasons:
   low_quality: 196

📈 Filtering efficiency: 76.9% of articles filtered

💾 Comprehensive results saved to ../data/filtering/comprehensive_filtering_results.json
✅ Comprehensive filtering analysis completed!


In [5]:
# APPROACH 3: Ad Detection using BERT
print("\n🔍 APPROACH 3: Ad Detection using BERT")
print("=" * 50)

if not TRANSFORMERS_AVAILABLE:
    print("❌ Transformers not available. Skipping ad detection.")
    print("   Install with: pip install transformers torch")
    ad_results = []
else:
    start_time = time.time()
    
    # Load the BERT ad detection model
    print("📥 Loading BERT ad detection model...")
    try:
        ad_classifier = pipeline(
            "text-classification",
            model="bondarchukb/bert-ads-classification",
            tokenizer="bondarchukb/bert-ads-classification"
        )
        print("✅ BERT ad detection model loaded successfully")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        raise Exception(f"Failed to load ad detection model: {e}")
    
    if ad_classifier:
        # Prepare text for ad detection (combine title + content)
        print("🔄 Preparing text for ad detection...")
        texts_for_ads = []
        for _, row in df.iterrows():
            title = clean_text(str(row['title'])) if pd.notna(row['title']) else ""
            content = clean_text(str(row['content'])) if pd.notna(row['content']) else ""
            # Combine title and content
            combined_text = f"{title} {content}".strip()
            # Limit length for RoBERTa (512 tokens max)
            if len(combined_text) > 500:
                combined_text = combined_text[:500] + "..."
            texts_for_ads.append(combined_text)
        
        # Run ad detection
        print("🧠 Running ad detection...")
        ad_results = []
        
        for i, text in enumerate(texts_for_ads):
            if text.strip():  # Only process non-empty text
                try:
                    result = ad_classifier(text)
                    ad_results.append({
                        'index': i,
                        'text': text[:200] + "..." if len(text) > 200 else text,
                        'prediction': result[0]['label'],
                        'confidence': result[0]['score']
                    })
                except Exception as e:
                    ad_results.append({
                        'index': i,
                        'text': text[:200] + "..." if len(text) > 200 else text,
                        'prediction': 'error',
                        'confidence': 0.0,
                        'error': str(e)
                    })
            else:
                ad_results.append({
                    'index': i,
                    'text': '',
                    'prediction': 'empty',
                    'confidence': 0.0
                })
        
        end_time = time.time()
        processing_time = end_time - start_time
        
        # Analyze results
        predictions = [r['prediction'] for r in ad_results]
        prediction_counts = Counter(predictions)
        
        print(f"⏱️  Processing time: {processing_time:.2f} seconds")
        print(f"🔍 Ad detection results:")
        for pred, count in prediction_counts.items():
            print(f"   {pred}: {count}")
        
        # Show some examples
        ad_examples = [r for r in ad_results if r['prediction'] == 'ad']
        if ad_examples:
            print(f"\n📋 Sample ads detected ({len(ad_examples)} total):")
            for i, example in enumerate(ad_examples[:3]):
                print(f"   {i+1}. Confidence: {example['confidence']:.3f}")
                print(f"      Text: {example['text']}")
                print()
        
        # Save results
        ad_analysis = {
            'approach': 'bert_ad_detection',
            'model': 'bondarchukb/bert-ads-classification',
            'timestamp': datetime.now().isoformat(),
            'processing_time_seconds': processing_time,
            'total_articles': len(df),
            'prediction_counts': dict(prediction_counts),
            'ad_results': ad_results
        }
        
        with open('../data/filtering/ad_detection_results.json', 'w') as f:
            json.dump(ad_analysis, f, indent=2)
        
        print(f"💾 Results saved to ../data/filtering/ad_detection_results.json")
        print("✅ Ad detection completed!")
    else:
        ad_results = []



🔍 APPROACH 3: Ad Detection using BERT
📥 Loading BERT ad detection model...


config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use mps:0


✅ BERT ad detection model loaded successfully
🔄 Preparing text for ad detection...
🧠 Running ad detection...
⏱️  Processing time: 22.45 seconds
🔍 Ad detection results:
   LABEL_1: 170
   LABEL_0: 85
💾 Results saved to ../data/filtering/ad_detection_results.json
✅ Ad detection completed!


In [4]:
# APPROACH 4: News Classification using RoBERTa
print("\n🔍 APPROACH 4: News Classification using RoBERTa")
print("=" * 50)

if not TRANSFORMERS_AVAILABLE:
    print("❌ Transformers not available. Skipping news classification.")
    print("   Install with: pip install transformers torch")
    news_results = []
else:
    start_time = time.time()
    
    # Load the RoBERTa news classification model
    print("📥 Loading RoBERTa news classification model...")
    try:
        news_classifier = pipeline(
            "text-classification",
            model="resul-ai/roberta-news-classifier",
            tokenizer="resul-ai/roberta-news-classifier"
        )
        print("✅ RoBERTa news classification model loaded successfully")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        raise Exception(f"Failed to load news classification model: {e}")
    
    if news_classifier:
        # Prepare text for news classification
        print("🔄 Preparing text for news classification...")
        texts_to_classify = []
        for i, row in df.iterrows():
            # Combine title and content for better classification
            text = f"{row['title']} {row.get('summary', '')} {row.get('content', '')}"
            cleaned_text = clean_text(text)[:500]  # Limit to 500 chars for efficiency
            texts_to_classify.append(cleaned_text)
        
        # Run news classification
        print("🧠 Running news classification...")
        news_results = []
        prediction_counts = defaultdict(int)
        
        for i, text in enumerate(texts_to_classify):
            try:
                # Get classification result
                result = news_classifier(text)
                
                # Extract prediction
                if isinstance(result, list) and len(result) > 0:
                    prediction = result[0]
                    label = prediction['label']
                    score = prediction['score']
                else:
                    label = "unknown"
                    score = 0.0
                
                # Store result
                news_results.append({
                    'index': i,
                    'title': df.iloc[i]['title'],
                    'feed_name': df.iloc[i]['feed_name'],
                    'feed_category': df.iloc[i]['feed_category'],
                    'text_analyzed': text[:100] + "..." if len(text) > 100 else text,
                    'predicted_category': label,
                    'confidence_score': score,
                    'is_news': label in ['teknoloji', 'siyaset', 'dunya', 'ekonomi', 'saglik', 'spor', 'kultur']  # Turkish labels from the model
                })
                
                prediction_counts[label] += 1
                
            except Exception as e:
                print(f"   ⚠️ Error classifying article {i}: {e}")
                news_results.append({
                    'index': i,
                    'title': df.iloc[i]['title'],
                    'feed_name': df.iloc[i]['feed_name'],
                    'feed_category': df.iloc[i]['feed_category'],
                    'text_analyzed': text[:100] + "..." if len(text) > 100 else text,
                    'predicted_category': 'error',
                    'confidence_score': 0.0,
                    'is_news': False
                })
                prediction_counts['error'] += 1
        
        processing_time = time.time() - start_time
        
        # Display results
        print(f"⏱️  Processing time: {processing_time:.2f} seconds")
        print("🔍 News classification results:")
        for label, count in sorted(prediction_counts.items(), key=lambda x: x[1], reverse=True):
            percentage = (count / len(df)) * 100
            print(f"   {label}: {count} ({percentage:.1f}%)")
        
        # Calculate news vs non-news statistics
        news_count = sum(1 for r in news_results if r['is_news'])
        non_news_count = len(news_results) - news_count
        
        print(f"\n📊 News vs Non-News Summary:")
        print(f"   📰 News articles: {news_count} ({(news_count/len(df)*100):.1f}%)")
        print(f"   📄 Non-news articles: {non_news_count} ({(non_news_count/len(df)*100):.1f}%)")
        
        # Show top categories
        print(f"\n🏷️ Top News Categories:")
        tech_articles = [r for r in news_results if r['predicted_category'] == 'teknoloji']
        if tech_articles:
            print(f"   🔧 Technology: {len(tech_articles)} articles")
        
        politics_articles = [r for r in news_results if r['predicted_category'] == 'siyaset']
        if politics_articles:
            print(f"   🏛️ Politics: {len(politics_articles)} articles")
        
        world_articles = [r for r in news_results if r['predicted_category'] == 'dunya']
        if world_articles:
            print(f"   🌍 World: {len(world_articles)} articles")
        
        # Save results
        news_analysis = {
            'approach': 'roberta_news_classification',
            'model': 'resul-ai/roberta-news-classifier',
            'timestamp': datetime.now().isoformat(),
            'processing_time_seconds': processing_time,
            'total_articles': len(df),
            'prediction_counts': dict(prediction_counts),
            'news_statistics': {
                'total_news': news_count,
                'total_non_news': non_news_count,
                'news_percentage': (news_count/len(df)*100),
                'non_news_percentage': (non_news_count/len(df)*100)
            },
            'news_results': news_results
        }
        
        # Create news classification directory
        os.makedirs('../data/filtering', exist_ok=True)
        
        with open('../data/filtering/news_classification_results.json', 'w') as f:
            json.dump(news_analysis, f, indent=2)
        
        print(f"💾 Results saved to ../data/filtering/news_classification_results.json")
        print("✅ News classification completed!")
    else:
        news_results = []



🔍 APPROACH 4: News Classification using RoBERTa
📥 Loading RoBERTa news classification model...


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Device set to use mps:0


✅ RoBERTa news classification model loaded successfully
🔄 Preparing text for news classification...
🧠 Running news classification...
⏱️  Processing time: 32.03 seconds
🔍 News classification results:
   LABEL_6: 169 (66.3%)
   LABEL_2: 63 (24.7%)
   LABEL_1: 13 (5.1%)
   LABEL_3: 10 (3.9%)

📊 News vs Non-News Summary:
   📰 News articles: 0 (0.0%)
   📄 Non-news articles: 255 (100.0%)

🏷️ Top News Categories:
💾 Results saved to ../data/filtering/news_classification_results.json
✅ News classification completed!


In [None]:
# UPDATED COMPREHENSIVE FILTERING AND ANALYSIS
print("\n📊 UPDATED COMPREHENSIVE FILTERING AND ANALYSIS")
print("=" * 50)

# Combine spam detection, quality filtering, ad detection, and news classification results
filtered_articles = []
filter_stats = {
    'total_articles': len(df),
    'spam_filtered': 0,
    'low_quality_filtered': 0,
    'ad_filtered': 0,
    'non_news_filtered': 0,
    'passed_filters': 0,
    'filter_reasons': defaultdict(int)
}

for i, row in df.iterrows():
    article_data = {
        'index': i,
        'title': row['title'],
        'url': row['url'],
        'feed_name': row['feed_name'],
        'feed_category': row['feed_category'],
        'published': row['published'],
        'content': row['content'],
        'summary': row['summary']
    }
    
    # Check spam detection results
    spam_result = None
    if spam_results and i < len(spam_results):
        spam_result = spam_results[i]
        if spam_result['prediction'] == 'LABEL_1' and spam_result['confidence'] > 0.7:
            filter_stats['spam_filtered'] += 1
            filter_stats['filter_reasons']['spam'] += 1
            continue
    
    # Check ad detection results
    ad_result = None
    if ad_results and i < len(ad_results):
        ad_result = ad_results[i]
        if ad_result['prediction'] == 'ad' and ad_result['confidence'] > 0.7:
            filter_stats['ad_filtered'] += 1
            filter_stats['filter_reasons']['ad'] += 1
            continue
    
    # Check quality filtering results
    quality_result = None
    if i < len(quality_results):
        quality_result = quality_results[i]
        if quality_result['quality_level'] == 'low':
            filter_stats['low_quality_filtered'] += 1
            filter_stats['filter_reasons']['low_quality'] += 1
            continue
    
    # Check news classification results
    news_result = None
    if news_results and i < len(news_results):
        news_result = news_results[i]
        if news_result['is_news'] == False:
            filter_stats['non_news_filtered'] += 1
            filter_stats['filter_reasons']['non_news'] += 1
            continue
    
    # Article passed all filters
    article_data['spam_analysis'] = spam_result
    article_data['ad_analysis'] = ad_result
    article_data['quality_analysis'] = quality_result
    article_data['news_analysis'] = news_result
    filtered_articles.append(article_data)
    filter_stats['passed_filters'] += 1

print(f"📊 Updated Filtering Results:")
print(f"   📰 Total articles: {filter_stats['total_articles']}")
print(f"   🚫 Spam filtered: {filter_stats['spam_filtered']}")
print(f"   📢 Ad filtered: {filter_stats['ad_filtered']}")
print(f"   📉 Low quality filtered: {filter_stats['low_quality_filtered']}")
print(f"   📄 Non-news filtered: {filter_stats['non_news_filtered']}")
print(f"   ✅ Passed filters: {filter_stats['passed_filters']}")

print(f"\n📋 Filter reasons:")
for reason, count in filter_stats['filter_reasons'].items():
    print(f"   {reason}: {count}")

# Calculate filtering efficiency
total_filtered = filter_stats['spam_filtered'] + filter_stats['low_quality_filtered'] + filter_stats['ad_filtered'] + filter_stats['non_news_filtered']
filtering_rate = total_filtered / filter_stats['total_articles'] * 100
print(f"\n📈 Total filtering efficiency: {filtering_rate:.1f}% of articles filtered")

# Save updated comprehensive results
updated_comprehensive_results = {
    'timestamp': datetime.now().isoformat(),
    'filter_stats': dict(filter_stats),
    'filtering_rate_percent': filtering_rate,
    'filtered_articles': filtered_articles,
    'spam_detection_summary': {
        'model_used': 'mrm8488/bert-tiny-finetuned-sms-spam-detection',
        'total_analyzed': len(spam_results) if spam_results else 0,
        'spam_detected': filter_stats['spam_filtered']
    },
    'ad_detection_summary': {
        'model_used': '0x7o/roberta-base-ad-detector',
        'total_analyzed': len(ad_results) if ad_results else 0,
        'ads_detected': filter_stats['ad_filtered']
    },
    'quality_filtering_summary': {
        'total_analyzed': len(quality_results),
        'low_quality_filtered': filter_stats['low_quality_filtered']
    },
    'news_classification_summary': {
        'model_used': 'resul-ai/roberta-news-classifier',
        'total_analyzed': len(news_results) if news_results else 0,
        'non_news_filtered': filter_stats['non_news_filtered']
    }
}

with open('../data/filtering/updated_comprehensive_filtering_results.json', 'w') as f:
    json.dump(updated_comprehensive_results, f, indent=2)

print(f"\n💾 Updated comprehensive results saved to ../data/filtering/updated_comprehensive_filtering_results.json")
print("✅ Updated comprehensive filtering analysis completed!")
