In [None]:
import re

import numpy as np
import pandas as pd
import torch
from textblob import TextBlob
from transformers import pipeline

In [None]:
# Load the two source files; rows from True.csv are labelled real (1) and rows
# from Fake.csv are labelled fake (0) further down.
real_df = pd.read_csv('../data/True.csv')
fake_df = pd.read_csv('../data/Fake.csv')

# Create mapping for subject categories: collapse the raw per-file subject
# names into two categories shared by both dataframes.
subject_mapping = {
    # Real news categories
    'politicsNews': 'Politics',
    'worldnews': 'WorldNews',

    # Fake news categories
    'politics': 'Politics',
    'Government News': 'Politics',
    'US_News': 'Politics',
    'left-news': 'Politics',
    'News': 'WorldNews',
    'Middle-east': 'WorldNews'
}

# Apply mapping to both dataframes. Series.map() turns every subject that is
# not a key of subject_mapping into NaN.
real_df['subject'] = real_df['subject'].map(subject_mapping)
fake_df['subject'] = fake_df['subject'].map(subject_mapping)

# Verify the mapping. dropna=False keeps the NaN bucket in the counts, so an
# unmapped raw subject shows up here instead of being silently left out.
print("Real news subjects after mapping:")
print(real_df['subject'].value_counts(dropna=False))
print("\nFake news subjects after mapping:")
print(fake_df['subject'].value_counts(dropna=False))

# Add label column to distinguish real vs fake news
real_df['label'] = 1  # Real news
fake_df['label'] = 0  # Fake news

# Union the dataframes; ignore_index=True gives the result a fresh 0..n-1 index
df = pd.concat([real_df, fake_df], ignore_index=True)

print(f"Combined dataset shape: {df.shape}")

# Remove Reuters-style prefixes from text (e.g., "WASHINGTON (Reuters) - ")
# The dateline may contain letters, whitespace and commas, plus the
# punctuation found in real datelines: "." (ST. LOUIS), "/" (WASHINGTON/NEW
# YORK), "-" (PORT-AU-PRINCE) and "'" (N'DJAMENA).
_REUTERS_PREFIX_RE = re.compile(r"^[A-Z][A-Za-z\s,./'\-]+\(Reuters\)\s*-\s*")


def remove_reuters_prefix(text):
    """Strip a leading "CITY (Reuters) - " dateline from an article body.

    Parameters
    ----------
    text : object
        The article text. Anything that is not a ``str`` (for example a NaN
        read from the CSV) is returned unchanged.

    Returns
    -------
    object
        ``text`` without the dateline when it starts with one, otherwise
        ``text`` unchanged. Only a prefix anchored at the very start of the
        string is removed; later "(Reuters)" mentions are left in place.
    """
    if not isinstance(text, str):
        return text
    # Examples: WASHINGTON (Reuters) -, Beijing (Reuters) -, NEW YORK (Reuters) -,
    # WASHINGTON/NEW YORK (Reuters) -, ST. PETERSBURG, Russia (Reuters) -
    return _REUTERS_PREFIX_RE.sub('', text)

# Run every article body through remove_reuters_prefix and write the result
# back over the 'text' column.
cleaned_text = df['text'].map(remove_reuters_prefix)
df['text'] = cleaned_text

print("Removed Reuters prefixes from text")

# Step 1: Calculate repost_count BEFORE removing duplicates.
# dropna=False keeps rows whose title or text is missing inside a group, and
# 'size' counts every row of the group. With the default dropna=True those
# rows would get a NaN repost_count even though drop_duplicates() below
# groups them (it treats missing values as equal).
df['repost_count'] = (
    df.groupby(['title', 'text'], dropna=False)['title'].transform('size')
)

# Step 2: Sort by date and keep earliest (convert date to datetime first).
# errors='coerce' turns unparseable dates into NaT, which sort_values() places
# last. NOTE(review): if the two CSVs use different date formats, newer pandas
# may coerce the non-matching ones to NaT; confirm, and consider
# format='mixed' on pandas >= 2.0.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
# A stable sort keeps the original row order among equal (or missing) dates,
# so which duplicate counts as "first" below is well defined.
df = df.sort_values('date', kind='stable')

# Remove duplicates, keeping the first (earliest) occurrence
df = df.drop_duplicates(subset=['title', 'text'], keep='first')

print(f"Shape after removing duplicates: {df.shape}")

# Initialize emotion classifier on the best available device.
# Prefer MPS (Metal Performance Shaders, e.g. on an M3 Max), then CUDA, and
# fall back to the CPU so the notebook still runs on machines without a GPU.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on older torch builds
if _mps_backend is not None and _mps_backend.is_available():
    emotion_device = "mps"  # Use Metal GPU on Mac
elif torch.cuda.is_available():
    emotion_device = "cuda"
else:
    emotion_device = "cpu"

print("Loading emotion detection model...")
emotion_classifier = pipeline("text-classification",
                              model="j-hartmann/emotion-english-distilroberta-base",
                              top_k=None,  # return a score for every label, not just the best one
                              device=emotion_device,
                              batch_size=128,
                              truncation=True,
                              max_length=512)

# Add Stylometric Features for TEXT
def capital_ratio(text):
    """Return the share of alphabetic characters in ``text`` that are uppercase.

    Non-string input, an empty string, or a string without any letters
    yields 0.
    """
    if not isinstance(text, str):
        return 0

    upper_total = 0
    letter_total = 0
    for char in text:
        if char.isalpha():
            letter_total += 1
            if char.isupper():
                upper_total += 1

    # No letters at all (including the empty string): avoid dividing by zero.
    return upper_total / letter_total if letter_total else 0

def avg_sentence_length(text):
    """Return the mean number of whitespace-separated words per sentence.

    Sentences are the non-blank pieces obtained by splitting on runs of
    '.', '!' or '?'. Non-string input, an empty string, or text with no
    non-blank piece yields 0.
    """
    if not isinstance(text, str) or not text:
        return 0

    words_per_sentence = [
        len(piece.split())
        for piece in re.split(r'[.!?]+', text)
        if piece.strip()  # ignore empty / whitespace-only pieces
    ]
    if not words_per_sentence:
        return 0
    return sum(words_per_sentence) / len(words_per_sentence)

def _count_exclamations(value):
    """Return how many '!' characters ``value`` contains (0 for non-strings)."""
    return value.count('!') if isinstance(value, str) else 0


# Stylometric features computed on the article body
df['text_exclamation_count'] = df['text'].map(_count_exclamations)
df['text_capital_ratio'] = df['text'].map(capital_ratio)
df['text_avg_sentence_length'] = df['text'].map(avg_sentence_length)

# Add Stylometric Features for TITLE
df['title_exclamation_count'] = df['title'].map(_count_exclamations)
df['title_capital_ratio'] = df['title'].map(capital_ratio)

# Add Sentiment Features using TextBlob
def get_sentiment_features(text):
    """Return ``(polarity, subjectivity)`` of ``text`` as reported by TextBlob.

    Non-string or empty input returns ``(0, 0)`` without calling TextBlob.
    """
    if isinstance(text, str) and text:
        sentiment = TextBlob(text).sentiment
        return sentiment.polarity, sentiment.subjectivity
    return 0, 0

# Score each document once and split the (polarity, subjectivity) pair into
# its two columns afterwards, instead of running TextBlob once per component.
text_sentiment = [get_sentiment_features(value) for value in df['text']]
df['text_sentiment_polarity'] = [polarity for polarity, _ in text_sentiment]
df['text_sentiment_subjectivity'] = [subjectivity for _, subjectivity in text_sentiment]

title_sentiment = [get_sentiment_features(value) for value in df['title']]
df['title_sentiment_polarity'] = [polarity for polarity, _ in title_sentiment]
df['title_sentiment_subjectivity'] = [subjectivity for _, subjectivity in title_sentiment]

# Optimized batch emotion detection function
def get_emotion_features_batch(texts):
    """Classify a batch of texts and return one emotion tuple per input.

    Each result is ``(label, score, intensity)`` for the highest-scoring
    label returned by the module-level ``emotion_classifier``. Inputs that
    are not non-empty strings are never sent to the classifier and get
    ``('neutral', 0.0, 0.0)``. If the classifier call or the handling of its
    output raises, the error is printed and EVERY input in the batch falls
    back to that same default.

    NOTE(review): ``intensity`` is the maximum label score, i.e. the same
    number as ``score``; confirm whether a different measure was intended.
    """
    fallback = ('neutral', 0.0, 0.0)

    # Remember where each usable text sits so results can be put back in place.
    usable = [
        (position, item)
        for position, item in enumerate(texts)
        if isinstance(item, str) and len(item) > 0
    ]
    positions = [position for position, _ in usable]
    payload = [item for _, item in usable]

    by_position = {}
    if payload:
        try:
            # One classifier call for the whole batch.
            predictions = emotion_classifier(payload)
            for offset, label_scores in enumerate(predictions):
                best = max(label_scores, key=lambda entry: entry['score'])
                peak_score = max(entry['score'] for entry in label_scores)
                by_position[positions[offset]] = (
                    best['label'],
                    best['score'],
                    peak_score,
                )
        except Exception as e:
            print(f"Batch processing error: {e}")
            # Discard partial results: the whole batch gets the default.
            by_position = {}

    return [by_position.get(position, fallback) for position in range(len(texts))]

def _collect_emotions(values, noun, batch_size):
    """Run get_emotion_features_batch over ``values`` in slices of ``batch_size``.

    Prints a progress line after every 10th batch, using ``noun`` ("texts" or
    "titles") in the message, and returns one result tuple per input, in
    input order.
    """
    collected = []
    total = len(values)
    for start in range(0, total, batch_size):
        chunk = values[start:start + batch_size]
        collected.extend(get_emotion_features_batch(chunk))

        if (start // batch_size + 1) % 10 == 0:
            print(f"Processed {start + len(chunk)}/{total} {noun}...")
    return collected


# Process TEXT emotions in batches
print("Analyzing emotions in text (using batch processing)...")
text_list = df['text'].tolist()
batch_size = 128
all_text_emotions = _collect_emotions(text_list, "texts", batch_size)

# Each result is (label, score, intensity); spread it over three columns.
df['text_emotion'] = [x[0] for x in all_text_emotions]
df['text_emotion_score'] = [x[1] for x in all_text_emotions]
df['text_emotional_intensity'] = [x[2] for x in all_text_emotions]

# Process TITLE emotions in batches
print("Analyzing emotions in titles (using batch processing)...")
title_list = df['title'].tolist()
all_title_emotions = _collect_emotions(title_list, "titles", batch_size)

df['title_emotion'] = [x[0] for x in all_title_emotions]
df['title_emotion_score'] = [x[1] for x in all_title_emotions]
df['title_emotional_intensity'] = [x[2] for x in all_title_emotions]

# Numeric feature columns reported in the summaries below
feature_cols = [
    'text_exclamation_count', 'text_capital_ratio', 'text_avg_sentence_length',
    'text_sentiment_polarity', 'text_sentiment_subjectivity', 'text_emotional_intensity',
    'title_exclamation_count', 'title_capital_ratio',
    'title_sentiment_polarity', 'title_sentiment_subjectivity', 'title_emotional_intensity',
    'repost_count',
]

# Overall descriptive statistics of the engineered features
print("\n=== New Features Summary ===")
print(df[feature_cols].describe())

# How often each top emotion label occurs, for bodies and for titles
print("\n=== Text Emotion Distribution ===")
print(df['text_emotion'].value_counts())
print("\n=== Title Emotion Distribution ===")
print(df['title_emotion'].value_counts())

# Per-class feature means: fake (label 0) first, then real (label 1)
print("\n=== Feature Comparison: Fake vs Real News ===")
is_fake = df['label'] == 0
is_real = df['label'] == 1
print("\nFake News (label=0):")
print(df.loc[is_fake, feature_cols].mean())
print("\nReal News (label=1):")
print(df.loc[is_real, feature_cols].mean())

df.head()

In [None]:
# Write the combined frame, including every engineered column, to parquet;
# index=False leaves the row index out of the file.
features_path = '../data/combined_news_with_features.parquet'
df.to_parquet(features_path, index=False)

In [None]:
from collections import Counter


def _count_words(texts):
    """Count whitespace-separated tokens across ``texts``.

    Entries that are not strings (e.g. NaN for a missing body) are skipped,
    matching how the feature functions treat non-string text. Counting
    document by document avoids building one giant joined string per class.
    """
    counts = Counter()
    for text in texts:
        if isinstance(text, str):
            counts.update(text.split())
    return counts


# Token frequencies per class (label 0 = fake, label 1 = real)
fake_words = _count_words(df.loc[df["label"] == 0, "text"])
real_words = _count_words(df.loc[df["label"] == 1, "text"])

# Tokens that occur in one class and never in the other
exclusive_fake = set(fake_words) - set(real_words)
exclusive_real = set(real_words) - set(fake_words)

# Sets are unordered, so these 20-token samples are arbitrary.
print("Fake sample words:", list(exclusive_fake)[:20])
print("Real sample words :", list(exclusive_real)[:20])

In [None]:
# Number of tokens found only in fake articles, then only in real articles
fake_only_total = len(exclusive_fake)
real_only_total = len(exclusive_real)
print(fake_only_total, real_only_total)