In [14]:
import pandas as pd
import re
from textblob import TextBlob
import numpy as np
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import contractions
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.sparse import hstack, csr_matrix

# Preprocessing 1

In [None]:
real_df = pd.read_csv('../data/True.csv')
fake_df = pd.read_csv('../data/Fake.csv')

# Create mapping for subject categories
subject_mapping = {
    # Real news categories
    'politicsNews': 'Politics',
    'worldnews': 'WorldNews',
    
    # Fake news categories
    'politics': 'Politics',
    'Government News': 'Politics',
    'US_News': 'Politics',
    'left-news': 'Politics',
    'News': 'WorldNews',
    'Middle-east': 'WorldNews'
}

# Apply mapping to both dataframes
real_df['subject'] = real_df['subject'].map(subject_mapping)
fake_df['subject'] = fake_df['subject'].map(subject_mapping)

# Add label column to distinguish real vs fake news
real_df['label'] = 1  # Real news
fake_df['label'] = 0  # Fake news

# Union the dataframes
df = pd.concat([real_df, fake_df], ignore_index=True)

print(f"Combined dataset shape: {df.shape}")

# Remove Reuters-style prefixes from text (e.g., "WASHINGTON (Reuters) - ")
def remove_reuters_prefix(text):
    if not isinstance(text, str):
        return text
    pattern = r'^[A-Z][A-Za-z\s,]+\(Reuters\)\s*-\s*'
    return re.sub(pattern, '', text)

df['text'] = df['text'].apply(remove_reuters_prefix)

print("Removed Reuters prefixes from text")

# Calculate repost_count
df['repost_count'] = df.groupby(['title', 'text'])['title'].transform('count')

# Sort by date and keep earliest
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.sort_values('date')

# Remove duplicates
df = df.drop_duplicates(subset=['title', 'text'], keep='first')

print(f"Shape after removing duplicates: {df.shape}")

# Emotion Classifier Pipeline
emotion_classifier = pipeline("text-classification", 
                              model="j-hartmann/emotion-english-distilroberta-base",
                              top_k=None,
                              device="mps",
                              batch_size=128,
                              truncation=True,
                              max_length=512)

# Features from Text and Title
def capital_ratio(text):
    if not isinstance(text, str) or len(text) == 0:
        return 0
    letters = [c for c in text if c.isalpha()]
    if len(letters) == 0:
        return 0
    return sum(1 for c in letters if c.isupper()) / len(letters)

def avg_sentence_length(text):
    if not isinstance(text, str) or len(text) == 0:
        return 0
    sentences = re.split(r'[.!?]+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    if len(sentences) == 0:
        return 0
    total_words = sum(len(s.split()) for s in sentences)
    return total_words / len(sentences)

df['text_exclamation_count'] = df['text'].apply(lambda x: x.count('!') if isinstance(x, str) else 0)
df['text_capital_ratio'] = df['text'].apply(capital_ratio)
df['text_avg_sentence_length'] = df['text'].apply(avg_sentence_length)

df['title_exclamation_count'] = df['title'].apply(lambda x: x.count('!') if isinstance(x, str) else 0)
df['title_capital_ratio'] = df['title'].apply(capital_ratio)

# Add Sentiment Features
def get_sentiment_features(text):
    if not isinstance(text, str) or len(text) == 0:
        return 0, 0
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity

df['text_sentiment_polarity'] = df['text'].apply(lambda x: get_sentiment_features(x)[0])
df['text_sentiment_subjectivity'] = df['text'].apply(lambda x: get_sentiment_features(x)[1])
df['title_sentiment_polarity'] = df['title'].apply(lambda x: get_sentiment_features(x)[0])
df['title_sentiment_subjectivity'] = df['title'].apply(lambda x: get_sentiment_features(x)[1])

def get_emotion_features(texts):
    """Process texts in batches for much faster inference"""
    results = []
    
    # Filter out invalid texts and track indices
    valid_texts = []
    valid_indices = []
    
    for idx, text in enumerate(texts):
        if isinstance(text, str) and len(text) > 0:
            valid_texts.append(text)
            valid_indices.append(idx)
    
    # Process all valid texts at once
    if valid_texts:
        try:
            batch_emotions = emotion_classifier(valid_texts)
            
            # Map results back to original indices
            emotion_dict = {}
            for i, emotions in enumerate(batch_emotions):
                top_emotion = max(emotions, key=lambda x: x['score'])
                emotional_intensity = max([e['score'] for e in emotions])
                
                emotion_dict[valid_indices[i]] = (
                    top_emotion['label'],
                    top_emotion['score'],
                    emotional_intensity
                )
        except Exception as e:
            print(f"Batch processing error: {e}")
            emotion_dict = {}
    else:
        emotion_dict = {}
    
    # Build results list with defaults for invalid texts
    for idx in range(len(texts)):
        if idx in emotion_dict:
            results.append(emotion_dict[idx])
        else:
            results.append(('neutral', 0.0, 0.0))
    
    return results

# Process Text emotions
text_list = df['text'].tolist()
batch_size = 128
all_text_emotions = []

for i in range(0, len(text_list), batch_size):
    batch = text_list[i:i+batch_size]
    batch_results = get_emotion_features(batch)
    all_text_emotions.extend(batch_results)
    
    if (i // batch_size + 1) % 10 == 0:
        print(f"Processed {i + len(batch)}/{len(text_list)} texts...")

df['text_emotion'] = [x[0] for x in all_text_emotions]
df['text_emotion_score'] = [x[1] for x in all_text_emotions]
df['text_emotional_intensity'] = [x[2] for x in all_text_emotions]

# Process Title emotions
print("Analyzing emotions in titles (using batch processing)...")
title_list = df['title'].tolist()
all_title_emotions = []

for i in range(0, len(title_list), batch_size):
    batch = title_list[i:i+batch_size]
    batch_results = get_emotion_features(batch)
    all_title_emotions.extend(batch_results)
    
    if (i // batch_size + 1) % 10 == 0:
        print(f"Processed {i + len(batch)}/{len(title_list)} titles...")

df['title_emotion'] = [x[0] for x in all_title_emotions]
df['title_emotion_score'] = [x[1] for x in all_title_emotions]
df['title_emotional_intensity'] = [x[2] for x in all_title_emotions]

feature_cols = ['text_exclamation_count', 'text_capital_ratio', 'text_avg_sentence_length',
                'text_sentiment_polarity', 'text_sentiment_subjectivity', 'text_emotional_intensity',
                'title_exclamation_count', 'title_capital_ratio',
                'title_sentiment_polarity', 'title_sentiment_subjectivity', 'title_emotional_intensity',
                'repost_count']
print(df[feature_cols].describe())

df.head()

In [None]:
df.to_parquet('../data/combined_news_with_features.parquet', index=False)

# Preprocessing 2

In [15]:
df = pd.read_parquet('../data/combined_news_with_features.parquet')

In [None]:
df['repost_count'] = df['repost_count'].values - 1 # Adjust repost_count to exclude original post

In [None]:
df.drop(columns=['title_emotional_intensity', 'text_emotional_intensity'], inplace=True) # Intensity and Score were identical, removing Intensity, keeping Score

In [None]:
"""
    Clean Text
"""

def expand_contractions(text):
    # Protect common abbreviations before expansion
    protected = {
        r'\bu\.?s\.?\b': '<USABBREV>',      
        r'\bu\.?k\.?\b': '<UKABBREV>',      
        r'\bph\.?d\.?\b': '<PHDABBREV>',    
        r'\bdr\.': '<DRABBREV>',            
        r'\bmr\.': '<MRABBREV>',            
        r'\bmrs\.': '<MRSABBREV>',          
        r'\bms\.': '<MSABBREV>',            
    }
    
    # Replace abbreviations with placeholders
    for pattern, placeholder in protected.items():
        text = re.sub(pattern, placeholder, text, flags=re.IGNORECASE)
    
    # Expand contractions
    text = contractions.fix(text)
    
    # Restore abbreviations (keep them as single tokens)
    text = text.replace('<USABBREV>', 'us')
    text = text.replace('<UKABBREV>', 'uk')
    text = text.replace('<PHDABBREV>', 'phd')
    text = text.replace('<DRABBREV>', 'dr')
    text = text.replace('<MRABBREV>', 'mr')
    text = text.replace('<MRSABBREV>', 'mrs')
    text = text.replace('<MSABBREV>', 'ms')

    return text

def clean_text_optimized(text):
    if not isinstance(text, str): 
        return ""
    
    # Lowercase
    text = text.lower()

    # Expand contractions
    text = expand_contractions(text)

    # Remove CDATA artifacts
    text = re.sub(r'<!\[cdata\[.*?\]\]>', '', text)
    
    # Remove Reuters/Wire specific patterns
    text = re.sub(r'\bfactbox\b', '', text)
    text = re.sub(r'\breuters\b', '', text)
    text = re.sub(r'\bwire\b', '', text)
    
    # Remove Getty Images and photo credits
    text = re.sub(r'/?getty\s*images?', '', text)
    text = re.sub(r'image\s*via\s*\w+', '', text)
    text = re.sub(r'photo\s*by\s*\w+', '', text)
    text = re.sub(r'photo\s*:\s*\w+', '', text)
    text = re.sub(r'credit\s*:\s*\w+', '', text)
    text = re.sub(r'\bflickr\b', '', text)
    text = re.sub(r'\bafp\b', '', text)
    text = re.sub(r'\bap\s*photo\b', '', text)
    
    # Remove javascript code artifacts
    text = re.sub(r'var\s+.*?;', '', text) 
    text = re.sub(r'\w+\.getelementbyid\(.*?\)', '', text)
    text = re.sub(r'\w+\.createelement\(.*?\)', '', text)
        
    # HTML/URL removal
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'<.*?>', '', text) 
    
    # Remove bracketed content (metadata, captions, citations)
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\[[^\]]*\]', '', text)
    text = re.sub(r'\{[^}]*\}', '', text)
    
    # Handle Numbers
    text = re.sub(r'\d+', ' <NUM> ', text)
    
    # Punctuation (keep only ! ? ...)
    text = re.sub(r'\.\.\.+', ' <ELLIPSIS> ', text)
    text = re.sub(r'\!+', ' <EXCLAIM> ', text)
    text = re.sub(r'\?+', ' <QUESTION> ', text)
    
    text = re.sub(r'[^\w\s<>]', ' ', text)
    
    text = text.replace('<ELLIPSIS>', ' ... ')
    text = text.replace('<EXCLAIM>', ' ! ')
    text = text.replace('<QUESTION>', ' ? ')
    
    # Cleanup whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


df['clean_title'] = df['title'].apply(clean_text_optimized)
df['clean_selftext'] = df['text'].apply(clean_text_optimized)

# These are words that represent technical errors or specific topic biases
bias_stop_words = [
    # 1. JAVASCRIPT/CODE ARTIFACTS
    'getelementbyid', 'createelement', 'getelementsbytagname', 'insertbefore', 
    'parentnode', 'var', 'src', 'jssdk', 'cdata', 'fjs', 'pjs',
    'filessupport', 'acr', 'curator', 'follow', 'pic', 'twitter',
    'featured', 'subscribe', 'subscribing',
    
    # 2. IMAGE/PHOTO CREDITS (DATA LEAKAGE)
    'getty', 'images', 'image', 'via', 'flickr', 'afp', 'photo',
    'somodevilla', 'angerer', 'mcnamee', 'raedle', 'henningsen', 
    'gage', 'helton', 'finicum', 'lavoy', 'hammonds', 'shutterstock',
    
    # 3. NEWS WIRE LEAKAGE
    'reuters', 'factbox', 'tmsnrt', 'wire', 'associated', 'press',
    
    # 4. INTERNATIONAL TOPIC BIAS
    'rakhine', 'zuma', 'puigdemont', 'suu', 'kyi', 'mnangagwa', 'anc', 
    'rajoy', 'odinga', 'kuczynski', 'kurz', 'aung', 'barnier', 'barzani', 
    'ramaphosa', 'babis', 'fpo', 'marawi', 'kem', 'sokha', 'pis', 'koike', 
    'farc', 'schaeuble', 'carles', 'ldp', 'asean', 'lighthizer', 'odebrecht',
    'nasralla', 'cnrp', 'najib', 'cameroon', 'dlamini', 'ano', 'kiir', 
    'pdvsa', 'navalny', 'dup', 'museveni', 'obrador', 'kabila', 'mladic', 
    'harare', 'fujimori', 'yangon',
]


# TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    max_features=50000,
    min_df=10,
    max_df=0.95,
    ngram_range = (1, 3),
    strip_accents='unicode',
    lowercase=False,
    stop_words=bias_stop_words,
    sublinear_tf=True,
    token_pattern=r"(?u)\b\w+\b|<NUM>|!|\?|\.\.\."
)


df['full_text'] = df['clean_title'] + " " + df['clean_selftext']
df = df[df['full_text'].str.strip() != '']  # Remove empty texts

X = vectorizer.fit_transform(df['full_text'])

print(f"Feature matrix shape: {X.shape}")

Cleaning texts...
Vectorizing...
Feature matrix shape: (39100, 50000)


In [19]:
df.to_parquet('../data/data_cleaned_text.parquet', index=False)

In [None]:
# Handle Date Column (Cyclical Encoding)
df['date_obj'] = pd.to_datetime(df['date'], errors='coerce')

df['month'] = df['date_obj'].dt.month
df['day_of_week'] = df['date_obj'].dt.dayofweek

# Sine/Cosine Transformation
# Month (1-12)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
# Day of Week (0-6)
df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

# Scale Numerical Features
num_cols = [
    'repost_count', 
    'text_exclamation_count', 'text_capital_ratio', 'text_avg_sentence_length',
    'title_exclamation_count', 'title_capital_ratio',
    'text_sentiment_polarity', 'text_sentiment_subjectivity',
    'title_sentiment_polarity', 'title_sentiment_subjectivity',
    'text_emotion_score', 'title_emotion_score',
    'month_sin', 'month_cos', 'day_sin', 'day_cos'
]

df[num_cols] = df[num_cols].fillna(0)

scaler = StandardScaler()
X_dense_scaled = scaler.fit_transform(df[num_cols])

X_dense_sparse = csr_matrix(X_dense_scaled)

# One-Hot Encode Categorical Features
cat_cols = ['subject', 'text_emotion', 'title_emotion']

df[cat_cols] = df[cat_cols].fillna('unknown')

encoder = OneHotEncoder(drop='first', sparse_output=True)
X_cat_sparse = encoder.fit_transform(df[cat_cols])

# Combine All Features
X_final = hstack([X, X_dense_sparse, X_cat_sparse])

Original TF-IDF shape: (39100, 50000)
Numerical shape:       (39100, 16)
Categorical shape:     (39100, 13)
Final Combined shape:  (39100, 50029)


In [21]:
X_final

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 17233911 stored elements and shape (39100, 50029)>

In [22]:
import pickle

# Save
with open('../data/X_final.pkl', 'wb') as f:
    pickle.dump(X_final, f)

y = df['label'].values
with open('../data/labels.pkl', 'wb') as f:
    pickle.dump(y, f)

# Save fitted objects for later inference
with open('../data/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('../data/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('../data/encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)


# # Load later
# with open('../data/X_final.pkl', 'rb') as f:
#     X_final = pickle.load(f)

In [9]:
from collections import Counter

fake_words = Counter(" ".join(df[df["label"]==0]["full_text"]).split())
real_words = Counter(" ".join(df[df["label"]==1]["full_text"]).split())

exclusive_fake = set(fake_words) - set(real_words)
exclusive_real = set(real_words) - set(fake_words)

# print("Fake sample words:", list(exclusive_fake)[:20])
# print("Real sample words :", list(exclusive_real)[:20])

print(len(exclusive_fake), "exclusive fake words")
print(len(exclusive_real), "exclusive real words")

42219 exclusive fake words
25178 exclusive real words


In [10]:
# Calculate frequencies of exclusive words
exclusive_fake_counts = {word: fake_words[word] for word in exclusive_fake}
exclusive_real_counts = {word: real_words[word] for word in exclusive_real}

# Sort by frequency (High to Low)
sorted_fake_exclusive = sorted(exclusive_fake_counts.items(), key=lambda x: x[1], reverse=True)
sorted_real_exclusive = sorted(exclusive_real_counts.items(), key=lambda x: x[1], reverse=True)

print("Top 50 Exclusive FAKE words:")
print(sorted_fake_exclusive[:50])

print("\nTop 50 Exclusive REAL words:")
print(sorted_real_exclusive[:50])

Top 50 Exclusive FAKE words:
[('wire', 1003), ('somodevilla', 442), ('filessupport', 297), ('mcnamee', 270), ('angerer', 264), ('finicum', 254), ('hilariously', 248), ('cking', 244), ('henningsen', 224), ('acr', 219), ('raedle', 186), ('whined', 182), ('wfb', 164), ('subscribing', 146), ('nyp', 146), ('behar', 143), ('blacklivesmatter', 137), ('hissy', 136), ('watters', 126), ('hesher', 125), ('camerota', 123), ('sarahpalinusa', 120), ('gitmo', 118), ('hammonds', 117), ('beyonc', 116), ('olbermann', 115), ('pirro', 114), ('stelter', 114), ('pee', 114), ('lavoy', 110), ('bullsh', 110), ('wikimedia', 110), ('cher', 109), ('philosophers', 107), ('uninterruptible', 107), ('dobbs', 105), ('spore', 104), ('masochists', 103), ('savants', 103), ('evangelists', 103), ('rascals', 103), ('moralists', 103), ('pizzagate', 101), ('gt', 99), ('wannabe', 95), ('flashback', 92), ('neocon', 92), ('wnd', 92), ('rino', 91), ('whoopi', 89)]

Top 50 Exclusive REAL words:
[('rakhine', 908), ('zuma', 692), ('

In [10]:
df.head()

Unnamed: 0,title,text,subject,date,label,repost_count,text_exclamation_count,text_capital_ratio,text_avg_sentence_length,title_exclamation_count,...,clean_title,clean_selftext,full_text,date_obj,month,day_of_week,month_sin,month_cos,day_sin,day_cos
0,Obama says must change the way nation manages ...,President Barack Obama on Tuesday said he woul...,Politics,2016-01-13,1,0,0,0.031654,23.5,0,...,obama says must change the way nation manages ...,president barack obama on tuesday said he woul...,obama says must change the way nation manages ...,2016-01-13,1.0,2.0,0.5,0.866025,0.974928,-0.222521
1,Obama says too many Americans feel 'the system...,President Barack Obama called on Tuesday for A...,Politics,2016-01-13,1,0,0,0.02008,25.5,0,...,obama says too many americans feel ' the syste...,president barack obama called on tuesday for a...,obama says too many americans feel ' the syste...,2016-01-13,1.0,2.0,0.5,0.866025,0.974928,-0.222521
2,Biden: Sanders has 'struck a chord with voters...,U.S. Vice President Joe Biden praised Democrat...,Politics,2016-01-13,1,0,0,0.046058,16.727273,0,...,biden : sanders has ' struck a chord with vote...,us . vice president joe biden praised democrat...,biden : sanders has ' struck a chord with vote...,2016-01-13,1.0,2.0,0.5,0.866025,0.974928,-0.222521
3,South Korea president says acquiring nukes wou...,If South Korea develops nuclear weapons in res...,Politics,2016-01-13,1,0,0,0.068536,16.5,0,...,south korea president says acquiring nukes wou...,if south korea develops nuclear weapons in res...,south korea president says acquiring nukes wou...,2016-01-13,1.0,2.0,0.5,0.866025,0.974928,-0.222521
4,South Korea president says China response to N...,China should play a key and proper role in res...,Politics,2016-01-13,1,0,0,0.056075,31.0,0,...,south korea president says china response to n...,china should play a key and proper role in res...,south korea president says china response to n...,2016-01-13,1.0,2.0,0.5,0.866025,0.974928,-0.222521


In [36]:
df['text_emotion'].value_counts()

text_emotion
fear        14782
anger        8913
neutral      8110
sadness      2645
disgust      1959
joy          1412
surprise     1284
Name: count, dtype: int64

In [23]:
df.columns

Index(['title', 'text', 'subject', 'date', 'label', 'repost_count',
       'text_exclamation_count', 'text_capital_ratio',
       'text_avg_sentence_length', 'title_exclamation_count',
       'title_capital_ratio', 'text_sentiment_polarity',
       'text_sentiment_subjectivity', 'title_sentiment_polarity',
       'title_sentiment_subjectivity', 'text_emotion', 'text_emotion_score',
       'title_emotion', 'title_emotion_score', 'clean_title', 'clean_selftext',
       'full_text'],
      dtype='object')