data_preprocessing.py

# Import necessary libraries


In [1]:
import pandas as pd

# Load the datasets

In [2]:
fake_path = "data/original_data/fake.csv"
true_path = "data/original_data/true.csv"
WELF_dataset_path = "data/original_data/WELFake_Dataset.csv"

fake_df = pd.read_csv(fake_path)
true_df = pd.read_csv(true_path)
WELF_dataset = pd.read_csv(WELF_dataset_path)

# Add label column: Fake = 0, True = 1

In [3]:
fake_df['label'] = 0
true_df['label'] = 1

# Remove documents that dont have a text from the WELF dataset

In [4]:
WELF_dataset = WELF_dataset.dropna(subset=['text'])

rearrange the labels in WELF to be in sync with our dataset

In [5]:
WELF_dataset['label'] = 1 - WELF_dataset['label']

# Leave only the title and text and label columns

In [6]:
fake_df = fake_df[['title', 'text', 'label']]
true_df = true_df[['title', 'text', 'label']]
WELF_dataset = WELF_dataset[['title', 'text', 'label']]

check how many documents are in the WELF dataset

In [7]:
print("size of WELF dataset: ", len(WELF_dataset))
print("size of fake dataset: ", len(fake_df))
print("size of true dataset: ", len(true_df))

size of WELF dataset:  72095
size of fake dataset:  23481
size of true dataset:  21417


check how many fake and true documents are in the WELF dataset

In [8]:
print("number of fake in WELF: ", len(WELF_dataset[WELF_dataset['label'] == 0]))
print("number of true in WELF: ", len(WELF_dataset[WELF_dataset['label'] == 1]))

number of fake in WELF:  37067
number of true in WELF:  35028


# Data Cleaning and analysis


check for each dataset how many times the frase (Reuters) appears in the text column for each label 

In [9]:
print("number of (Reuters) in true: ", true_df['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*').sum())
print("number of (Reuters) in fake: ", fake_df['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*').sum())
print("number of (Reuters) in WELF 'true' label: ", WELF_dataset[WELF_dataset['label'] == 1]['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*').sum())
print("number of (Reuters) in WELF 'fake' label: ", WELF_dataset[WELF_dataset['label'] == 0]['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*').sum())

number of (Reuters) in true:  21246
number of (Reuters) in fake:  0
number of (Reuters) in WELF 'true' label:  21256
number of (Reuters) in WELF 'fake' label:  0


remove (Reuters) from the text column in all the datasets

In [10]:
true_df['text'] = true_df['text'].str.replace(r'^.*?\(Reuters\)\s*-\s*', '', regex=True)
fake_df['text'] = fake_df['text'].str.replace(r'^.*?\(Reuters\)\s*-\s*', '', regex=True)
WELF_dataset['text'] = WELF_dataset['text'].str.replace(r'^.*?\(Reuters\)\s*-\s*', '', regex=True)

#print out the number of (Reuters) in the text column for each label
print("number of (Reuters) in true: ", true_df['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*').sum())
print("number of (Reuters) in fake: ", fake_df['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*').sum())
print("number of (Reuters) in WELF 'true' label: ", WELF_dataset[WELF_dataset['label'] == 1]['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*').sum())
print("number of (Reuters) in WELF 'fake' label: ", WELF_dataset[WELF_dataset['label'] == 0]['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*').sum())

number of (Reuters) in true:  1
number of (Reuters) in fake:  0
number of (Reuters) in WELF 'true' label:  1
number of (Reuters) in WELF 'fake' label:  0


conclusion: there is atext with the frase (Reuters) twice!
print the entry where the remaining (Reuters) is in the text column

In [11]:
print(true_df[true_df['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*')])
print(WELF_dataset[WELF_dataset['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*')])

                                                  title  \
5882  The Trump presidency on Jan 29 at 4:12 P.M. ES...   

                                                   text  label  
5882  Jan 29 (Reuters) - Highlights of the day for U...      1  
                                                   title  \
26543  The Trump presidency on Jan 29 at 4:12 P.M. ES...   

                                                    text  label  
26543  Jan 29 (Reuters) - Highlights of the day for U...      1  


remove the final (Reuters) from the text column in all the datasets

In [12]:
true_df['text'] = true_df['text'].str.replace(r'^.*?\(Reuters\)\s*-\s*', '', regex=True)
fake_df['text'] = fake_df['text'].str.replace(r'^.*?\(Reuters\)\s*-\s*', '', regex=True)
WELF_dataset['text'] = WELF_dataset['text'].str.replace(r'^.*?\(Reuters\)\s*-\s*', '', regex=True)

#print out the number of (Reuters) in the text column for each label
print("number of (Reuters) in true: ", true_df['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*').sum())
print("number of (Reuters) in fake: ", fake_df['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*').sum())
print("number of (Reuters) in WELF 'true' label: ", WELF_dataset[WELF_dataset['label'] == 1]['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*').sum())
print("number of (Reuters) in WELF 'fake' label: ", WELF_dataset[WELF_dataset['label'] == 0]['text'].str.contains(r'^.*?\(Reuters\)\s*-\s*').sum())

number of (Reuters) in true:  0
number of (Reuters) in fake:  0
number of (Reuters) in WELF 'true' label:  0
number of (Reuters) in WELF 'fake' label:  0


Look for more frases that are exclusive to one of the labels

In [13]:
all_fake_word_check = pd.concat( [fake_df, WELF_dataset[WELF_dataset['label'] == 0]], axis=0).reset_index(drop=True)
all_true_word_check = pd.concat( [true_df, WELF_dataset[WELF_dataset['label'] == 1]], axis=0).reset_index(drop=True)

In [14]:
print("length of all_fake_word_check: ", len(all_fake_word_check))
print("length of all_true_word_check: ", len(all_true_word_check))

length of all_fake_word_check:  60548
length of all_true_word_check:  56445


In [15]:
#remove duplicates from the datasets
all_fake_word_check = all_fake_word_check.drop_duplicates()
all_true_word_check = all_true_word_check.drop_duplicates()


In [16]:
print("length of all_fake_word_check: ", len(all_fake_word_check))
print("length of all_true_word_check: ", len(all_true_word_check))

length of all_fake_word_check:  28848
length of all_true_word_check:  34791


In [17]:
from collections import Counter
import re
import numpy as np
from typing import Dict, List, Tuple
import pandas as pd

def preprocess_text(text: str, keep_symbols: bool = True) -> List[str]:
    """
    Preprocess text while preserving multilingual characters and optional symbols.
    
    Parameters:
        text: Input text
        keep_symbols: If True, keeps emoji and special symbols
    """
    if not isinstance(text, str):
        return []

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    if keep_symbols:
        # Keep unicode characters (including emojis) and letters, remove unwanted characters
        # This pattern keeps all Unicode letters, numbers, emojis, and common symbols
        text = re.sub(r'[^\w\s\u0080-\u10FFFF\']+', ' ', text)
    else:
        # Remove special characters but keep apostrophes for contractions
        text = re.sub(r'[^a-zA-Z\']+', ' ', text)

    # Remove single quotes if they're not part of contractions
    text = re.sub(r'\s\'|\'\s|^\'|\'$', ' ', text)

    # Split into words and remove empty strings
    words = [word.strip() for word in text.split()]

    # Remove single-character words and empty strings
    words = [word for word in words if len(word) > 1]

    return words

def analyze_word_frequencies(fake_df: pd.DataFrame, true_df: pd.DataFrame,
                             text_column: str, min_frequency: int = 2,
                             keep_symbols: bool = True) -> Tuple[Dict, Dict, Dict, Dict]:
    """
    Analyze word frequencies in fake and true news articles.
    Returns word frequencies and article counts for both categories.
    """
    # Initialize Counters for word frequencies and article appearances
    fake_vocab = Counter()
    true_vocab = Counter()
    fake_article_counts = Counter()
    true_article_counts = Counter()

    # Process fake news
    for _, row in fake_df.iterrows():
        text = row[text_column]
        words = preprocess_text(text, keep_symbols)
        # Update total word frequency
        fake_vocab.update(words)
        # Update article counts (count each word only once per article)
        fake_article_counts.update(set(words))

    # Process true news
    for _, row in true_df.iterrows():
        text = row[text_column]
        words = preprocess_text(text, keep_symbols)
        # Update total word frequency
        true_vocab.update(words)
        # Update article counts (count each word only once per article)
        true_article_counts.update(set(words))

    # Find exclusive words (filtering by minimum frequency)
    fake_exclusive = {}
    true_exclusive = {}

    for word, count in fake_vocab.items():
        if word not in true_vocab and count >= min_frequency:
            fake_exclusive[word] = {
                'total_frequency': count,
                'article_count': fake_article_counts[word],
                'article_percentage': (fake_article_counts[word] / len(fake_df)) * 100
            }

    for word, count in true_vocab.items():
        if word not in fake_vocab and count >= min_frequency:
            true_exclusive[word] = {
                'total_frequency': count,
                'article_count': true_article_counts[word],
                'article_percentage': (true_article_counts[word] / len(true_df)) * 100
            }

    # Sort by total frequency
    fake_exclusive_sorted = dict(sorted(fake_exclusive.items(),
                                        key=lambda x: x[1]['total_frequency'],
                                        reverse=True))
    true_exclusive_sorted = dict(sorted(true_exclusive.items(),
                                        key=lambda x: x[1]['total_frequency'],
                                        reverse=True))

    return fake_vocab, true_vocab, fake_exclusive_sorted, true_exclusive_sorted

def print_analysis_results(fake_vocab: Dict, true_vocab: Dict,
                           fake_exclusive: Dict, true_exclusive: Dict,
                           n_words: int = 10):
    """
    Print detailed analysis results including article counts.
    """
    print("\nVocabulary Statistics:")
    print(f"Total unique words in fake news vocabulary: {len(fake_vocab):,}")
    print(f"Total unique words in true news vocabulary: {len(true_vocab):,}")
    print(f"Exclusive words in fake news: {len(fake_exclusive):,}")
    print(f"Exclusive words in true news: {len(true_exclusive):,}")

    print("\nTop exclusive words in fake news:")
    for word, stats in list(fake_exclusive.items())[:n_words]:
        print(f"'{word}': {stats['total_frequency']:,} total occurrences, "
              f"appears in {stats['article_count']:,} articles "
              f"({stats['article_percentage']:.1f}% of fake news articles)")

    print("\nTop exclusive words in true news:")
    for word, stats in list(true_exclusive.items())[:n_words]:
        print(f"'{word}': {stats['total_frequency']:,} total occurrences, "
              f"appears in {stats['article_count']:,} articles "
              f"({stats['article_percentage']:.1f}% of true news articles)")

def analyze_specific_terms(fake_df: pd.DataFrame, true_df: pd.DataFrame,
                           text_column: str, terms: List[str]):
    """
    Analyze frequency of specific terms in both datasets.
    """
    for term in terms:
        # Count occurrences
        true_count = true_df[text_column].str.count(term).sum()
        fake_count = fake_df[text_column].str.count(term).sum()

        # Count articles containing the term
        true_articles = true_df[text_column].str.contains(term, case=False).sum()
        fake_articles = fake_df[text_column].str.contains(term, case=False).sum()

        true_percentage = (true_articles / len(true_df)) * 100
        fake_percentage = (fake_articles / len(fake_df)) * 100

        print(f"\nTerm '{term}':")
        print(f"True news: {true_count:,} total occurrences, "
              f"appears in {true_articles:,} articles "
              f"({true_percentage:.2f}% of true news articles)")
        print(f"Fake news: {fake_count:,} total occurrences, "
              f"appears in {fake_articles:,} articles "
              f"({fake_percentage:.2f}% of fake news articles)")

# Example usage:
if __name__ == "__main__":
    # Assuming fake_df and true_df are your separated DataFrames
    fake_vocab, true_vocab, fake_exclusive, true_exclusive = analyze_word_frequencies(
        all_fake_word_check, all_true_word_check,
        text_column='text',
        min_frequency=2,
        keep_symbols=True  # Set to True to keep emojis and special characters
    )

    # Print general analysis
    print_analysis_results(fake_vocab, true_vocab, fake_exclusive, true_exclusive)

    # Analyze specific terms (can include emojis and non-English terms)
    terms_of_interest = ['trump', 'biden', 'covid', 'vaccine', '😊', 'señor', '中国']
    analyze_specific_terms(all_fake_word_check, all_true_word_check, 'text', terms_of_interest)


Vocabulary Statistics:
Total unique words in fake news vocabulary: 185,021
Total unique words in true news vocabulary: 127,253
Exclusive words in fake news: 38,083
Exclusive words in true news: 24,928

Top exclusive words in fake news:
'что': 1,423 total occurrences, appears in 148 articles (0.5% of fake news articles)
'не': 1,170 total occurrences, appears in 137 articles (0.5% of fake news articles)
'21wire': 1,117 total occurrences, appears in 552 articles (1.9% of fake news articles)
'quot': 1,012 total occurrences, appears in 13 articles (0.0% of fake news articles)
'по': 766 total occurrences, appears in 149 articles (0.5% of fake news articles)
'это': 727 total occurrences, appears in 101 articles (0.4% of fake news articles)
'как': 603 total occurrences, appears in 133 articles (0.5% of fake news articles)
'то': 482 total occurrences, appears in 105 articles (0.4% of fake news articles)
'somodevilla': 481 total occurrences, appears in 480 articles (1.7% of fake news articles)


# next part

# Combine datasets

In [18]:
df = pd.concat([fake_df, true_df,WELF_dataset], axis=0).reset_index(drop=True)

In [19]:
#remove duplicates from the datasets
df = df.drop_duplicates()

# Step 1: Data Exploration

In [20]:
print("Dataset Overview:")
print(df.head())

Dataset Overview:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text  label  
0  Donald Trump just couldn t wish all Americans ...      0  
1  House Intelligence Committee Chairman Devin Nu...      0  
2  On Friday, it was revealed that former Milwauk...      0  
3  On Christmas day, Donald Trump announced that ...      0  
4  Pope Francis used his annual Christmas Day mes...      0  


In [21]:
print("\nClass Distribution:")
print(df['label'].value_counts())


Class Distribution:
label
1    34791
0    28848
Name: count, dtype: int64


In [22]:
# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
title    518
text       0
label      0
dtype: int64


In [23]:
print(df.tail(10))

                                                    title  \
116961  Review: ‘Rogue One’ Leaves ‘Star Wars’ Fans Wa...   
116963  Physician Aid in Dying Gains Acceptance in the...   
116966             Letting Go Of Old Patterns Of Reaction   
116967  Kris Kobach: Democrats Already Attacking Elect...   
116976  Tested by Russia, NATO Struggles to Stay Credi...   
116982  Dallas, Roger Federer, Hillary Clinton: Your F...   
116983  An Unlikely Contender Rises in France as the A...   
116985  Determined to kill: Can tough gun laws end mas...   
116990  Migrants Refuse To Leave Train At Refugee Camp...   

                                                     text  label  
116961  The great mystery of “Rogue One”  —   the big ...      1  
116963  Judith Katherine Dunning had been waiting anxi...      1  
116966  Leave a reply \nMary O’Malley – A friend of mi...      0  
116967    Kris Kobach of the Presidential Advisory Com...      1  
116976  BRUSSELS  —   Six weeks before a critical summ

# Step 2: Data Cleaning

In [24]:
df = df.drop_duplicates()

# Fill missing text with an empty string and convert to lowercase
df['text'] = df['text'].fillna('')

In [25]:
# Optional: Remove punctuation and stopwords (example with nltk)
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# import string
# stop_words = set(stopwords.words('english'))
# df['text'] = df['text'].apply(lambda x: ' '.join(
#     [word for word in word_tokenize(x) if word not in stop_words and word not in string.punctuation]
# ))


# Step 3: Train-Test Split
train - 70%, validation - 15%, test - 15%


In [26]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.15, random_state=42, stratify=df['label'])
train_data, val_data = train_test_split(train_data, test_size=0.1765, random_state=42, stratify=train_data['label'])

In [27]:
# Check the sizes
print(f"\nTrain size: {len(train_data)}, Validation size: {len(val_data)}, Test size: {len(test_data)}")


Train size: 44545, Validation size: 9548, Test size: 9546


In [28]:
# Save the splits
train_data.to_csv("train.csv", index=False)
val_data.to_csv("val.csv", index=False)
test_data.to_csv("test.csv", index=False)