data_preprocessing.py

# Import necessary libraries


In [1]:
import pandas as pd

# Load the datasets

In [2]:
fake_path = "data/fake.csv"
true_path = "data/true.csv"
WELF_dataset_path = "data/WELFake_Dataset.csv"

fake_df = pd.read_csv(fake_path)
true_df = pd.read_csv(true_path)
WELF_dataset = pd.read_csv(WELF_dataset_path)

# Add label column: Fake = 1, True = 0

In [3]:
fake_df['label'] = 1
true_df['label'] = 0

# Remove documents that dont have a text from the WELF dataset

In [4]:
WELF_dataset = WELF_dataset.dropna(subset=['text'])

# Leave only the title and text and label columns

In [5]:
fake_df = fake_df[['title', 'text', 'label']]
true_df = true_df[['title', 'text', 'label']]
WELF_dataset = WELF_dataset[['title', 'text', 'label']]

# Combine datasets

In [6]:
df = pd.concat([fake_df, true_df,WELF_dataset], axis=0).reset_index(drop=True)

# Step 1: Data Exploration

In [7]:
print("Dataset Overview:")
print(df.head())

Dataset Overview:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text  label  
0  Donald Trump just couldn t wish all Americans ...      1  
1  House Intelligence Committee Chairman Devin Nu...      1  
2  On Friday, it was revealed that former Milwauk...      1  
3  On Christmas day, Donald Trump announced that ...      1  
4  Pope Francis used his annual Christmas Day mes...      1  


In [8]:
print("\nClass Distribution:")
print(df['label'].value_counts())


Class Distribution:
label
1    60548
0    56445
Name: count, dtype: int64


In [9]:
# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
title    558
text       0
label      0
dtype: int64


In [10]:
print(df.tail(10))

                                                    title  \
116983  An Unlikely Contender Rises in France as the A...   
116984  WOW! JILL STEIN’S ‘FIRESIDE CHAT’ Exposes Her ...   
116985  Determined to kill: Can tough gun laws end mas...   
116986  WIKILEAKS EMAIL SHOWS CLINTON FOUNDATION FUNDS...   
116987  JUDGE JEANINE SOUNDS FREE SPEECH ALARM: “They ...   
116988  Russians steal research on Trump in hack of U....   
116989   WATCH: Giuliani Demands That Democrats Apolog...   
116990  Migrants Refuse To Leave Train At Refugee Camp...   
116991  Trump tussle gives unpopular Mexican leader mu...   
116992  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                     text  label  
116983  PARIS  —   In the age of Donald J. Trump, “Bre...      0  
116984                                                         1  
116985  The flag at Desert Hot Springs' Condor Gun Sho...      0  
116986  An email released by WikiLeaks on Sunday appea...   

In [11]:
# Remove the specific word or phrase "(Reuters)" from the text column
df['text'] = df['text'].str.replace(r'^[^(]*\(Reuters\)\s*-\s*', '', regex=True)
print(df.tail(10))

                                                    title  \
116983  An Unlikely Contender Rises in France as the A...   
116984  WOW! JILL STEIN’S ‘FIRESIDE CHAT’ Exposes Her ...   
116985  Determined to kill: Can tough gun laws end mas...   
116986  WIKILEAKS EMAIL SHOWS CLINTON FOUNDATION FUNDS...   
116987  JUDGE JEANINE SOUNDS FREE SPEECH ALARM: “They ...   
116988  Russians steal research on Trump in hack of U....   
116989   WATCH: Giuliani Demands That Democrats Apolog...   
116990  Migrants Refuse To Leave Train At Refugee Camp...   
116991  Trump tussle gives unpopular Mexican leader mu...   
116992  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                     text  label  
116983  PARIS  —   In the age of Donald J. Trump, “Bre...      0  
116984                                                         1  
116985  The flag at Desert Hot Springs' Condor Gun Sho...      0  
116986  An email released by WikiLeaks on Sunday appea...   

# Step 2: Data Cleaning

In [12]:
df = df.drop_duplicates()

# Fill missing text with an empty string and convert to lowercase
df['text'] = df['text'].fillna('')

In [13]:
# Optional: Remove punctuation and stopwords (example with nltk)
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# import string
# stop_words = set(stopwords.words('english'))
# df['text'] = df['text'].apply(lambda x: ' '.join(
#     [word for word in word_tokenize(x) if word not in stop_words and word not in string.punctuation]
# ))


# Step 3: Train-Test Split
train - 70%, validation - 15%, test - 15%


In [14]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.15, random_state=42, stratify=df['label'])
train_data, val_data = train_test_split(train_data, test_size=0.1765, random_state=42, stratify=train_data['label'])

In [15]:
# Check the sizes
print(f"\nTrain size: {len(train_data)}, Validation size: {len(val_data)}, Test size: {len(test_data)}")


Train size: 44545, Validation size: 9548, Test size: 9546


In [16]:
# Save the splits
train_data.to_csv("train.csv", index=False)
val_data.to_csv("val.csv", index=False)
test_data.to_csv("test.csv", index=False)