data_preprocessing.py

# Import necessary libraries


In [1]:
import pandas as pd

# Load the datasets

In [2]:
# Locations of the two raw CSV files, relative to the notebook's working directory.
fake_path, true_path = (
    "data/original_data/fake.csv",
    "data/original_data/true.csv",
)

# Read each file into its own DataFrame so the two sources can be labelled
# separately before they are combined.
fake_df, true_df = (pd.read_csv(csv_path) for csv_path in (fake_path, true_path))

# Add label column: Fake = 1, True = 0

In [3]:
# Tag every row with its class: 1 for rows from the fake file, 0 for rows
# from the true file. The column is written onto the existing frames.
for source_frame, class_id in ((fake_df, 1), (true_df, 0)):
    source_frame['label'] = class_id

# Combine datasets

In [4]:
# Stack the fake rows on top of the true rows; ignore_index=True discards the
# per-file row labels and gives the combined frame a fresh 0..n-1 index.
df = pd.concat([fake_df, true_df], ignore_index=True)

# Step 1: Data Exploration

In [5]:
# Show a heading followed by the first five rows of the combined frame.
print("Dataset Overview:", df.head(), sep="\n")

Dataset Overview:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  label  
0  December 31, 2017      1  
1  December 31, 2017      1  
2  December 30, 2017      1  
3  December 29, 2017      1  
4  December 25, 2017      1  


In [6]:
# Count how many rows carry each label value (1 = fake, 0 = true).
rows_per_label = df['label'].value_counts()

print("\nClass Distribution:")
print(rows_per_label)


Class Distribution:
label
1    23481
0    21417
Name: count, dtype: int64


In [7]:
# Count the null entries in each column of the combined frame.
nulls_per_column = df.isna().sum()

print("\nMissing Values:")
print(nulls_per_column)


Missing Values:
title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [8]:
# Inspect the last ten rows of the combined frame (the end of the true-news part).
print(df.iloc[-10:])

                                                   title  \
44888  Mata Pires, owner of embattled Brazil builder ...   
44889  U.S., North Korea clash at U.N. forum over nuc...   
44890  U.S., North Korea clash at U.N. arms forum on ...   
44891  Headless torso could belong to submarine journ...   
44892  North Korea shipments to Syria chemical arms a...   
44893  'Fully committed' NATO backs new U.S. approach...   
44894  LexisNexis withdrew two products from Chinese ...   
44895  Minsk cultural hub becomes haven from authorities   
44896  Vatican upbeat on possibility of Pope Francis ...   
44897  Indonesia to buy $1.14 billion worth of Russia...   

                                                    text    subject  \
44888  SAO PAULO (Reuters) - Cesar Mata Pires, the ow...  worldnews   
44889  GENEVA (Reuters) - North Korea and the United ...  worldnews   
44890  GENEVA (Reuters) - North Korea and the United ...  worldnews   
44891  COPENHAGEN (Reuters) - Danish police said on T..

In [9]:
# Strip the leading agency dateline from each article body. The pattern is
# anchored at the start of the text and consumes any run of non-"(" characters,
# the literal "(Reuters)", and the dash that follows with its surrounding
# whitespace -- e.g. "GENEVA (Reuters) - North Korea ..." becomes
# "North Korea ...". Texts that do not start this way are left unchanged.
# NOTE(review): presumably done so a model cannot key on the agency tag -- confirm.
reuters_dateline = r'^[^(]*\(Reuters\)\s*-\s*'
df['text'] = df['text'].str.replace(reuters_dateline, '', regex=True)

# Re-display the last ten rows to confirm the prefix is gone.
print(df.tail(10))

                                                   title  \
44888  Mata Pires, owner of embattled Brazil builder ...   
44889  U.S., North Korea clash at U.N. forum over nuc...   
44890  U.S., North Korea clash at U.N. arms forum on ...   
44891  Headless torso could belong to submarine journ...   
44892  North Korea shipments to Syria chemical arms a...   
44893  'Fully committed' NATO backs new U.S. approach...   
44894  LexisNexis withdrew two products from Chinese ...   
44895  Minsk cultural hub becomes haven from authorities   
44896  Vatican upbeat on possibility of Pope Francis ...   
44897  Indonesia to buy $1.14 billion worth of Russia...   

                                                    text    subject  \
44888  Cesar Mata Pires, the owner and co-founder of ...  worldnews   
44889  North Korea and the United States clashed at a...  worldnews   
44890  North Korea and the United States accused each...  worldnews   
44891  Danish police said on Tuesday the size of a he..

# Step 2: Data Cleaning

In [10]:
# Drop rows that are exact duplicates across every column, then replace any
# missing article body with an empty string.
# NOTE(review): no case normalisation (lowercasing) is applied to the text
# here -- confirm whether downstream steps expect lowercased input.
df = (
    df
    .drop_duplicates()
    .assign(text=lambda frame: frame['text'].fillna(''))
)

In [11]:
# Optional: Remove punctuation and stopwords (example with nltk)
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# import string
# stop_words = set(stopwords.words('english'))
# df['text'] = df['text'].apply(lambda x: ' '.join(
#     [word for word in word_tokenize(x) if word not in stop_words and word not in string.punctuation]
# ))


# Step 3: Train-Test Split


In [12]:
from sklearn.model_selection import train_test_split

# Two-stage stratified split with a fixed seed so the partition is repeatable:
#   1. hold out 30% of all rows as the test set;
#   2. carve 20% of the remaining rows off as the validation set.
# Overall this is roughly 56% train / 14% validation / 30% test, and
# stratifying on 'label' keeps the class proportions similar in every part.
split_seed = 42

train_val_data, test_data = train_test_split(
    df,
    test_size=0.3,
    random_state=split_seed,
    stratify=df['label'],
)
train_data, val_data = train_test_split(
    train_val_data,
    test_size=0.2,
    random_state=split_seed,
    stratify=train_val_data['label'],
)

In [13]:
# Report the number of rows that ended up in each partition.
split_report = f"\nTrain size: {len(train_data)}, Validation size: {len(val_data)}, Test size: {len(test_data)}"
print(split_report)


Train size: 25025, Validation size: 6257, Test size: 13407


In [14]:
# Persist each partition as a CSV in the current working directory (optional).
# index=False leaves the DataFrame row index out of the written files.
for csv_name, split_frame in (
    ("train.csv", train_data),
    ("val.csv", val_data),
    ("test.csv", test_data),
):
    split_frame.to_csv(csv_name, index=False)