# Data Preprocessing

This notebook performs data cleaning and text preprocessing on raw news articles to prepare them for feature extraction and model training.

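The preprocessing pipeline focuses on noise removal, text normalization, and consistency across the dataset. Concretely, it labels and merges the fake and real article sets, shuffles them, lowercases the text, strips bracketed spans, URLs, HTML tags, punctuation, and tokens containing digits, and finally splits the result into training and test sets.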


In [1]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
# Read the two raw article dumps; paths are relative to the notebook's folder.
fake_csv_path = "../data/Fake.csv"
real_csv_path = "../data/True.csv"

df_fake = pd.read_csv(fake_csv_path)
df_real = pd.read_csv(real_csv_path)

In [3]:
# Attach the target class to each source frame: 0 for the fake set, 1 for the real set.
for frame, label_value in ((df_fake, 0), (df_real, 1)):
    frame["label"] = label_value

In [4]:
# Stack the fake and real articles into a single frame, then shuffle the rows so
# the two classes are interleaved rather than stored back to back.
# random_state pins the shuffle so a fresh Restart & Run All reproduces the same
# row order (42 matches the seed used for the train/test split in this notebook).
data = pd.concat([df_fake, df_real])
data = data.sample(frac=1, random_state=42)
# Discard the per-file indices carried over from the source frames.
data = data.reset_index(drop=True)


In [5]:
# Keep only the article body and its class label; every other column is dropped.
kept_columns = ["text", "label"]
data = data[kept_columns]

In [6]:
def clean_text(text):
    """Normalize one raw article string for downstream feature extraction.

    Steps, in order: lowercase; drop square-bracketed spans; drop URLs
    (``http(s)://...`` and ``www....``); drop HTML-like tags; drop ASCII
    punctuation; turn newlines into spaces; drop every token that contains
    a digit. Whitespace is not collapsed, so removed spans may leave
    consecutive spaces behind.

    Args:
        text: The raw article text. Non-string values (e.g. ``None`` or a
            float ``NaN`` standing in for a missing cell) are treated as
            empty text.

    Returns:
        The cleaned, lowercased string, or ``""`` for non-string input.
    """
    # Missing values have no .lower(); map them to empty text instead of
    # raising in the middle of a column-wide apply.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # Replace (rather than delete) newlines so the last word of one line and
    # the first word of the next remain separate tokens.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [7]:
# Run the cleaner over every article body and write the result back in place.
cleaned_text = data["text"].apply(clean_text)
data["text"] = cleaned_text

In [8]:
# Class balance check: number of rows per label value.
label_counts = data["label"].value_counts()
label_counts

label
0    23481
1    21417
Name: count, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

# Features are the cleaned article text; the target is the class label.
X, y = data["text"], data["label"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)