In [1]:
pip install pandas nltk


Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nishc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nishc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nishc\AppData\Roaming\nltk_data...


True

In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:

RAW_PATH = "data/FakeNewsNet.csv"   # adjust path if needed
df = pd.read_csv(RAW_PATH)

print("Shape before cleaning:", df.shape)
df.head()


Shape before cleaning: (23196, 5)


Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [None]:

df = df[['title', 'real', 'tweet_num']].copy()
df.rename(columns={'real':'label'}, inplace=True)  # 1 = real, 0 = fake

print("Nulls before:", df.isnull().sum().to_dict())
print("Duplicates before:", df.duplicated().sum())
df.sample(3)


Nulls before: {'title': 0, 'label': 0, 'tweet_num': 0}
Duplicates before: 170


Unnamed: 0,title,label,tweet_num
5034,Eva Mendes Tells All About Being A Working Mom...,1,68
9764,Behind The Scenes From Matt Hunter and Lele Po...,1,43
16092,All the Surprise Songs Taylor Swift Has Perfor...,0,5


In [None]:

df = df.dropna(subset=['title', 'label'])
df = df.drop_duplicates()

print("Nulls after:", df.isnull().sum().to_dict())
print("Duplicates after:", df.duplicated().sum())
print("Shape after basic cleaning:", df.shape)
df['label'].value_counts()


Nulls after: {'title': 0, 'label': 0, 'tweet_num': 0}
Duplicates after: 0
Shape after basic cleaning: (23026, 3)


label
1    17385
0     5641
Name: count, dtype: int64

In [None]:

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text: str) -> str:
    text = str(text).lower()
    text = re.sub(r'http\S+|www\.\S+', ' ', text)   # remove URLs
    text = re.sub(r'<.*?>', ' ', text)              # remove HTML
    text = re.sub(r'[@#]\w+', ' ', text)            # remove @mentions & #hashtags
    text = re.sub(r'[^a-z\s]', ' ', text)           # keep only letters
    tokens = []
    for w in text.split():
        if w in stop_words or len(w) < 3:
            continue
        tokens.append(lemmatizer.lemmatize(w))
    return " ".join(tokens)


In [None]:

df['clean_title'] = df['title'].apply(clean_text)

# Drop empty rows after cleaning
empty_mask = df['clean_title'].str.len() == 0
print("Empty rows after cleaning:", int(empty_mask.sum()))
df = df[~empty_mask].reset_index(drop=True)

df[['title','clean_title','label']].head(10)


Empty rows after cleaning: 9


Unnamed: 0,title,clean_title,label
0,Kandi Burruss Explodes Over Rape Accusation on...,kandi burruss explodes rape accusation real ho...,1
1,People's Choice Awards 2018: The best red carp...,people choice award best red carpet look,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,sophia bush sends sweet birthday message one t...,1
3,Colombian singer Maluma sparks rumours of inap...,colombian singer maluma spark rumour inappropr...,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,gossip girl year later upper east siders shock...,1
5,Gwen Stefani Got Dumped by Blake Shelton Over ...,gwen stefani got dumped blake shelton jealousy...,0
6,Broward County Sheriff Fired For Lying About P...,broward county sheriff fired lying parkland,0
7,Amber Rose Shuts Down French Montana Dating Ru...,amber rose shuts french montana dating rumor c...,0
8,Mindy Kaling makes first post-baby appearance ...,mindy kaling make first post baby appearance d...,1
9,Katharine McPhee Butchers Tony Nominations: “I...,katharine mcphee butcher tony nomination drinking,1


In [12]:

print("Final shape:", df.shape)
print("Label distribution:\n", df['label'].value_counts(normalize=True).round(3))
print("Avg. words per title:", df['clean_title'].str.split().str.len().mean())


Final shape: (23017, 4)
Label distribution:
 label
1    0.755
0    0.245
Name: proportion, dtype: float64
Avg. words per title: 7.851153495242647


In [13]:

CLEAN_PATH = "data/cleaned_fakenews.csv"
df[['title','clean_title','label','tweet_num']].to_csv(CLEAN_PATH, index=False)
print("Saved ✅ ->", CLEAN_PATH)


Saved ✅ -> data/cleaned_fakenews.csv
