In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSVs loaded later in this
# notebook are read from paths under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd

def load_fakenewsnet_data(data_dir='/content/drive/MyDrive/FakeNewsNet/dataset', random_state=42):
    """Load the four FakeNewsNet CSVs into one shuffled, labelled DataFrame.

    Reads ``{domain}_{kind}.csv`` from ``data_dir`` for every combination of
    domain (``gossipcop``, ``politifact``) and kind (``fake``, ``real``), and
    tags each frame with two extra columns before concatenating:

    * ``domain`` -- ``'gossipcop'`` or ``'politifact'``
    * ``label``  -- ``0`` for fake articles, ``1`` for real articles

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory containing the four CSV files. Defaults to the Google Drive
        location this notebook was originally written against.
    random_state : int, optional
        Seed for the row shuffle, so repeated runs give the same order.

    Returns
    -------
    pandas.DataFrame
        All rows from the four files, shuffled, with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` if any of the four files is missing.
    """
    import os  # local import: keeps this cell runnable without touching the import cell

    frames = []
    # Iteration order (gossipcop fake/real, then politifact fake/real) fixes the
    # pre-shuffle row order, and therefore the result of the seeded shuffle.
    for domain in ('gossipcop', 'politifact'):
        for kind, label in (('fake', 0), ('real', 1)):
            frame = pd.read_csv(os.path.join(data_dir, f'{domain}_{kind}.csv'))
            frame['domain'] = domain
            frame['label'] = label
            frames.append(frame)

    # Combine into a single DataFrame, then shuffle so the classes and domains
    # are interleaved instead of appearing in four contiguous runs.
    df = pd.concat(frames, ignore_index=True)
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df


In [9]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Fetch every NLTK resource this notebook relies on, in a fixed order.
for _nltk_resource in ('stopwords', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Module-level helpers shared by clean_text: built once, reused on every call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Clean a single text string
def clean_text(text):
    """Normalise one text value into a space-joined string of lemmatised tokens.

    Steps, in order: lower-case, strip ``http...`` URLs, delete every character
    that is not a letter or whitespace, tokenise, drop stop words and tokens of
    two characters or fewer, then lemmatise what is left.

    Parameters
    ----------
    text : str or scalar
        The raw value. Null values (``None``/``NaN``) yield ``""``. Any other
        non-string scalar (e.g. a number) is converted with ``str()`` first
        instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; may be empty.
    """
    if pd.isnull(text):
        return ""
    # Coerce first: a non-null, non-string cell has no .lower() method.
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)  # remove URLs
    # Non-letters are deleted, not replaced by a space, so punctuation-joined
    # words are fused (e.g. "x-files" becomes "xfiles").
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = nltk.word_tokenize(text)
    # The stop-word and length filters run on the raw token, before lemmatisation.
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]
    return " ".join(tokens)

# Add NLP features like sentiment and length
def add_nlp_features(df):
    """Attach cleaned-text, length, sentiment and optional date columns to ``df``.

    The frame is modified in place and the same object is returned.

    Columns written:

    * ``clean_title`` -- ``clean_text`` applied to ``title`` (always).
    * ``clean_text``  -- ``clean_text`` applied to ``text`` (only if ``text`` exists).
    * ``text_len``    -- whitespace-separated token count of the cleaned body,
      or of the cleaned title when there is no ``text`` column.
    * ``text_sentiment`` -- TextBlob polarity of that same cleaned string.
    * ``publish_date`` / ``publish_year`` / ``publish_month`` -- only if
      ``publish_date`` exists; unparseable dates are coerced rather than raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``title`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns above added.
    """
    def _word_count(doc):
        return len(doc.split())

    def _polarity(doc):
        return TextBlob(doc).sentiment.polarity

    df['clean_title'] = df['title'].apply(clean_text)

    # Prefer the article body when present; otherwise fall back to the title.
    has_body = 'text' in df.columns
    if has_body:
        df['clean_text'] = df['text'].apply(clean_text)
    cleaned = df['clean_text'] if has_body else df['clean_title']

    df['text_len'] = cleaned.apply(_word_count)
    df['text_sentiment'] = cleaned.apply(_polarity)

    # Optional: parse publish date
    if 'publish_date' in df.columns:
        parsed = pd.to_datetime(df['publish_date'], errors='coerce')
        df['publish_date'] = parsed
        df['publish_year'] = parsed.dt.year
        df['publish_month'] = parsed.dt.month

    return df


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
# Build the dataset: load the four CSVs, then derive the NLP feature columns.
df = add_nlp_features(load_fakenewsnet_data())

# Quick check: list every column and preview the derived ones.
print("Cleaned Columns:", list(df.columns))
print(df.head().loc[:, ['label', 'clean_title', 'text_len', 'text_sentiment']])

# Optional: save in Colab (relative path, without the index column).
df.to_csv('cleaned_fakenewsnet.csv', index=False)
print("✅ Cleaned dataset saved to: cleaned_fakenewsnet.csv")


Cleaned Columns: ['id', 'news_url', 'title', 'tweet_ids', 'domain', 'label', 'clean_title', 'text_len', 'text_sentiment']
   label                                        clean_title  text_len  \
0      0  bindi irwin get married boyfriend chandler powell         7   
1      1       bob harper howard stern reached heart attack         7   
2      1  guardian galaxy vol cast play guess guardian j...        10   
3      1                       xfiles scully whisper mulder         4   
4      1  today rating show replacing matt lauer hoda ko...        10   

   text_sentiment  
0        0.250000  
1        0.000000  
2        0.136364  
3        0.000000  
4        0.500000  
✅ Cleaned dataset saved to: cleaned_fakenewsnet.csv


In [11]:
from google.colab import files
# Download the cleaned CSV written by the previous cell via Colab's files helper.
files.download('cleaned_fakenewsnet.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>