In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\phill\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\phill\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phill\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [7]:
# Load both datasets
print("Loading datasets...")
try:
    fake_df = pd.read_csv(r'C:\Users\phill\Downloads\Dectection_System\data\Fake.csv')
    true_df = pd.read_csv(r'C:\Users\phill\Downloads\Dectection_System\data\True.csv')
    
    print("Fake dataset shape:", fake_df.shape)
    print("True dataset shape:", true_df.shape)
    
    # Display basic info about both datasets
    print("\nFake dataset info:")
    print(fake_df.info())
    print("\nTrue dataset info:")
    print(true_df.info())
    
    print("\nFake dataset columns:", fake_df.columns.tolist())
    print("True dataset columns:", true_df.columns.tolist())
    
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    print("Please ensure both 'Fake.csv' and 'True.csv' are in the current directory")
    # Create sample data for demonstration
    fake_df = pd.DataFrame({
        'title': ['Breaking: Shocking conspiracy theory exposed!'],
        'text': ['This unbelievable revelation will change everything you know!'],
        'subject': ['conspiracy'],
        'date': ['December 31, 2017']
    })
    true_df = pd.DataFrame({
        'title': ['Scientific study shows positive results'],
        'text': ['Researchers conducted a thorough analysis of the data.'],
        'subject': ['science'],
        'date': ['January 15, 2018']
    })

Loading datasets...
Fake dataset shape: (23481, 4)
True dataset shape: (21417, 4)

Fake dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB
None

True dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB
None

Fake dataset columns: ['title', 'text', 'subject', 'date']
True dataset columns: ['title', 'text', 'subject', 'date']


In [4]:
def explore_dataset(df, dataset_name):
    """Explore basic statistics of a dataset"""
    print(f"\n{'='*50}")
    print(f"EXPLORING {dataset_name.upper()} DATASET")
    print(f"{'='*50}")
    
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    
    print("\nMissing values:")
    print(df.isnull().sum())
    
    print("\nSubject distribution:")
    print(df['subject'].value_counts())
    
    print("\nSample titles:")
    for i, title in enumerate(df['title'].head(5)):
        print(f"{i+1}. {title}")
    
    # Basic text statistics
    df['text_length'] = df['text'].apply(lambda x: len(str(x)))
    df['title_length'] = df['title'].apply(lambda x: len(str(x)))
    df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))
    
    print(f"\nText length - Mean: {df['text_length'].mean():.0f}, Std: {df['text_length'].std():.0f}")
    print(f"Title length - Mean: {df['title_length'].mean():.0f}, Std: {df['title_length'].std():.0f}")
    print(f"Word count - Mean: {df['word_count'].mean():.0f}, Std: {df['word_count'].std():.0f}")

# Explore both datasets
explore_dataset(fake_df, "FAKE")
explore_dataset(true_df, "TRUE")


EXPLORING FAKE DATASET
Shape: (23481, 4)
Columns: ['title', 'text', 'subject', 'date']

Missing values:
title      0
text       0
subject    0
date       0
dtype: int64

Subject distribution:
subject
News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: count, dtype: int64

Sample titles:
1.  Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing
2.  Drunk Bragging Trump Staffer Started Russian Collusion Investigation
3.  Sheriff David Clarke Becomes An Internet Joke For Threatening To Poke People ‘In The Eye’
4.  Trump Is So Obsessed He Even Has Obama’s Name Coded Into His Website (IMAGES)
5.  Pope Francis Just Called Out Donald Trump During His Christmas Speech

Text length - Mean: 2547, Std: 2533
Title length - Mean: 94, Std: 27
Word count - Mean: 423, Std: 408

EXPLORING TRUE DATASET
Shape: (21417, 4)
Columns: ['title', 'text', 'subject', 'date']

Missing values:

In [8]:
def enhanced_clean_text(text):
    """
    Enhanced text cleaning and preprocessing
    """
    if pd.isna(text):
        return ""
    
    # Convert to string and lowercase
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove social media elements
    text = re.sub(r'@\w+|#\w+', '', text)
    
    # Remove special characters and numbers but keep basic punctuation for context
    text = re.sub(r'[^\w\s\.\,\!\?]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Tokenize
    tokens = text.split()
    
    # Lemmatize and remove stopwords
    tokens = [lemmatizer.lemmatize(token) for token in tokens 
              if token not in stop_words and len(token) > 2]
    
    return ' '.join(tokens)

def extract_features(df):
    """
    Extract comprehensive features from the dataset
    """
    # Text cleaning
    df['cleaned_text'] = df['text'].apply(enhanced_clean_text)
    df['cleaned_title'] = df['title'].apply(enhanced_clean_text)
    
    # Combined features
    df['combined_text'] = df['cleaned_title'] + ' ' + df['cleaned_text']
    
    # Numerical features
    df['text_length'] = df['text'].apply(lambda x: len(str(x)))
    df['title_length'] = df['title'].apply(lambda x: len(str(x)))
    df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))
    df['avg_word_length'] = df['text'].apply(lambda x: np.mean([len(word) for word in str(x).split()]) 
                                            if len(str(x).split()) > 0 else 0)
    
    # Sentiment indicators (simple version)
    df['has_exclamation'] = df['title'].apply(lambda x: 1 if '!' in str(x) else 0)
    df['has_question'] = df['title'].apply(lambda x: 1 if '?' in str(x) else 0)
    df['title_uppercase_ratio'] = df['title'].apply(
        lambda x: sum(1 for c in str(x) if c.isupper()) / max(1, len(str(x)))
    )
    
    # Date features
    df['has_date'] = df['date'].apply(lambda x: 1 if pd.notna(x) and str(x).strip() != '' else 0)
    
    return df

print("Preprocessing fake dataset...")
fake_df = extract_features(fake_df)
fake_df['label'] = 0  # 0 for fake news

print("Preprocessing true dataset...")
true_df = extract_features(true_df)
true_df['label'] = 1  # 1 for true news

print("Preprocessing completed!")

Preprocessing fake dataset...
Preprocessing true dataset...
Preprocessing completed!


In [9]:
# Combine datasets
combined_df = pd.concat([fake_df, true_df], ignore_index=True)

print(f"Combined dataset shape: {combined_df.shape}")
print(f"Fake news count: {len(combined_df[combined_df['label'] == 0])}")
print(f"True news count: {len(combined_df[combined_df['label'] == 1])}")

# Check for any duplicates
print(f"Duplicate rows: {combined_df.duplicated().sum()}")

# Remove duplicates if any
combined_df = combined_df.drop_duplicates()
print(f"Dataset shape after removing duplicates: {combined_df.shape}")

Combined dataset shape: (44898, 16)
Fake news count: 23481
True news count: 21417
Duplicate rows: 209
Dataset shape after removing duplicates: (44689, 16)


In [10]:
# Save the cleaned dataset for use in separate files
import pandas as pd
import joblib

# Save the cleaned dataset
cleaned_data_path = r'C:\Users\phill\Downloads\Dectection_System\data\cleaned_news_dataset.csv'
combined_df.to_csv(cleaned_data_path, index=False)
print(f"✅ Cleaned dataset saved as: {cleaned_data_path}")

# Save the vectorizer and label encoder if needed
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
print("✅ TF-IDF vectorizer saved")

# Also save the train-test split indices for consistency
split_data = {
    'X_train': X_train.index.tolist(),
    'X_test': X_test.index.tolist(),
    'y_train': y_train.index.tolist(), 
    'y_test': y_test.index.tolist()
}
joblib.dump(split_data, 'train_test_split.pkl')
print("✅ Train-test split indices saved")

# Display dataset info
print(f"\n📊 CLEANED DATASET INFO:")
print(f"Total samples: {len(combined_df):,}")
print(f"Fake news: {len(combined_df[combined_df['label'] == 0]):,}")
print(f"True news: {len(combined_df[combined_df['label'] == 1]):,}")
print(f"Features: {combined_df.shape[1]}")
print(f"File size: {cleaned_data_path}")

✅ Cleaned dataset saved as: C:\Users\phill\Downloads\Dectection_System\data\cleaned_news_dataset.csv


NameError: name 'tfidf_vectorizer' is not defined