In [1]:
# Fake News Detection – Baseline Models

In [2]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [3]:
import pandas as pd
from pathlib import Path

# Function to load data safely
def load_data(file_path, fallback_path):
    try:
        if Path(file_path).exists():
            df = pd.read_csv(file_path, sep="\t")
        else:
            df = pd.read_csv(fallback_path, sep="\t")
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

    # Ensure clean_statement exists and isn't empty
    if "clean_statement" not in df.columns:
        df["clean_statement"] = df.get("statement", "")
    df["clean_statement"] = df["clean_statement"].fillna("")  # Fill any NA values

    return df

# Load datasets
data_dir = "../data/"
processed_dir = data_dir + "processed/"
raw_dir = data_dir + "raw/"

train_df = load_data(processed_dir + "train_clean.tsv", raw_dir + "train.tsv")
valid_df = load_data(processed_dir + "valid_clean.tsv", raw_dir + "valid.tsv")
test_df = load_data(processed_dir + "test_clean.tsv", raw_dir + "test.tsv")

print("train shape:", train_df.shape)

# First check if 'label' column exists before accessing it
if 'label' in train_df.columns:
    print("labels:", train_df["label"].unique())
else:
    print("Warning: 'label' column not found in DataFrame. Available columns:", train_df.columns.tolist())

train_df.head(3)

train shape: (10239, 15)


Unnamed: 0,2635.json,false,Says the Annies List political group supports third-trimester abortions on demand.,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0.1,0.2,0.3,a mailer,clean_statement
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,


In [4]:
print("Train columns:", train_df.columns.tolist())
print("Valid columns:", valid_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())

Train columns: ['2635.json', 'false', 'Says the Annies List political group supports third-trimester abortions on demand.', 'abortion', 'dwayne-bohac', 'State representative', 'Texas', 'republican', '0', '1', '0.1', '0.2', '0.3', 'a mailer', 'clean_statement']
Valid columns: ['12134.json', 'barely-true', 'We have less Americans working now than in the 70s.', 'economy,jobs', 'vicky-hartzler', 'U.S. Representative', 'Missouri', 'republican', '1', '0', '1.1', '0.1', '0.2', 'an interview with ABC17 News', 'clean_statement']
Test columns: ['11972.json', 'true', 'Building a wall on the U.S.-Mexico border will take literally years.', 'immigration', 'rick-perry', 'Governor', 'Texas', 'republican', '30', '30.1', '42', '23', '18', 'Radio interview', 'clean_statement']


In [5]:
# Split Variables using the correct column names
# The label appears to be the second column (index 1) in each DataFrame
X_train = train_df['clean_statement']
y_train = train_df.iloc[:, 1]  # Get second column by position
X_valid = valid_df['clean_statement']
y_valid = valid_df.iloc[:, 1]
X_test = test_df['clean_statement']
y_test = test_df.iloc[:, 1]

# Verify the splits
print("\nData splits:")
print(f"Training samples: {len(X_train)} (Labels: {y_train.unique().tolist()})")
print(f"Validation samples: {len(X_valid)} (Labels: {y_valid.unique().tolist()})")
print(f"Test samples: {len(X_test)} (Labels: {y_test.unique().tolist()})")

# Show sample data
print("\nSample training data:")
print(pd.DataFrame({'text': X_train.head(3), 'label': y_train.head(3)}))


Data splits:
Training samples: 10239 (Labels: ['half-true', 'mostly-true', 'false', 'true', 'barely-true', 'pants-fire'])
Validation samples: 1283 (Labels: ['pants-fire', 'false', 'half-true', 'true', 'barely-true', 'mostly-true'])
Test samples: 1266 (Labels: ['false', 'half-true', 'pants-fire', 'true', 'barely-true', 'mostly-true'])

Sample training data:
  text        label
0         half-true
1       mostly-true
2             false


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer  # Fixed typo in "extraction"

# First verify your input data
print("Verifying input data...")
print(f"X_train sample: {X_train.iloc[0] if hasattr(X_train, 'iloc') else X_train[0]}")
print(f"X_train type: {type(X_train)}")

# TF-IDF Vectorization with error handling
try:
    tfidf = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        stop_words="english",
        lowercase=True,
        analyzer='word',
        min_df=5,
        max_df=0.7
    )

    # Ensure data is in correct format (convert to list if needed)
    if hasattr(X_train, 'values'):
        X_train_text = X_train.values.astype('U')  # Convert to Unicode
    else:
        X_train_text = list(X_train)
    
    X_train_tfidf = tfidf.fit_transform(X_train_text)
    X_valid_tfidf = tfidf.transform(X_valid)
    X_test_tfidf = tfidf.transform(X_test)

    print("\nTF-IDF Vectorization Successful!")
    print("Train shape:", X_train_tfidf.shape)
    print("Validation shape:", X_valid_tfidf.shape)
    print("Test shape:", X_test_tfidf.shape)
    
    # Get feature names safely
    try:
        features = tfidf.get_feature_names_out()
        print(f"\nNumber of features: {len(features)}")
        print("Sample features:", features[:20])
    except AttributeError:
        # For older sklearn versions
        features = tfidf.get_feature_names()
        print(f"\nNumber of features: {len(features)}")
        print("Sample features:", features[:20])

except Exception as e:
    print(f"\nError during TF-IDF vectorization: {str(e)}")
    print("Troubleshooting steps:")
    print("1. Verify X_train contains text data")
    print("2. Check for None or NaN values in text")
    print("3. Ensure sklearn version >= 0.24.0 for get_feature_names_out()")
    print(f"Sample data check: {X_train[:1] if len(X_train) > 0 else 'Empty data!'}")

Verifying input data...
X_train sample: 
X_train type: <class 'pandas.core.series.Series'>

Error during TF-IDF vectorization: empty vocabulary; perhaps the documents only contain stop words
Troubleshooting steps:
1. Verify X_train contains text data
2. Check for None or NaN values in text
3. Ensure sklearn version >= 0.24.0 for get_feature_names_out()
Sample data check: 0    
Name: clean_statement, dtype: object


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer  # Fixed typo in "extraction"

# First verify your input data
print("Verifying input data...")
print(f"X_train sample: {X_train.iloc[0] if hasattr(X_train, 'iloc') else X_train[0]}")
print(f"X_train type: {type(X_train)}")

# TF-IDF Vectorization with error handling
try:
    tfidf = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        stop_words="english",
        lowercase=True,
        analyzer='word',
        min_df=5,
        max_df=0.7
    )

    # Ensure data is in correct format (convert to list if needed)
    if hasattr(X_train, 'values'):
        X_train_text = X_train.values.astype('U')  # Convert to Unicode
    else:
        X_train_text = list(X_train)
    
    X_train_tfidf = tfidf.fit_transform(X_train_text)
    X_valid_tfidf = tfidf.transform(X_valid)
    X_test_tfidf = tfidf.transform(X_test)

    print("\nTF-IDF Vectorization Successful!")
    print("Train shape:", X_train_tfidf.shape)
    print("Validation shape:", X_valid_tfidf.shape)
    print("Test shape:", X_test_tfidf.shape)
    
    # Get feature names safely
    try:
        features = tfidf.get_feature_names_out()
        print(f"\nNumber of features: {len(features)}")
        print("Sample features:", features[:20])
    except AttributeError:
        # For older sklearn versions
        features = tfidf.get_feature_names()
        print(f"\nNumber of features: {len(features)}")
        print("Sample features:", features[:20])

except Exception as e:
    print(f"\nError during TF-IDF vectorization: {str(e)}")
    print("Troubleshooting steps:")
    print("1. Verify X_train contains text data")
    print("2. Check for None or NaN values in text")
    print("3. Ensure sklearn version >= 0.24.0 for get_feature_names_out()")
    print(f"Sample data check: {X_train[:1] if len(X_train) > 0 else 'Empty data!'}")

Verifying input data...
X_train sample: 
X_train type: <class 'pandas.core.series.Series'>

Error during TF-IDF vectorization: empty vocabulary; perhaps the documents only contain stop words
Troubleshooting steps:
1. Verify X_train contains text data
2. Check for None or NaN values in text
3. Ensure sklearn version >= 0.24.0 for get_feature_names_out()
Sample data check: 0    
Name: clean_statement, dtype: object


In [9]:
# 3. TF-IDF Vectorization - Robust Version
print("Creating TF-IDF features...")

# First check if input data exists and is valid
if len(X_train) == 0:
    raise ValueError("Training data is empty - cannot create features")

# Initialize vectorizer with robust parameters
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),          # Use both unigrams and bigrams
    stop_words='english',        # Remove common English words
    min_df=3,                    # Minimum 3 documents for a term
    max_df=0.85,                 # Maximum 85% document frequency
    lowercase=True,              # Convert to lowercase
    analyzer='word',             # Tokenize by words
    token_pattern=r'(?u)\b\w+\b' # Include apostrophes in words
)

try:
    # Ensure text data is properly formatted as strings
    X_train_text = X_train.astype(str).fillna('')  # Handle NaN values
    X_valid_text = X_valid.astype(str).fillna('')
    
    # Fit and transform training data
    X_train_tfidf = tfidf.fit_transform(X_train_text)
    
    # Check if vocabulary was created
    if len(tfidf.vocabulary_) == 0:
        # Fallback to simpler parameters if no features were generated
        print("Warning: No features generated - trying fallback parameters")
        tfidf = TfidfVectorizer(
            ngram_range=(1, 1),
            stop_words=None,
            min_df=1,
            max_df=1.0,
            lowercase=True
        )
        X_train_tfidf = tfidf.fit_transform(X_train_text)
    
    # Transform validation data
    X_valid_tfidf = tfidf.transform(X_valid_text)
    
    print(f"Successfully created {len(tfidf.vocabulary_)} features")
    print(f"Feature matrix shape: {X_train_tfidf.shape}")

except Exception as e:
    print(f"Error in TF-IDF vectorization: {str(e)}")
    # Provide troubleshooting suggestions
    print("Troubleshooting tips:")
    print("1. Check if X_train contains valid text data")
    print("2. Verify there are no empty strings or problematic characters")
    print("3. Try reducing min_df or increasing max_df parameters")
    print("4. Examine sample text:", X_train.head(3).values)
    raise

Creating TF-IDF features...


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [10]:
#Logistic Regression Evaluation
y_pred_valid = log_reg_cv.predict(X_valid_tfidf)

print("Logistic Regression Validation Results:")
print(classification_report(y_valid, y_pred_valid))

cm = confusion_matrix(y_valid, y_pred_valid, labels=log_reg_cv.classes_)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=log_reg_cv.classes_, yticklabels=log_reg_cv.classes_)
plt.title("Confusion Matrix - Logistic Regression")
plt.savefig("../results/confusion_logistic.png")
plt.show()


NameError: name 'log_reg_cv' is not defined

In [None]:
#Random Forest Evaluation
y_pred_valid_rf = rf_cv.predict(X_valid_tfidf)

print("Random Forest Validation Results:")
print(classification_report(y_valid, y_pred_valid_rf))

cm = confusion_matrix(y_valid, y_pred_valid_rf, labels=rf_cv.classes_)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", xticklabels=rf_cv.classes_, yticklabels=rf_cv.classes_)
plt.title("Confusion Matrix - Random Forest")
plt.savefig("../results/confusion_rf.png")
plt.show()

In [None]:
#Test Evaluation (
print("===== Final Evaluation on Test Set =====")

print("\n[Logistic Regression]")
y_pred_test = log_reg_cv.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_test))

print("\n[Random Forest]")
y_pred_test_rf = rf_cv.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_test_rf))


In [None]:
#Save Models & Vectorize
# Save vectorizer and best models
pickle.dump(tfidf, open("../models/tfidf_vectorizer.pkl", "wb"))
pickle.dump(log_reg_cv.best_estimator_, open("../models/tfidf_logistic.pkl", "wb"))
pickle.dump(rf_cv.best_estimator_, open("../models/tfidf_rf.pkl", "wb"))

print("✅ Models and vectorizer saved to /models/")
