In [1]:
# Fake News Detection – Baseline Models

In [2]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [3]:
import pandas as pd
from pathlib import Path

# Function to load data safely
def load_data(file_path, fallback_path):
    try:
        if Path(file_path).exists():
            df = pd.read_csv(file_path, sep="\t")
        else:
            df = pd.read_csv(fallback_path, sep="\t")
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

    # Ensure clean_statement exists and isn't empty
    if "clean_statement" not in df.columns:
        df["clean_statement"] = df.get("statement", "")
    df["clean_statement"] = df["clean_statement"].fillna("")  # Fill any NA values

    return df

# Load datasets
data_dir = "../data/"
processed_dir = data_dir + "processed/"
raw_dir = data_dir + "raw/"

train_df = load_data(processed_dir + "train_clean.tsv", raw_dir + "train.tsv")
valid_df = load_data(processed_dir + "valid_clean.tsv", raw_dir + "valid.tsv")
test_df = load_data(processed_dir + "test_clean.tsv", raw_dir + "test.tsv")

print("train shape:", train_df.shape)

# First check if 'label' column exists before accessing it
if 'label' in train_df.columns:
    print("labels:", train_df["label"].unique())
else:
    print("Warning: 'label' column not found in DataFrame. Available columns:", train_df.columns.tolist())

train_df.head(3)

train shape: (10239, 15)


Unnamed: 0,2635.json,false,Says the Annies List political group supports third-trimester abortions on demand.,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0.1,0.2,0.3,a mailer,clean_statement
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,


In [4]:
print("Train columns:", train_df.columns.tolist())
print("Valid columns:", valid_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())

Train columns: ['2635.json', 'false', 'Says the Annies List political group supports third-trimester abortions on demand.', 'abortion', 'dwayne-bohac', 'State representative', 'Texas', 'republican', '0', '1', '0.1', '0.2', '0.3', 'a mailer', 'clean_statement']
Valid columns: ['12134.json', 'barely-true', 'We have less Americans working now than in the 70s.', 'economy,jobs', 'vicky-hartzler', 'U.S. Representative', 'Missouri', 'republican', '1', '0', '1.1', '0.1', '0.2', 'an interview with ABC17 News', 'clean_statement']
Test columns: ['11972.json', 'true', 'Building a wall on the U.S.-Mexico border will take literally years.', 'immigration', 'rick-perry', 'Governor', 'Texas', 'republican', '30', '30.1', '42', '23', '18', 'Radio interview', 'clean_statement']


In [5]:
# Split Variables using the correct column names
# The label appears to be the second column (index 1) in each DataFrame
X_train = train_df['clean_statement']
y_train = train_df.iloc[:, 1]  # Get second column by position
X_valid = valid_df['clean_statement']
y_valid = valid_df.iloc[:, 1]
X_test = test_df['clean_statement']
y_test = test_df.iloc[:, 1]

# Verify the splits
print("\nData splits:")
print(f"Training samples: {len(X_train)} (Labels: {y_train.unique().tolist()})")
print(f"Validation samples: {len(X_valid)} (Labels: {y_valid.unique().tolist()})")
print(f"Test samples: {len(X_test)} (Labels: {y_test.unique().tolist()})")

# Show sample data
print("\nSample training data:")
print(pd.DataFrame({'text': X_train.head(3), 'label': y_train.head(3)}))


Data splits:
Training samples: 10239 (Labels: ['half-true', 'mostly-true', 'false', 'true', 'barely-true', 'pants-fire'])
Validation samples: 1283 (Labels: ['pants-fire', 'false', 'half-true', 'true', 'barely-true', 'mostly-true'])
Test samples: 1266 (Labels: ['false', 'half-true', 'pants-fire', 'true', 'barely-true', 'mostly-true'])

Sample training data:
  text        label
0         half-true
1       mostly-true
2             false
