In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

# --- Load the datasets ---
# Assuming files are uploaded to your Colab session storage
try:
    df_fake = pd.read_csv('Fake.csv')
    df_true = pd.read_csv('True.csv')
    print("✅ DataFrames loaded successfully.")
except FileNotFoundError:
    print("❌ Error: Ensure 'Fake.csv' and 'True.csv' are uploaded to your Colab environment.")

# Display initial information
print(f"Fake News Count: {len(df_fake)}")
print(f"True News Count: {len(df_true)}")

✅ DataFrames loaded successfully.
Fake News Count: 23481
True News Count: 21417


In [2]:
# Add label column
df_fake['label'] = 0
df_true['label'] = 1

# Combine datasets and shuffle the order for good measure
df_combined = pd.concat([df_fake, df_true], ignore_index=True, sort=False)
df_combined = df_combined.sample(frac=1).reset_index(drop=True) # Shuffle data

print("✅ DataFrames combined and labeled.")
print("Combined DataFrame Sample:")
print(df_combined.head(3))

✅ DataFrames combined and labeled.
Combined DataFrame Sample:
                                               title  \
0  Malaysia ready to provide temporary shelter fo...   
1  Exclusive: Cameroonian troops entered Nigeria ...   
2  INTEL CHAIR Stands Up To Dems Calling For His ...   

                                                text          subject  \
0  KUALA LUMPUR (Reuters) - Malaysia s coast guar...        worldnews   
1  ABUJA/DAKAR (Reuters) - Cameroonian troops thi...        worldnews   
2  .@Rep_DevinNunes:  I m sure that @TheDemocrats...  Government News   

                 date  label  
0  September 8, 2017       1  
1  December 20, 2017       1  
2        Mar 27, 2017      0  


In [3]:
# Step 2: Drop unused columns and create 'full_text' feature
df_combined = df_combined.drop(['subject', 'date'], axis=1)

# Combine 'title' and 'text' to form the feature the model will analyze
df_combined['full_text'] = df_combined['title'] + ' ' + df_combined['text']

# Step 3: Define and apply the text cleaning function
def clean_text(text):
    # 1. Lowercase
    text = text.lower()
    # 2. Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # 3. Remove punctuation and numbers (keeping only letters and spaces)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    return text

df_combined['cleaned_text'] = df_combined['full_text'].apply(clean_text)

print("✅ Text feature created and cleaned.")
print("\nComparison (Original vs. Cleaned):")
print(df_combined[['full_text', 'cleaned_text']].iloc[0].T)

✅ Text feature created and cleaned.

Comparison (Original vs. Cleaned):
full_text       Malaysia ready to provide temporary shelter fo...
cleaned_text    malaysia ready to provide temporary shelter fo...
Name: 0, dtype: object


In [4]:
# Step 4: Separate Features (X) and Target (y)
X = df_combined['cleaned_text'] # Input features (cleaned text)
y = df_combined['label']      # Target variable (0 or 1)

# Step 5: Split the data (80% Training, 20% Testing)
# stratify=y ensures the 0/1 ratio is maintained in both train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\n✅ Final Data Split Status:")
print(f"Total Samples: {len(df_combined)}")
print(f"Training Samples: {len(X_train)} (80%)")
print(f"Testing Samples: {len(X_test)} (20%)")


✅ Final Data Split Status:
Total Samples: 44898
Training Samples: 35918 (80%)
Testing Samples: 8980 (20%)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
# max_features=10000 ensures we only use the 10,000 most common words, reducing complexity
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)

# Fit the vectorizer ONLY on the training data (X_train)
X_train_vectorized = vectorizer.fit_transform(X_train)

# Transform the test data using the fitted vectorizer
X_test_vectorized = vectorizer.transform(X_test)

print("✅ TF-IDF Vectorization complete.")
print(f"Shape of Training Matrix (Samples, Features): {X_train_vectorized.shape}")
print(f"Shape of Testing Matrix (Samples, Features): {X_test_vectorized.shape}")

✅ TF-IDF Vectorization complete.
Shape of Training Matrix (Samples, Features): (35918, 10000)
Shape of Testing Matrix (Samples, Features): (8980, 10000)


In [6]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the model
model = LogisticRegression(random_state=42)

print("⏳ Training Logistic Regression Model...")
model.fit(X_train_vectorized, y_train)
print("✅ Model training complete.")

⏳ Training Logistic Regression Model...
✅ Model training complete.


In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Make predictions on the test data
y_pred = model.predict(X_test_vectorized)

# Calculate key performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\n--- Model Evaluation Results ---")
print(f"Accuracy: {accuracy:.4f} (Overall Correctness)")
print(f"Precision: {precision:.4f} (When model predicts REAL, how often is it right?)")
print(f"Recall: {recall:.4f} (Out of all REAL news, how many did the model find?)")
print(f"F1 Score: {f1:.4f} (Balance between Precision and Recall)")
print("\nConfusion Matrix (True vs Predicted):")
print(conf_matrix)


--- Model Evaluation Results ---
Accuracy: 0.9899 (Overall Correctness)
Precision: 0.9884 (When model predicts REAL, how often is it right?)
Recall: 0.9904 (Out of all REAL news, how many did the model find?)
F1 Score: 0.9894 (Balance between Precision and Recall)

Confusion Matrix (True vs Predicted):
[[4646   50]
 [  41 4243]]


In [8]:
import pickle
import os

# Define the filenames for your saved assets
# These files will go into your local newsguard-ai/models/ folder
model_filename = 'fake_news_model.pkl'
vectorizer_filename = 'fake_news_vectorizer.pkl'

# --- Save the Trained Model ---
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)
print(f"✅ Trained Model saved as: {model_filename}")

# --- Save the Fitted Vectorizer (REQUIRED for new predictions) ---
with open(vectorizer_filename, 'wb') as file:
    pickle.dump(vectorizer, file)
print(f"✅ Fitted Vectorizer saved as: {vectorizer_filename}")

# IMPORTANT: You must manually download these two files
# (fake_news_model.pkl and fake_news_vectorizer.pkl)
# from the Colab file pane (left-hand sidebar) to your local machine!

✅ Trained Model saved as: fake_news_model.pkl
✅ Fitted Vectorizer saved as: fake_news_vectorizer.pkl
