In [7]:
# Install dependencies
!pip install -q scikit-learn pandas numpy joblib

# Upload your dataset files (True.csv and Fake.csv)
from google.colab import files
uploaded = files.upload()


Saving Fake.csv.zip to Fake.csv.zip
Saving True.csv.zip to True.csv.zip


In [9]:
import pandas as pd
import re
import zipfile
import os

# Load datasets
# The upload may deliver zipped archives (True.csv.zip / Fake.csv.zip) or plain
# CSV files. Unpack an archive whenever one is present; otherwise fall back to a
# CSV that already sits in the working directory, and fail early with a clear
# message if neither exists.
for csv_name in ('True.csv', 'Fake.csv'):
    zip_name = csv_name + '.zip'
    if os.path.exists(zip_name):
        with zipfile.ZipFile(zip_name, 'r') as zip_ref:
            zip_ref.extractall()
    elif not os.path.exists(csv_name):
        raise FileNotFoundError(
            f"Neither {zip_name} nor {csv_name} was found; run the upload cell first."
        )

df_true = pd.read_csv('True.csv')
df_fake = pd.read_csv('Fake.csv')

# Add labels
df_true['label'] = 1   # Real news
df_fake['label'] = 0   # Fake news

# Stack both sources, then shuffle with a fixed seed so the row order (and
# therefore every downstream split) is reproducible.
df = (
    pd.concat([df_true, df_fake], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Simple cleaning: combine title and text, lowercase, remove URLs & special chars
def clean_text(s):
    """Normalise one piece of raw text for vectorisation.

    The input is coerced with ``str()``, lower-cased, stripped of URLs
    (``http...`` / ``www....``), reduced to ``a-z``, ``0-9`` and whitespace,
    and finally has every whitespace run collapsed to a single space with
    leading/trailing whitespace removed.

    Args:
        s: Any object; non-strings are converted with ``str()``.

    Returns:
        The cleaned, single-spaced, lower-case string.
    """
    normalised = str(s).lower()
    # Order matters: URLs are removed while they are still intact, before the
    # punctuation pass would split them into separate word fragments.
    for pattern in (r"http\S+|www\.\S+", r"[^a-z0-9\s]"):
        normalised = re.sub(pattern, " ", normalised)
    collapsed = re.sub(r"\s+", " ", normalised)
    return collapsed.strip()

def _text_column(frame, name):
    """Return column `name` of `frame` as strings with missing values as "".

    If the column does not exist, an all-"" Series aligned to `frame` is
    returned so the concatenation below still works.
    """
    if name in frame.columns:
        return frame[name].fillna('').astype(str)
    return pd.Series('', index=frame.index)

# Combine title and body into one cleaned field. Missing values are replaced
# with "" *before* concatenation: NaN + " " stays NaN, and clean_text would turn
# that into the literal string "nan", which dropna() can never remove.
df['text'] = (_text_column(df, 'title') + " " + _text_column(df, 'text')).apply(clean_text)
df = df[['text', 'label']].dropna()
# Rows whose title and text were both missing or blank carry no signal; drop them.
df = df[df['text'].str.len() > 0].reset_index(drop=True)
df.head()

Unnamed: 0,text,label
0,breaking gop chairman grassley has had enough ...,0
1,failed gop candidates remembered in hilarious ...,0
2,mike pence s new dc neighbors are hilariously ...,0
3,california ag pledges to defend birth control ...,1
4,az ranchers living on us mexico border destroy...,0


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Train-test split (stratified so both splits keep the same real/fake ratio)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# TF-IDF vectorizer: fitted on the training split only, then applied to the
# test split, so no test-set vocabulary or document frequencies leak in.
tfidf = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


def fit_and_report(name, model):
    """Fit `model` on the TF-IDF training features and print its test metrics.

    Args:
        name: Heading printed above the classification report.
        model: Unfitted scikit-learn classifier; it is fitted in place.

    Returns:
        The model's predictions for X_test_tfidf.
    """
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    print(f"{name}\n", classification_report(y_test, y_pred), "\nAccuracy:", accuracy_score(y_test, y_pred))
    return y_pred


# Train Logistic Regression
lr = LogisticRegression(max_iter=1000)
y_pred_lr = fit_and_report("Logistic Regression", lr)

# Train Passive-Aggressive Classifier
# random_state pins the per-epoch shuffling of the training data so the
# reported score is the same on every Restart & Run All.
pa = PassiveAggressiveClassifier(max_iter=1000, random_state=42)
y_pred_pa = fit_and_report("Passive-Aggressive", pa)


Logistic Regression
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      4696
           1       0.98      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980
 
Accuracy: 0.9856347438752784
Passive-Aggressive
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      4696
           1       0.99      1.00      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980
 
Accuracy: 0.994543429844098


In [11]:
import joblib
# Persist the fitted vectorizer and both classifiers to the working directory
# so they can be reloaded without retraining. The vectorizer must be saved
# alongside the models: they only understand features produced by it.
joblib.dump(value=tfidf, filename='tfidf.joblib')
joblib.dump(value=lr, filename='logreg.joblib')
# Left as the cell's final expression, so the written path list is displayed.
joblib.dump(value=pa, filename='pa.joblib')


['pa.joblib']

In [12]:
# Demo predictions
# Replace the placeholder with any article text; it is cleaned and vectorized
# with the in-memory vectorizer before both in-memory models score it.
sample_text = "your news text here..."
vec = tfidf.transform([clean_text(sample_text)])
for heading, clf in (("LR predicts:", lr), ("PA predicts:", pa)):
    print(heading, clf.predict(vec))


LR predicts: [0]
PA predicts: [0]


In [14]:
# Load the saved models and vectorizer
# joblib files are pickles: only load artifacts you produced yourself.
tfidf_loaded, lr_loaded, pa_loaded = (
    joblib.load(path)
    for path in ('tfidf.joblib', 'logreg.joblib', 'pa.joblib')
)

# Function to preprocess new text and make predictions
def predict_news_authenticity(news_text):
    """Print a Real/Fake verdict from each reloaded classifier for one text.

    The text is passed through ``clean_text`` and the reloaded TF-IDF
    vectorizer, then scored by the reloaded Logistic Regression and
    Passive-Aggressive models.

    Args:
        news_text: Raw headline and/or article text.

    Returns:
        None. The two verdict lines are printed.
    """
    # Single-row feature matrix, built with the same cleaning step that was
    # applied to the training text.
    features = tfidf_loaded.transform([clean_text(news_text)])

    def verdict(model):
        # Label convention from training: 1 is real news; anything else is
        # reported as fake.
        return "Real" if model.predict(features)[0] == 1 else "Fake"

    lr_result, pa_result = verdict(lr_loaded), verdict(pa_loaded)

    print(f"Logistic Regression Prediction: {lr_result}")
    print(f"Passive-Aggressive Prediction: {pa_result}")

# Example usage with new news headlines/text: each sample is scored in turn.
new_article_text = "BREAKING: hello!"
new_article_text_2 = "China and India is now standing against America."

for article in (new_article_text, new_article_text_2):
    predict_news_authenticity(article)

Logistic Regression Prediction: Fake
Passive-Aggressive Prediction: Fake
Logistic Regression Prediction: Fake
Passive-Aggressive Prediction: Fake


In [15]:
# Example of text likely to be classified as real news
# NOTE(review): the sample opens with a wire-service dateline; the models may
# be keying on that prefix rather than on the content — confirm against the
# training data before trusting this as evidence of generalisation.
real_news_example = (
    "WASHINGTON (Reuters) - The United States and China are set to resume "
    "trade talks next week, according to sources familiar with the matter."
)
predict_news_authenticity(real_news_example)

Logistic Regression Prediction: Real
Passive-Aggressive Prediction: Real
