In [1]:
import pandas as pd

# Read each CSV and tag its rows with the class that file represents
fake = pd.read_csv("Fake.csv").assign(label='FAKE')
true = pd.read_csv("True.csv").assign(label='REAL')

# Stack the two frames under a fresh 0..n-1 index, then shuffle the rows
# with a fixed seed so the resulting order is repeatable
stacked = pd.concat([fake, true], axis=0, ignore_index=True)
data = stacked.sample(frac=1, random_state=42)

# Quick look: dimensions first, then the first few rows
print(data.shape)
data.head()

(44898, 5)


Unnamed: 0,title,text,subject,date,label
22216,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",FAKE
27917,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",REAL
25007,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",REAL
1377,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",FAKE
32476,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",REAL


In [2]:
import re
import string

def clean_text(text):
    """Normalise a raw news string before it is vectorised.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URLs; drop
    ``<...>`` tags; drop ASCII punctuation; turn newlines into spaces;
    drop every token that contains a digit.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (the value pandas stores for a
        missing cell in an object column) are treated as empty text.

    Returns
    -------
    str
        The cleaned, lowercased text. Runs of spaces left behind by the
        removals are not collapsed.
    """
    # A missing value must not crash a Series.apply; NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = text.lower()                          # Lowercase
    text = re.sub(r'\[.*?\]', '', text)          # Remove text in brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)           # Remove HTML
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    # Replace (not delete) newlines: deleting them glued the last word of one
    # line to the first word of the next, producing tokens that never occurred.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)         # Remove words with numbers
    return text

# Build the model input as "<title> <body>" and clean it.
# The untouched body is parked in 'text_raw' the first time this cell runs, so
# re-running the cell rebuilds 'text' from the original body instead of
# prepending the title to (and re-cleaning) already-processed text.
if 'text_raw' not in data.columns:
    data['text_raw'] = data['text']

# fillna('') keeps a missing title or body from turning the whole
# concatenation into NaN (a float, not a string).
data['text'] = (
    data['title'].fillna('') + " " + data['text_raw'].fillna('')
).apply(clean_text)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Features are the cleaned text column; targets are the label column
X, y = data['text'], data['label']

# Hold out a quarter of the rows for evaluation (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Fit the TF-IDF vocabulary and weights on the training rows only, then
# project the held-out rows with that same fitted vectorizer.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)


In [4]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a Passive-Aggressive linear classifier on the TF-IDF training matrix.
# The estimator shuffles the training data between epochs; random_state pins
# that shuffling so a fresh Restart & Run All yields the same fitted model.
model = PassiveAggressiveClassifier(max_iter=1000, random_state=42)
model.fit(X_train_tf, y_train)

# Score the held-out split
y_pred = model.predict(X_test_tf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
def predict_news(news_text):
    """Classify a single raw news string with the fitted pipeline.

    The text is passed through ``clean_text``, vectorised with the
    notebook-level ``vectorizer`` and scored by the notebook-level ``model``.

    Parameters
    ----------
    news_text : str
        Raw headline and/or article body.

    Returns
    -------
    The first (and only) element of ``model.predict`` for this text,
    i.e. the predicted label.
    """
    features = vectorizer.transform([clean_text(news_text)])
    return model.predict(features)[0]


Accuracy: 0.9942093541202672

Classification Report:
               precision    recall  f1-score   support

        FAKE       1.00      0.99      0.99      5863
        REAL       0.99      0.99      0.99      5362

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [5]:
def predict_news(news_text):
    """Classify a single raw news string with the fitted pipeline.

    The text is passed through ``clean_text``, vectorised with the
    notebook-level ``vectorizer`` and scored by the notebook-level ``model``.

    Parameters
    ----------
    news_text : str
        Raw headline and/or article body.

    Returns
    -------
    The first (and only) element of ``model.predict`` for this text,
    i.e. the predicted label.
    """
    features = vectorizer.transform([clean_text(news_text)])
    return model.predict(features)[0]


In [6]:
# Smoke-test the classifier on an obviously invented headline
sample_headline = "Breaking: Aliens have landed in New York City!"
predict_news(sample_headline)


'FAKE'

In [7]:
import joblib
# Persist the fitted classifier and the TF-IDF vectorizer it was trained
# with; both files are needed to score new text later. Each artifact is
# written exactly once.
joblib.dump(model, "model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")


['vectorizer.pkl']

In [8]:
import joblib

# Output file names for the two fitted artifacts
MODEL_PATH = "model.pkl"
VECTORIZER_PATH = "vectorizer.pkl"

joblib.dump(model, MODEL_PATH)            # trained classifier
joblib.dump(vectorizer, VECTORIZER_PATH)  # fitted TF-IDF vectorizer


['vectorizer.pkl']