In [1]:
import pandas as pd


In [2]:
fake_df = pd.read_csv("../data/raw/Fake.csv")


In [3]:
true_df = pd.read_csv("../data/raw/True.csv")


In [4]:
print(fake_df.shape)
print(true_df.shape)

fake_df.head()


(23481, 4)
(21417, 4)


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
print(fake_df.columns)
print(true_df.columns)


Index(['title', 'text', 'subject', 'date'], dtype='object')
Index(['title', 'text', 'subject', 'date'], dtype='object')


In [6]:
fake_df["label"] = 1
true_df["label"] = 0


In [7]:
fake_df.head()


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [8]:
import pandas as pd

df = pd.concat([fake_df, true_df], axis=0)


In [9]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


In [10]:
print(df.shape)
df.head()


(44898, 5)


Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0


In [11]:
df.columns


Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [12]:
import re


In [13]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text


In [14]:
df["clean_text"] = df["text"].apply(clean_text)


In [15]:
df[["text", "clean_text"]].head()


Unnamed: 0,text,clean_text
0,"21st Century Wire says Ben Stein, reputable pr...",st century wire says ben stein reputable profe...
1,WASHINGTON (Reuters) - U.S. President Donald T...,washington reuters us president donald trump ...
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,reuters puerto rico governor ricardo rossello...
3,"On Monday, Donald Trump once again embarrassed...",on monday donald trump once again embarrassed ...
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",glasgow scotland reuters most us presidential...


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [17]:
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english"
)


In [18]:

X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]



In [19]:
print(X.shape)
print(y.shape)


(44898, 5000)
(44898,)


In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [21]:
from sklearn.linear_model import LogisticRegression


In [23]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [24]:
y_pred = model.predict(X_test)


In [25]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9867483296213808


In [26]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4270
           1       0.99      0.98      0.99      4710

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [27]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)


array([[4230,   40],
       [  79, 4631]], dtype=int64)

In [28]:
import joblib


In [30]:
import joblib

joblib.dump(model, "../models/fake_news_model.pkl")


['../models/fake_news_model.pkl']

In [31]:
joblib.dump(vectorizer, "../models/tfidf_vectorizer.pkl")


['../models/tfidf_vectorizer.pkl']

In [32]:
def explain_prediction(text, prediction):
    if prediction == 1:
        return (
            "This news article is likely FAKE because it contains emotionally "
            "charged language, lacks verified sources, and matches patterns "
            "commonly seen in misinformation."
        )
    else:
        return (
            "This news article is likely REAL because it uses factual language, "
            "follows a neutral tone, and resembles content from reliable news sources."
        )


In [33]:
sample_text = df["clean_text"].iloc[0]

sample_vector = vectorizer.transform([sample_text])
sample_pred = model.predict(sample_vector)[0]

print("Prediction:", "Fake" if sample_pred == 1 else "Real")
print("Explanation:", explain_prediction(sample_text, sample_pred))


Prediction: Fake
Explanation: This news article is likely FAKE because it contains emotionally charged language, lacks verified sources, and matches patterns commonly seen in misinformation.
