In [12]:
import pandas as pd

# Read the two source CSVs; both are expected in the working directory.
fake = pd.read_csv("Fake.csv")
real = pd.read_csv("True.csv")

# Tag each frame with its class so the origin of every row survives the merge.
fake['label'], real['label'] = 'FAKE', 'REAL'

# Stack fake on top of real and renumber the rows 0..n-1 in one step.
data = pd.concat([fake, real], ignore_index=True)

# Preview the first 5 rows (last expression, so the notebook renders it).
data.head()


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",FAKE
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",FAKE
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",FAKE
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",FAKE
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",FAKE


In [3]:
# Re-read both CSVs from disk, rebinding `fake` and `real` to fresh frames.
# NOTE(review): when the cells run top to bottom this discards the 'label'
# column added to these two frames earlier; `data` is a separate object and is
# not affected. This cell looks redundant - confirm before removing.
fake, real = (pd.read_csv(csv_name) for csv_name in ("Fake.csv", "True.csv"))


In [4]:
import nltk
from nltk.corpus import stopwords
import re

# Download the NLTK stopword corpus (a no-op message is printed if it is
# already present locally).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # a set keeps membership tests cheap

# Combine title and text into a single document per article.
# In pandas, NaN + str is NaN, so a row missing either field would lose its
# content entirely; replacing missing values with '' keeps whatever is present.
data['content'] = data['title'].fillna('') + " " + data['text'].fillna('')

# Function to clean text
def clean_text(text, stop_word_set=None):
    """Normalize raw article text into a space-joined string of content words.

    The text is lowercased, every character other than ASCII ``a-z`` and
    whitespace is deleted (so "cat's" becomes "cats"), the result is split on
    whitespace, and stopwords are dropped.

    Args:
        text: Raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.
        stop_word_set: Optional collection of words to remove. When omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text, or ``""`` when no words survive.
    """
    # NaN is the only value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stop_word_set is None:
        stop_word_set = stop_words

    lowered = str(text).lower()
    # Punctuation and digits are deleted, not replaced by a space.
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return " ".join(
        word for word in letters_only.split() if word not in stop_word_set
    )

# Run every combined document through clean_text, one element at a time.
data['cleaned_content'] = data['content'].map(clean_text)

# Show raw vs. cleaned text side by side for the first 5 rows.
preview_columns = ['content', 'cleaned_content', 'label']
data[preview_columns].head()


[nltk_data] Downloading package stopwords to C:\Users\Smit
[nltk_data]     Patil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,content,cleaned_content,label
0,Donald Trump Sends Out Embarrassing New Year’...,donald trump sends embarrassing new years eve ...,FAKE
1,Drunk Bragging Trump Staffer Started Russian ...,drunk bragging trump staffer started russian c...,FAKE
2,Sheriff David Clarke Becomes An Internet Joke...,sheriff david clarke becomes internet joke thr...,FAKE
3,Trump Is So Obsessed He Even Has Obama’s Name...,trump obsessed even obamas name coded website ...,FAKE
4,Pope Francis Just Called Out Donald Trump Dur...,pope francis called donald trump christmas spe...,FAKE


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Model input is the cleaned text; the target is the FAKE/REAL tag.
X, y = data['cleaned_content'], data['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the FAKE/REAL
# ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split.
tfidf = TfidfVectorizer(max_df=0.7, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Both matrices must have the same number of columns.
print("X_train_tfidf shape:", X_train_tfidf.shape)
print("X_test_tfidf shape:", X_test_tfidf.shape)


X_train_tfidf shape: (35918, 193020)
X_test_tfidf shape: (8980, 193020)


In [6]:
from sklearn.linear_model import PassiveAggressiveClassifier

# The classifier shuffles the training data between epochs (shuffle=True is the
# default), so an explicit seed is needed for the fitted model to be the same
# on every run. 42 matches the seed used for the train/test split.
model = PassiveAggressiveClassifier(max_iter=50, random_state=42)
model.fit(X_train_tfidf, y_train)


0,1,2
,C,1.0
,fit_intercept,True
,max_iter,50
,tol,0.001
,early_stopping,False
,validation_fraction,0.1
,n_iter_no_change,5
,shuffle,True
,verbose,0
,loss,'hinge'


In [7]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialize and train the classifier. The training data is shuffled between
# epochs (shuffle=True is the default), so random_state is fixed to make the
# metrics printed below reproducible from run to run.
model = PassiveAggressiveClassifier(max_iter=50, random_state=42)
model.fit(X_train_tfidf, y_train)

# Predict labels for the held-out test split
y_pred = model.predict(X_test_tfidf)

# Evaluate against the true test labels
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9955456570155902
Confusion Matrix:
 [[4676   20]
 [  20 4264]]
Classification Report:
               precision    recall  f1-score   support

        FAKE       1.00      1.00      1.00      4696
        REAL       1.00      1.00      1.00      4284

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [8]:
def predict_news(text):
    """Classify a single piece of news text with the trained model.

    The text goes through ``clean_text`` - the same preprocessing applied to
    the training data - so the features seen at prediction time cannot drift
    from the ones the model was trained on.

    Args:
        text: Raw news text (headline and/or body).

    Returns:
        The predicted label for ``text``, i.e. one of the values the model was
        trained on.
    """
    cleaned_text = clean_text(text)
    # transform() expects an iterable of documents, hence the one-item list.
    text_tfidf = tfidf.transform([cleaned_text])
    prediction = model.predict(text_tfidf)
    return prediction[0]


In [9]:
# Smoke-test the predictor on one hand-written headline.
sample_news = (
    "Breaking news: Scientists discover a new planet in our solar system."
)
print("Prediction:", predict_news(text=sample_news))


Prediction: FAKE


In [11]:
# Widgets
import ipywidgets as widgets
from IPython.display import display

# Free-text box where the user pastes or types the article to classify.
news_input = widgets.Textarea(
    value='',
    placeholder='Type news here...',
    description='News:',
    layout=widgets.Layout(width='80%', height='100px')
)
predict_button = widgets.Button(description="Predict")
output = widgets.Output()

def on_button_click(b):
    """Classify the text currently in the box and show the result.

    Args:
        b: The button instance that was clicked (unused).
    """
    with output:
        output.clear_output()
        article = news_input.value
        # Blank input would be classified as an empty document, producing a
        # label that means nothing; ask for text instead.
        if not article.strip():
            print("Please enter some news text first.")
            return
        prediction = predict_news(article)
        print("Prediction:", prediction)

predict_button.on_click(on_button_click)
display(news_input, predict_button, output)
# Clear button
clear_button = widgets.Button(description="Clear")

def on_clear_click(b):
    """Reset the UI: empty the text box and remove any shown prediction.

    Args:
        b: The button instance that was clicked (unused).
    """
    news_input.value = ""  # clear text box
    output.clear_output()  # clear output

clear_button.on_click(on_clear_click)

display(clear_button)


Textarea(value='', description='News:', layout=Layout(height='100px', width='80%'), placeholder='Type news her…

Button(description='Predict', style=ButtonStyle())

Output()

Button(description='Clear', style=ButtonStyle())