In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [2]:
# Download NLTK resources 
try:
    stopwords.words('english')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')

In [3]:
# Load original datasets
df_fake = pd.read_csv('Fake.csv')
df_true_original = pd.read_csv('True.csv')

# Load the new, diverse BBC dataset
df_bbc = pd.read_csv('BBC News Train.csv')

print("Original Fake News:", df_fake.shape)
print("Original Real News:", df_true_original.shape)
print("New BBC Real News:", df_bbc.shape)

Original Fake News: (23481, 4)
Original Real News: (21417, 4)
New BBC Real News: (1490, 3)


In [4]:
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
df_true_original.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [6]:
df_bbc.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [7]:
# Create a consistent 'content' column for all dataframes
df_fake['content'] = df_fake['title'] + ' ' + df_fake['text']
df_true_original['content'] = df_true_original['title'] + ' ' + df_true_original['text']
# The BBC dataset only has a 'Text' column, which we'll use as the content
df_bbc['content'] = df_bbc['Text'] 

In [8]:
# Add labels: 0 for Fake, 1 for Real
df_fake['label'] = 0
df_true_original['label'] = 1
df_bbc['label'] = 1 # All BBC articles are real news

In [9]:
# Combine all "true" news sources
df_true_combined = pd.concat([df_true_original[['content', 'label']], df_bbc[['content', 'label']]], ignore_index=True)

In [10]:
# Combine the final true and fake datasets
df_combined = pd.concat([df_fake[['content', 'label']], df_true_combined], ignore_index=True)

In [11]:
df_combined.dropna(subset=['content'], inplace=True)
df_combined.drop_duplicates(subset=['content'], inplace=True)

In [12]:
print("Combined Data Shape:", df_combined.shape)

Combined Data Shape: (40545, 2)


## BALANCE THE DATASET

In [13]:
print("\nValue counts before balancing:")
print(df_combined['label'].value_counts())


Value counts before balancing:
label
1    22637
0    17908
Name: count, dtype: int64


In [14]:
# Separate into fake and real
df_fake_final = df_combined[df_combined['label'] == 0]
df_true_final = df_combined[df_combined['label'] == 1]

In [15]:
# Sample the 'real' articles to match the number of 'fake' articles
df_true_sampled = df_true_final.sample(n=len(df_fake_final), random_state=42)

In [16]:
# Create the final, balanced dataset
df_balanced = pd.concat([df_fake_final, df_true_sampled], ignore_index=True)

In [17]:
# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [18]:
print("\n--- Data Balanced ---")
print("Final Balanced Data Shape:", df_balanced.shape)
print("Value counts after balancing:")
print(df_balanced['label'].value_counts())


--- Data Balanced ---
Final Balanced Data Shape: (35816, 2)
Value counts after balancing:
label
1    17908
0    17908
Name: count, dtype: int64


## PREPROCESSING, TRAINING

In [19]:
port_stem = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [20]:
def preprocess_text(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', str(content)).lower()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content.split() if not word in stop_words]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [21]:
df_balanced['content_processed'] = df_balanced['content'].apply(preprocess_text)
print("Preprocessing Complete.")

Preprocessing Complete.


In [22]:
# Vectorization and Training
X = df_balanced['content_processed'].values
y = df_balanced['label'].values

vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, stratify=y, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [24]:
# Evaluation
y_pred = model.predict(X_test)
print("\n--- New Model Evaluation ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


--- New Model Evaluation ---
Accuracy: 0.9837
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      3582
           1       0.98      0.99      0.98      3582

    accuracy                           0.98      7164
   macro avg       0.98      0.98      0.98      7164
weighted avg       0.98      0.98      0.98      7164



In [25]:
# Save the model and vectorizer
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)