This is a binary classification model that labels real news as 0 and fake news as 1.
The aim is to build a model that generalizes accurately to unseen articles.

This dataset contains two tables, Real and Fake, each with 4 columns (title, text, subject and date).
The label column needs to be added in order to distinguish fake(1) and real(0)

The date column is irrelevant for the process of classification

# Import and Check Dataset

In [None]:
import pandas as pd

In [None]:
#Load DS
fakeds = pd.read_csv('/content/Fake.csv')
realds = pd.read_csv('/content/True.csv')

In [None]:
#Label
# Tag every row with its class before the two tables are merged: fake -> 1, real -> 0
realds['label'] = 0
fakeds['label'] = 1

In [None]:
#Combine the datasets
df = pd.concat([fakeds,realds], ignore_index=True)

In [None]:
#Shuffle the dataset for random distribution of fake and real news across training and test sets.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
print("Dataset Shape : ", df.shape)
print("Columns : ", df.columns.tolist())
print("\nClass Distribution:\n", df['label'].value_counts())

Dataset Shape :  (44898, 5)
Columns :  ['title', 'text', 'subject', 'date', 'label']

Class Distribution:
 label
1    23481
0    21417
Name: count, dtype: int64


In [None]:
#Check for missing values
print("\nMissing values:\n", df.isnull().sum())
#Check for Duplicates
print("\nDuplicate entries:", df.duplicated().sum())


Missing values:
 title      0
text       0
subject    0
date       0
label      0
dtype: int64

Duplicate entries: 209


In [None]:
df.drop_duplicates(inplace=True)

In [None]:
print("\nDuplicate entries:", df.duplicated().sum())


Duplicate entries: 0


In [None]:
print("\nSample rows:\n", df[['title', 'text', 'label']].head())


Sample rows:
                                                title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text  label  
0  21st Century Wire says Ben Stein, reputable pr...      1  
1  WASHINGTON (Reuters) - U.S. President Donald T...      0  
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...      0  
3  On Monday, Donald Trump once again embarrassed...      1  
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...      0  


# Text Preprocessing

In [None]:
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
#Download NLTK resources
# Fetched in the same order as before: tokenizer data, stop-word list, WordNet
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)
#Initialize tools
# Both objects are module-level so preprocess_text can reuse them on every call
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
#Combine title and text
df['content'] = df['title'] + ' ' + df['text']

In [None]:
#Clean Text
def preprocess_text(text):
    """Normalize a raw text string into space-separated lemmatized tokens.

    Steps, in order: lowercase, strip URLs, strip HTML tags, drop every
    character that is not a-z or whitespace, tokenize with NLTK, remove
    English stop words and lemmatize what is left.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` created
    in the NLTK setup cell.

    NOTE(review): this function is duplicated in the inference and Streamlit
    cells of this notebook. The trained model expects identical cleaning at
    training and prediction time, so change all copies together.

    Args:
        text: Raw text to clean (a ``str``).

    Returns:
        The cleaned, lemmatized tokens joined into a single string with spaces.
    """
    text = text.lower()
    # Remove URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix counts as a URL; with an unescaped dot any token that
    # contained "www" followed by one more character was deleted as well.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove HTML tags (non-greedy, so each <...> pair is matched on its own)
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation and digits: only lowercase letters and whitespace survive
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    # Drop stop words first, then lemmatize the remaining tokens
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    )

In [None]:
#Apply cleaning to dataset
df['Clean_Content']= df['content'].apply(preprocess_text)

In [None]:
print(df[['content','Clean_Content']].sample(3))

                                                 content  \
31231  Sanders' TPP stance shot down by Democrats' pl...   
289    Italy government wins all five confidence vote...   
225    In U.S. presidential first, Trump prays at Jer...   

                                           Clean_Content  
31231  sander tpp stance shot democrat platform commi...  
289    italy government win five confidence vote elec...  
225    u presidential first trump prays jerusalem wes...  


# Feature Extraction

Transform preprocessed text into numerical feature vectors using
1) Bag of Words - Counts word occurrences
2) TF-IDF (Term Frequency - Inverse Document Frequency) - Emphasizes unique, informative words

Here we are using TF-IDF as it highlights more meaningful terms and penalizes overused and common words

1) Term Frequency - How often a word appears in a document
2) Inverse Document Frequency - Measures how rare a word is across documents (common words get a lower score and rare words get a higher score)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
#Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=8000)

In [None]:
X = tfidf.fit_transform(df['Clean_Content'])
#X will be a sparse matrix of shape (num_articles, num_words)

In [None]:
y = df['label']
#Contains the labels 0 for real 1 for fake

In [None]:
#Training and testing split
# Split the cleaned text first and fit TF-IDF on the training portion only.
# Fitting the vectorizer on the whole corpus lets vocabulary and IDF weights
# derived from the test articles leak into training, which makes the test
# score optimistic. With the same random_state and stratify=y the rows that
# land in each split are unchanged.
train_text, test_text, y_train, y_test = train_test_split(
    df['Clean_Content'], y, test_size=0.2, random_state=42, stratify=y)
#stratify=y: keeps the fake/real class ratio the same in the train and test sets
X_train = tfidf.fit_transform(train_text)  # learn vocabulary + IDF from the training text only
X_test = tfidf.transform(test_text)        # reuse the fitted vocabulary and IDF weights

# Training a classifier

### Logistic Regression (Baseline)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
#Checking training accuracy
y_train_pred = model.predict(X_train)
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))

Training Accuracy: 0.9923079074711197


In [None]:
y_pred = model.predict(X_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.98746923249049
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4242
           1       0.99      0.99      0.99      4696

    accuracy                           0.99      8938
   macro avg       0.99      0.99      0.99      8938
weighted avg       0.99      0.99      0.99      8938

[[4196   46]
 [  66 4630]]


The model has a 98.7% accuracy.  
False Positives (46): Real articles predicted as fake.
False Negatives (66): Fake articles predicted as real.

There is no overfitting as the training and testing accuracy are extremely close.

# Saving the model and vectorizer

In [None]:
import joblib

In [None]:
# Save the model
joblib.dump(model, 'fake_news_model.pkl')

# Save the vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

# Real Time News Detection


In [None]:
import joblib
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Make sure the NLTK data needed for tokenizing, stop-word removal and
# lemmatizing is present, then build the shared helpers.
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
#PreProcessing for the model
def preprocess_text(text):
    """Normalize a raw text string into space-separated lemmatized tokens.

    Steps, in order: lowercase, strip URLs, strip HTML tags, drop every
    character that is not a-z or whitespace, tokenize with NLTK, remove
    English stop words and lemmatize what is left.

    Relies on the module-level ``stop_words`` set and ``lemmatizer``.

    NOTE(review): this is a copy of the cleaning function used when the
    model was trained. The saved model expects identical cleaning, so keep
    every copy of this function in the notebook in sync.

    Args:
        text: Raw text to clean (a ``str``).

    Returns:
        The cleaned, lemmatized tokens joined into a single string with spaces.
    """
    text = text.lower()
    # Remove URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix counts as a URL; with an unescaped dot any token that
    # contained "www" followed by one more character was deleted as well.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove HTML tags (non-greedy, so each <...> pair is matched on its own)
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation and digits: only lowercase letters and whitespace survive
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    # Drop stop words first, then lemmatize the remaining tokens
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    )

In [None]:
#Load model and vectorizer
# Restore the persisted TF-IDF vectorizer and the classifier trained on its features
vectorizer = joblib.load('/content/tfidf_vectorizer.pkl')
model = joblib.load('/content/fake_news_model.pkl')

In [None]:
#Define Prediction Function
def predict_news(news_text):
    """Classify one piece of news text as real or fake.

    The text is cleaned with ``preprocess_text``, converted to a TF-IDF
    vector by the loaded ``vectorizer`` and scored by the loaded ``model``.

    Args:
        news_text: Raw news text to classify.

    Returns:
        "Real News 🟢" when the predicted label is 0, otherwise "Fake News 🔴".
    """
    cleaned = preprocess_text(news_text)
    # transform() takes a collection of documents, hence the one-item list
    features = vectorizer.transform([cleaned])
    label = model.predict(features)[0]
    return "Real News 🟢" if label == 0 else "Fake News 🔴"

In [None]:
#Sample
sample = "NEW YORK (Reuters) - Sweeping U.S. tax legislation appears to be on the verge of approval, lifting the prospects in particular for banks, telecoms, transports and other industries that stand to gain the most from lower corporate tax rates. The Republican-led U.S. House of Representatives hit a last-minute snag on Tuesday in their drive to approve the legislation favored by President Donald Trump. The plan on Capitol Hill was for the Senate to delete three offending provisions in the House version and vote on the bill, then send it back to the House for a vote on Wednesday. The bill slashes the corporate income tax rate to 21 percent from 35 percent. That would boost overall earnings for S&P 500 companies by 9.1 percent, according to UBS equity strategists. For an interactive graphic on how the bill ripples through industries: tmsnrt.rs/2kf26gx Momentum behind the tax bill over the past month has helped propel the stock market, which had already rallied sharply this year, to fresh record highs. The S&P 500 has climbed about 5 percent since mid-November when the House of Representatives passed its tax overhaul bill. But the bill, which also includes a one-time tax on profits held overseas and industry-specific measures, would benefit some stocks, industries and sectors more than others.  The industries that stand to benefit most from the lower rates are telecoms, transportation, retail and banks, analysts said. But for some groups, such as tech and healthcare, the impact is more mixed.  Domestically geared healthcare companies that focus on services are poised to benefit from the lower tax rate. Hospital operator Universal Health Services Inc, lab-testing company Quest Diagnostics Inc and drug wholesaler Cardinal Health Inc are among the service companies set to benefit the most, according to Mizuho Securities. 
â€œWe believe tax reform should be a significant positive cash flow event, especially for healthcare services companies that tend to have limited international exposure and significant capital expenditures,â€ Mizuho analysts said in a research note. While many large drugmakers already report adjusted tax rates in the low 20 percent range, a number of companies would benefit from the ability to bring back overseas cash, JPMorgan analyst Chris Schott said in a recent note. According to Schott, Pfizer Inc, with $160 billion in offshore earnings, and Merck & Co Inc, with $70 billion, are particularly poised to gain from repatriating overseas funds. Banks are expected to be among the biggest winners from a lower tax rate. The S&P 500 banks index has soared 9 percent since mid-November as the tax bill began moving swiftly through Congress.     Of the major S&P sectors, financials pay the highest effective tax rate at 27.5 percent, according to a Wells Fargo analysis of historical tax rates. Large U.S. banks will see an average 13 percent increase to earnings per share from the lower rate, according to Goldman Sachs analysts, with Wells Fargo & Co and PNC Financial Services Group having the biggest gains. Citizens Financial Group, Regions Financial Corp and M&T Bank Corp would see sizable earnings benefits and are also poised to be relative winners among large bank stocks, UBS analyst Saul Martinez said in a recent note. Banks could benefit indirectly if the tax bill provides an economic boost that spurs increased lending and higher interest rates. The technology sector, which had led the marketâ€™s rally for most of 2017, has underperformed the S&P 500 as the tax bill moved forward in Congress. Tech is expected to benefit less than most other sectors from a drop in the corporate rate, with an earnings boost of 5.3 percent, according to UBS. 
Semiconductors, whose shares have had a particularly rough ride in the past month, are expected to see earnings drop by 3.3 percent due to the overall bill, according to UBS. â€œMany chip companies have extensive international operations and relatively low blended tax rates,â€ Wells Fargo analysts said in a recent note. â€œWe see the possibility of changes in the U.S. tax rules as a potential risk for such companies.â€ One area where large tech companies could benefit is by spending cash held overseas for uses such as stock buybacks that boost earnings per share. UBS points to Cisco Systems Inc and Qualcomm Inc as companies that could see among the biggest buyback boosts. â€œThe tech sector would certainly be among the largest beneficiaries if cash stashed overseas can be repatriated at a low rate and presumably used for stock buybacks or dividends,â€ according to a recent note from Ed Yardeni, president of Yardeni Research.  "
print(predict_news(sample))

Real News 🟢


# Web App using Streamlit

In [None]:
pip install streamlit

In [None]:
import streamlit as st
import joblib
import re
import string

In [None]:
#Load model and vectorizer
@st.cache_resource
def load_artifacts():
    """Load the trained classifier and the fitted TF-IDF vectorizer from disk.

    Streamlit re-executes the script on every widget interaction; caching
    the loaded objects avoids re-reading both pickle files on each rerun.

    Returns:
        A ``(model, vectorizer)`` tuple.
    """
    return (
        joblib.load('/content/fake_news_model.pkl'),
        joblib.load('/content/tfidf_vectorizer.pkl'),
    )


# Same module-level names as before, so predict_news keeps working unchanged
model, vectorizer = load_artifacts()

In [None]:
#PreProcessing for the model
def preprocess_text(text):
    """Normalize a raw text string into space-separated lemmatized tokens.

    Steps, in order: lowercase, strip URLs, strip HTML tags, drop every
    character that is not a-z or whitespace, tokenize with NLTK, remove
    English stop words and lemmatize what is left.

    NOTE(review): this function uses ``nltk``, ``stop_words`` and
    ``lemmatizer``, but the Streamlit import cell only imports streamlit,
    joblib, re and string. It works in the notebook because earlier cells
    define those names; a standalone app.py would presumably need the NLTK
    imports and setup as well - TODO confirm.

    NOTE(review): this is a copy of the cleaning function used when the
    model was trained. The saved model expects identical cleaning, so keep
    every copy of this function in the notebook in sync.

    Args:
        text: Raw text to clean (a ``str``).

    Returns:
        The cleaned, lemmatized tokens joined into a single string with spaces.
    """
    text = text.lower()
    # Remove URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix counts as a URL; with an unescaped dot any token that
    # contained "www" followed by one more character was deleted as well.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove HTML tags (non-greedy, so each <...> pair is matched on its own)
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation and digits: only lowercase letters and whitespace survive
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    # Drop stop words first, then lemmatize the remaining tokens
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    )

In [None]:
#Define Prediction Function
def predict_news(news_text):
    """Classify one piece of news text as real or fake.

    The text is cleaned with ``preprocess_text``, converted to a TF-IDF
    vector by the loaded ``vectorizer`` and scored by the loaded ``model``.

    Args:
        news_text: Raw article or headline text entered by the user.

    Returns:
        "Real News 🟢" when the predicted label is 0, otherwise "Fake News 🔴".
    """
    cleaned = preprocess_text(news_text)
    # transform() takes a collection of documents, hence the one-item list
    features = vectorizer.transform([cleaned])
    label = model.predict(features)[0]
    return "Real News 🟢" if label == 0 else "Fake News 🔴"

In [None]:
#Streamlit UI
st.title("📰 Fake News Detection App")
st.write("Enter a news article or headline below:")

In [None]:
# Free-text box for the article or headline to classify
user_input = st.text_area("News Text")

# Classify only when the button is pressed; blank or whitespace-only input
# gets a warning instead of a prediction.
if st.button("Predict"):
    if not user_input.strip():
        st.warning("Please enter some news text first.")
    else:
        result = predict_news(user_input)
        st.subheader("Prediction:")
        st.success(result)

2025-05-12 07:48:40.868 Session state does not function when running a script without `streamlit run`


In [None]:
streamlit run app.py

SyntaxError: invalid syntax (<ipython-input-36-718866ff34b9>, line 1)