The Fake News Recognition Model

In [3]:
import numpy as np #For performing numerical computations also to store data in arrays.
from flask import Flask, render_template, request #For creating web application
import pickle #For saving and loading the model
import requests #For fetching data from web
import traceback #For tracking errors
from bs4 import BeautifulSoup #For parsing HTML and XML documents
import pandas as pd #For performing data manipulaion, analysis, cleaning and others.
import re #For Regular Expression used for text manipulation, analysis.
from nltk.corpus import stopwords #Natural Language ToolKit
from nltk.stem.porter import PorterStemmer #Reducing words to their root form.
from sklearn.feature_extraction.text import TfidfVectorizer #Convert text to data into numerical data.
from sklearn.model_selection import train_test_split #split training and testing data
from sklearn.linear_model import LogisticRegression #For Classification
from sklearn.metrics import accuracy_score #Calculating Accuracy

In [2]:

# Stemmer shared by the text preprocessing code
port_stem = PorterStemmer()

# Restore the trained classifier and its TF-IDF vectorizer from disk.
# pickle.load can execute code embedded in the file, so only load artifacts from a trusted source.
# On any failure both names end up as None so later code can detect that nothing was loaded.
try:
    artifacts = []
    for artifact_path in ('models/fake_news_model.pkl', 'models/vectorizer.pkl'):
        with open(artifact_path, 'rb') as file:
            artifacts.append(pickle.load(file))
    model, vectorizer = artifacts
    print("Model and vectorizer loaded successfully")
except Exception as e:
    print(f"Error loading model or vectorizer: {e}")
    model = None
    vectorizer = None


Model and vectorizer loaded successfully


In [3]:
# Make sure the NLTK stopwords corpus read by stopwords.words() is available locally
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Print the English stopword list (the words that stemming() removes from the text)
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

*Data Pre-processing*

In [None]:
# Load the labelled training data into a pandas DataFrame and preview the first rows
news_dataset = pd.read_csv('train.csv')
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [6]:
# (number of rows, number of columns) of the loaded dataset
news_dataset.shape

(20800, 5)

In [7]:
# Per-column count of missing entries
news_dataset.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [8]:
# Replace every missing value with an empty string so that the string
# concatenation of author and title never yields NaN
news_dataset = news_dataset.fillna('')

In [9]:
# Build the text the model is trained on: "<author>: <title>" for every row
news_dataset['content'] = news_dataset['author']+': '+news_dataset['title']
print(news_dataset['content'])



0        Darrell Lucus: House Dem Aide: We Didn’t Even ...
1        Daniel J. Flynn: FLYNN: Hillary Clinton, Big W...
2        Consortiumnews.com: Why the Truth Might Get Yo...
3        Jessica Purkiss: 15 Civilians Killed In Single...
4        Howard Portnoy: Iranian woman jailed for ficti...
                               ...                        
20795    Jerome Hudson: Rapper T.I.: Trump a ’Poster Ch...
20796    Benjamin Hoffman: N.F.L. Playoffs: Schedule, M...
20797    Michael J. de la Merced and Rachel Abrams: Mac...
20798    Alex Ansary: NATO, Russia To Hold Parallel Exe...
20799             David Swanson: What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object


In [10]:
# Split the frame into features (every column except the label) and the label column
Y = news_dataset['label']
X = news_dataset.drop(columns='label')
print(X)
print(Y)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

Stemming:

Stemming is the process of reducing a word to its root form.

example: actor, actress, acting --> act

In [None]:
# Porter stemmer instance used by stemming() to reduce words to their root form.
# NOTE(review): port_stem is also created in the model-loading cell near the top of the
# notebook; one of the two assignments looks redundant — confirm before removing either.
port_stem = PorterStemmer()

# English stopwords, loaded on first use and then reused. The previous version called
# stopwords.words('english') once per word and searched the resulting list linearly,
# which dominated the running time when applied to the whole dataset.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stopwords as a frozenset, building it only on the first call."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def stemming(content):
    """Normalize a piece of text into a space-separated string of stemmed words.

    Steps:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. English stopwords are dropped;
      4. each remaining word is stemmed with ``port_stem``.

    Args:
        content: the raw text (str).

    Returns:
        The processed words joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    stop_words = _english_stopwords()

    # Keep letters only, normalize case, then tokenize on whitespace
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()

    # Set membership makes the stopword test constant-time per word
    return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)


In [12]:
# Run every "author: title" string through stemming(), element by element
news_dataset['content'] = news_dataset['content'].map(stemming)

In [13]:
# Inspect the content column after stemming
print(news_dataset['content'])

0        darrel lucus: hous dem aide: didn’t even see c...
1        daniel j. flynn: flynn: hillari clinton, big w...
2                  consortiumnews.com: truth might get fir
3        jessica purkiss: 15 civilian kill singl us air...
4        howard portnoy: iranian woman jail fiction unp...
                               ...                        
20795    jerom hudson: rapper t.i.: trump ’poster child...
20796    benjamin hoffman: n.f.l. playoffs: schedule, m...
20797    michael j. de la merc rachel abrams: macy’ sai...
20798    alex ansary: nato, russia hold parallel exerci...
20799                          david swanson: keep f-35 al
Name: content, Length: 20800, dtype: object


In [None]:
# Separate the data and the label: X is the preprocessed text column, Y the label column.
# This rebinds X, replacing the DataFrame assigned to it in the earlier separation cell.
X = news_dataset['content']
Y = news_dataset['label']
print(X)



In [None]:
# Inspect the label column
print(Y)

In [15]:
# Number of labels; should match the number of rows in the dataset
Y.shape

(20800,)

In [16]:
# Converting the textual data to numerical data using TF-IDF Vectorization
# TF-IDF (Term Frequency-Inverse Document Frequency) measures the importance of words in documents

# Create a TF-IDF vectorizer object
vectorizer = TfidfVectorizer()

# Fit and transform in a single call:
# - fitting analyzes all documents to build the vocabulary and calculate IDF values
# - transforming counts word frequencies (TF) per document and weights them by how
#   rare/common the words are across all documents (IDF)
# The result is a sparse matrix where each row is a document and each column is a word.
# X is rebound here from the text column to that matrix.
# NOTE(review): the vectorizer is fitted on the full dataset before the train/test split,
# so test documents contribute to the vocabulary and IDF values — confirm this is intended.
X = vectorizer.fit_transform(X)




Splitting the dataset to training & test data

In [17]:
# Hold out 20% of the samples for testing. stratify=Y asks for the same label
# proportions in both splits; random_state=2 fixes the shuffle so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

Training the Model: Logistic Regression

In [18]:
# Train a logistic-regression classifier (default settings) on the training split.
# This rebinds `model`, replacing whatever the loading cell put there.
model = LogisticRegression()
model.fit(X_train, Y_train)

Evaluation
accuracy score

In [19]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
# accuracy_score takes (y_true, y_pred); pass the arguments in that order, matching
# how the test-set evaluation calls it. The resulting score is the same either way.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [20]:
# Report the fraction of training samples the model classifies correctly
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9867788461538461


Making a Predictive System

In [21]:
# Classify one held-out sample: the row at position 10 of the test matrix
X_new = X_test[10]

prediction = model.predict(X_new)
print(prediction)

# Label 0 is reported as real news, anything else as fake
predicted_label = prediction[0]
print('The news is Real' if predicted_label == 0 else 'The news is Fake')

[0]
The news is Real


In [None]:
# True label of the sample predicted above (X_test[10]).
# Y_test keeps the original row labels of the shuffled split, so Y_test[0] would look up
# the row *labelled* 0 (a KeyError unless that row landed in the test split) and would not
# correspond to the predicted sample anyway; .iloc selects by position instead.
print(Y_test.iloc[10])


In [23]:
from sklearn.metrics import classification_report

# Score the model on the held-out test split
Y_pred = model.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class precision, recall, F1 and support
print(classification_report(Y_test, Y_pred))

Model Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      2077
           1       0.96      0.99      0.98      2083

    accuracy                           0.98      4160
   macro avg       0.98      0.98      0.98      4160
weighted avg       0.98      0.98      0.98      4160



Saving model for future use.

In [24]:
import os
import pickle

# The loading cell at the top of the notebook reads both artifacts from 'models/'.
# Save them there (creating the directory if needed) so that a freshly trained model
# is the one that gets loaded next time, instead of leaving it in the working directory.
MODEL_DIR = 'models'
os.makedirs(MODEL_DIR, exist_ok=True)

# Save the model
with open(os.path.join(MODEL_DIR, 'fake_news_model.pkl'), 'wb') as file:
    pickle.dump(model, file)

# Save the vectorizer (it must be the same fitted instance the model was trained with)
with open(os.path.join(MODEL_DIR, 'vectorizer.pkl'), 'wb') as file:
    pickle.dump(vectorizer, file)

Prediction function for new articles.

In [25]:
def predict_news(news_text):
    """Classify a piece of news text as real or fake.

    The text is preprocessed with ``stemming``, vectorized with the module-level
    ``vectorizer`` and classified with the module-level ``model``.

    Args:
        news_text: raw text of the article (str).

    Returns:
        "Real News" when the model predicts label 0, otherwise "Fake News".

    Raises:
        Exception: on any failure (model/vectorizer missing, text empty after
            preprocessing, or an error from the model). The message starts with
            "Error in prediction: " and the original exception is attached as
            ``__cause__``.
    """
    try:
        # Compare against None explicitly: an object's truthiness can depend on
        # __len__/__bool__, so "not model" is not a reliable "was it loaded?" test.
        if model is None or vectorizer is None:
            raise RuntimeError("Model or vectorizer not loaded")

        # Preprocess the text
        processed_text = stemming(news_text)
        if not processed_text:
            raise ValueError("Text preprocessing failed")

        # Transform using the saved vectorizer, then classify the single document
        text_vector = vectorizer.transform([processed_text])
        prediction = model.predict(text_vector)

        return "Real News" if prediction[0] == 0 else "Fake News"

    except Exception as e:
        # Chain the original exception so its type and traceback are not lost
        raise Exception(f"Error in prediction: {str(e)}") from e

Extracting text from URLs

In [None]:
def extract_text_from_url(url):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: address of the page to fetch.

    Returns:
        The text of all paragraphs, joined with single spaces.

    Raises:
        Exception: on any failure (request error, HTTP error status, or a page
            without paragraph text). The message starts with
            "Error extracting text from URL: " and the original exception is
            attached as ``__cause__``.
    """
    # NOTE(review): in this notebook the URL comes from a user-submitted form; consider
    # restricting which hosts the server is allowed to fetch.
    try:
        # Send request with a browser-like User-Agent and a 10 second timeout
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract text from paragraphs
        paragraphs = soup.find_all('p')
        text = ' '.join(p.get_text() for p in paragraphs)

        # Empty paragraphs still produce separator spaces, so test the stripped text:
        # whitespace alone is not usable content.
        if not text.strip():
            raise ValueError("No text content found in the URL")

        return text
    except Exception as e:
        # Chain the original exception so its type and traceback are not lost
        raise Exception(f"Error extracting text from URL: {str(e)}") from e

In [None]:
# Flask application object; the view functions below are registered on it
app = Flask(__name__)


@app.route('/')
def home():
    """Serve the landing page."""
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    """Handle the URL form: fetch the article, classify it and render the result.

    Reads the ``url`` form field. On success ``index.html`` is rendered with the
    URL, the prediction and a preview of at most 500 characters of the extracted
    text; on failure it is rendered with an ``error`` message instead.
    """
    # Bound before the try block so the error handler can always reference it
    url = None
    try:
        # Treat a missing or whitespace-only field as "no URL given"
        url = (request.form.get('url') or '').strip()
        if not url:
            return render_template('index.html', error="Please enter a URL")

        # Extract text from URL
        news_text = extract_text_from_url(url)

        # Make prediction
        result = predict_news(news_text)

        # Only add the ellipsis when the preview really is a truncation
        text_preview = news_text[:500]
        if len(news_text) > 500:
            text_preview += "..."

        return render_template('index.html', url=url, result=result, text_preview=text_preview)

    except Exception as e:
        # Keep the full traceback in the server log; the page only shows the message
        app.logger.exception("Prediction failed for URL %r", url)
        return render_template('index.html', url=url, error=str(e))

if __name__ == '__main__':
    # Explicit None checks: only a missing artifact should block start-up
    if model is not None and vectorizer is not None:
        # Debug mode is off on purpose: it starts Werkzeug's reloader, which tries to
        # re-launch the process and fails inside a notebook kernel, and it exposes the
        # interactive debugger, which lets anyone who can reach the server run code.
        # Pass debug=True only for local development from a plain script.
        app.run(debug=False)
    else:
        print("Application cannot start: Model or vectorizer not loaded")

In [None]:
# Load 'archive/fake.csv' into its own pandas DataFrame and preview the first rows
fake_dataset = pd.read_csv('archive/fake.csv')
fake_dataset.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
