<a href="https://colab.research.google.com/github/Nsimaar99/Kaggle-Project/blob/master/Fake_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Importing necessary libraries for data manipulation, model training, evaluation, and text processing

import numpy as np  # For numerical computations and handling arrays
import pandas as pd  # For data manipulation and analysis, especially for handling tabular data

# Importing machine learning utilities from scikit-learn
from sklearn.model_selection import train_test_split  # For splitting the dataset into training and testing sets
from sklearn.linear_model import LogisticRegression  # Logistic Regression model for binary classification
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # For model evaluation metrics

import pickle  # For saving and loading Python objects like trained models

# Libraries for text preprocessing
import re  # Regular expressions for text cleaning (e.g., removing special characters)
import nltk  # Natural Language Toolkit for text processing tasks
from nltk.corpus import stopwords  # For removing common stopwords like "and", "the"
from nltk.stem.porter import PorterStemmer  # For stemming words (reducing words to their root form)

# Text vectorization technique to convert text data into numerical format
from sklearn.feature_extraction.text import TfidfVectorizer  # Converts text into TF-IDF feature vectors

# Visualization libraries
import seaborn as sns  # For data visualization, particularly heatmaps and plots
import matplotlib.pyplot as plt  # For plotting charts and graphs


In [6]:
# Download stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Printing the stopwords in english
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
# loading the dataset into pandas dataframe
news_dataset = pd.read_csv('/content/FakeNewsNet.csv')

In [10]:
news_dataset.shape

(23196, 5)

In [11]:
news_dataset.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [12]:
# Counting the number of missing values in the dataset
news_dataset.isnull().sum()

Unnamed: 0,0
title,0
news_url,330
source_domain,330
tweet_num,0
real,0


In [14]:
# Handling the missing values with null string
news_dataset = news_dataset.fillna('')

In [15]:
# Merging the source domain and news title
news_dataset['content'] = news_dataset['source_domain']+' '+news_dataset['title']

In [16]:
print(news_dataset['content'])

0        toofab.com Kandi Burruss Explodes Over Rape Ac...
1        www.today.com People's Choice Awards 2018: The...
2        www.etonline.com Sophia Bush Sends Sweet Birth...
3        www.dailymail.co.uk Colombian singer Maluma sp...
4        www.zerchoo.com Gossip Girl 10 Years Later: Ho...
                               ...                        
23191    www.express.co.uk Pippa Middleton wedding: In ...
23192    hollywoodlife.com Zayn Malik & Gigi Hadid’s Sh...
23193    www.justjared.com Jessica Chastain Recalls the...
23194    www.intouchweekly.com Tristan Thompson Feels "...
23195    www.billboard.com Kelly Clarkson Performs a Me...
Name: content, Length: 23196, dtype: object


In [18]:
# Seperating the data and label columns
X = news_dataset.drop(columns='real', axis=1)
Y = news_dataset['real']


In [24]:
# Stemming: Reducing a word to its root word
port_stem = PorterStemmer()

# Function to perform stemming and text preprocessing
def stemming(content):
    # Step 1: Remove non-alphabetical characters
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)

    # Step 2: Convert text to lowercase
    stemmed_content = stemmed_content.lower()

    # Step 3: Tokenize the text (split into individual words)
    stemmed_content = stemmed_content.split()

    # Step 4: Remove stopwords and apply stemming
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in stopwords.words('english')]

    # Step 5: Join the processed words back into a single string
    return ' '.join(stemmed_content)

In [25]:
# Applying the funtion to the content column
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [26]:
# Seperating the content column and the real column
X = news_dataset['content'].values
Y = news_dataset['real'].values

In [29]:
# Converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [30]:
print(X)

  (0, 59)	0.2597097788538541
  (0, 651)	0.30737565657585053
  (0, 1707)	0.38749399933436923
  (0, 2460)	0.0508218665886508
  (0, 4193)	0.35506164978733906
  (0, 5891)	0.26124620267491605
  (0, 6647)	0.38749399933436923
  (0, 10223)	0.302158398468634
  (0, 10276)	0.23128301802698967
  (0, 10569)	0.2543303205934995
  (0, 12980)	0.2946734912504806
  (0, 13681)	0.21108934405813046
  (1, 729)	0.3083771474680274
  (1, 1101)	0.33844294892026705
  (1, 1916)	0.3875096512688879
  (1, 2219)	0.42251489448694257
  (1, 2460)	0.08159478582438673
  (1, 7432)	0.33284454859410584
  (1, 9476)	0.23697414575567807
  (1, 10347)	0.37702288594702094
  (1, 12946)	0.36817044163275126
  (1, 14274)	0.10189043319535993
  (2, 1201)	0.19486226938688445
  (2, 1530)	0.3779018214766305
  (2, 1711)	0.3321987353251551
  :	:
  (23194, 4387)	0.2692478125672807
  (23194, 5800)	0.2632849856372537
  (23194, 6240)	0.28925922464908643
  (23194, 6661)	0.17479540394859372
  (23194, 6797)	0.28785051901050934
  (23194, 6994)	0.2638

In [69]:
# Splitting the training dataset and text dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [70]:
print(X.shape, X_train.shape, X_test.shape)

(23196, 14435) (18556, 14435) (4640, 14435)


In [71]:
print(X_test.shape)

(4640, 14435)


In [72]:
# Initialize the model
model = LogisticRegression()

# Train the model
model.fit(X_train, Y_train)


In [74]:
# Make predictions on the training data
Y_train_pred = model.predict(X_train)

# Calculate accuracy on the training set
train_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")

# Generate the classification report for training data
print("Training Classification Report:\n", classification_report(Y_train, Y_train_pred))

# Generate the confusion matrix for training data
print("Training Confusion Matrix:\n", confusion_matrix(Y_train, Y_train_pred))

Training Accuracy: 88.27%
Training Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.58      0.71      4604
           1       0.88      0.98      0.93     13952

    accuracy                           0.88     18556
   macro avg       0.89      0.78      0.82     18556
weighted avg       0.89      0.88      0.87     18556

Training Confusion Matrix:
 [[ 2686  1918]
 [  258 13694]]


In [75]:
# Make predictions on the test data
Y_test_pred = model.predict(X_test)

# Calculate accuracy on the test set
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Generate the classification report for test data
print("Test Classification Report:\n", classification_report(Y_test, Y_test_pred))

# Generate the confusion matrix for test data
print("Test Confusion Matrix:\n", confusion_matrix(Y_test, Y_test_pred))

Test Accuracy: 85.26%
Test Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.50      0.63      1151
           1       0.85      0.97      0.91      3489

    accuracy                           0.85      4640
   macro avg       0.85      0.73      0.77      4640
weighted avg       0.85      0.85      0.84      4640

Test Confusion Matrix:
 [[ 575  576]
 [ 108 3381]]


In [83]:
from scipy.sparse import csr_matrix

# Define your prediction function for a single news article
def predict_news(model, X_test, index):
    # Check if the index is valid
    if index < 0 or index >= X_test.shape[0]:
        return "Invalid index: Out of range."

    X_sample = X_test[index]  # Select the sample

    if isinstance(X_sample, csr_matrix):
        X_sample_dense = X_sample.toarray()  # Convert to dense if needed
    else:
        X_sample_dense = X_sample  # Assume it's already dense

    # Reshape the sample for prediction
    X_sample_reshaped = X_sample_dense.reshape(1, -1)
    prediction = model.predict(X_sample_reshaped)  # Make the prediction

    # Print the prediction
    if prediction[0] == 0:
        return 'The news is Real'
    else:
        return 'The news is Fake'

result = predict_news(model, X_test, 5)
print(result)


The news is Fake
