In [3]:
#importing necessary libraries

import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import re
import string


In [5]:
# assigning fake and true news

data_fake = pd.read_csv('news_fake_scrapped_data.csv')
data_true = pd.read_csv('news_true_scrapped_data.csv', encoding='ISO-8859-1')

In [6]:
# few fake news data

data_fake.head()

Unnamed: 0,Category,Headline,Content,Published Date,Label
0,Business,Indian Economy Crosses 5 Trillion Lenskart Cre...,"According to the IMF, India has become the wor...",15-09-2022,0
1,Business,Man Arrested for Selling Oximeter as Cloth Cli...,"In a bizarre case in Pakistan's Punjab, a man ...",23-04-2022,0
2,Business,An MBA College Shuts down After all the Colleg...,Surprised and impressed with the quantity of t...,25-04-2022,0
3,Business,Elon Musk Asks Twitter Employees their Pronoun...,A day after Tesla CEO Elon Musk bought Twitter...,26-04-2022,0
4,Business,"Coca-Cola Refutes Elon Musk's Allegation, Clai...",Elon Musk had triggered a frenzy with his twee...,03-05-2022,0


In [7]:
# few true news data

data_true.head()

Unnamed: 0,Category,Headline,Content,Published Date,Label
0,Business,"BHEL gets 1,600 MW thermal power project from ...",The coal-based unit will be set up in the Kode...,"August 12, 2024",1
1,Business,Stock markets marginally up in volatile trade ...,"From the Sensex pack, Tata Motors, Sun Pharmac...","August 23, 2024",1
2,Business,"Majority blue-collar jobs pay less than Rs 20,...",Over 57.63% of blue-collar jobs fall within th...,"August 17, 2024",1
3,Business,ICICI Bank denies allegations that it paid sal...,The bank reiterated that all payments made to ...,"September 02, 2024",1
4,Business,Classic Legends unveils Jawa 42 FJ at ?1.99 lakh,We are creating a global company and will disr...,"September 03, 2024",1


In [8]:
# number of values in the dataset

data_fake.shape, data_true.shape

((7000, 5), (7000, 5))

In [9]:
# merging fake and true dataset

data_merge = pd.concat([data_fake , data_true], axis=0)
data_merge.head(10)

Unnamed: 0,Category,Headline,Content,Published Date,Label
0,Business,Indian Economy Crosses 5 Trillion Lenskart Cre...,"According to the IMF, India has become the wor...",15-09-2022,0
1,Business,Man Arrested for Selling Oximeter as Cloth Cli...,"In a bizarre case in Pakistan's Punjab, a man ...",23-04-2022,0
2,Business,An MBA College Shuts down After all the Colleg...,Surprised and impressed with the quantity of t...,25-04-2022,0
3,Business,Elon Musk Asks Twitter Employees their Pronoun...,A day after Tesla CEO Elon Musk bought Twitter...,26-04-2022,0
4,Business,"Coca-Cola Refutes Elon Musk's Allegation, Clai...",Elon Musk had triggered a frenzy with his twee...,03-05-2022,0
5,Business,Elon Musk Offers to Buy Taj Mahal to Find Out ...,In one of the world's biggest deals in the wor...,10-05-2022,0
6,Business,Ola Driver Asks Customer's CIBIL Score Before ...,Never a pleasant experience to have a cab driv...,16-05-2022,0
7,Business,Next Cred Ad To Show Bear Grylls Having Jain T...,The script of next Cred TV Ad campaign has lea...,23-05-2022,0
8,Business,SpiceJet to Start Road Services by Removing Pl...,"On Tuesday, SpiceJet's Delhi-Dubai flight was ...",05-07-2022,0
9,Business,Investor Uses magicpin Reward Points to buy Eq...,"On Friday, magicpin, India's largest platform ...",26-08-2022,0


In [10]:
# columns in merged dataset

data_merge.columns

Index(['Category', 'Headline', 'Content', 'Published Date', 'Label'], dtype='object')

In [11]:
# removing unwated columns from merged dataset

data = data_merge.drop(['Headline','Category','Published Date'], axis=1)

In [12]:
# printing dataset

data

Unnamed: 0,Content,Label
0,"According to the IMF, India has become the wor...",0
1,"In a bizarre case in Pakistan's Punjab, a man ...",0
2,Surprised and impressed with the quantity of t...,0
3,A day after Tesla CEO Elon Musk bought Twitter...,0
4,Elon Musk had triggered a frenzy with his twee...,0
...,...,...
6995,COPENHAGEN (Reuters) - Demands from the nation...,1
6996,PARIS (Reuters) - France s government on Tuesd...,1
6997,MOSCOW (Reuters) - Former militants from band...,1
6998,THE HAGUE (Reuters) - The daughter of an Irani...,1


In [13]:
#checking null values

data.isnull().sum()

Content    0
Label      0
dtype: int64

In [14]:
# shuffling the data to remove biasness

data = data.sample(frac=1)

In [15]:
# printing shuffled data from starting

data.head()

Unnamed: 0,Content,Label
6559,SANTIAGO (Reuters) - Billionaire former Presid...,1
4947,"(Reuters) - Michael Flynn, President Donald Tr...",1
3348,President Trump greeted the NHL s Stanley Cup ...,0
3060,Content not found,1
4663,Donald Trump has been extremely active on Twit...,0


In [16]:
# printing shuffled data from end

data.tail()

Unnamed: 0,Content,Label
2613,Surjewala said the partys MLAs and the high c...,1
4720,WASHINGTON (Reuters) - The top Republican in t...,1
1744,We re not sure why Fox News host Sean Hannity ...,0
6709,President Trump s administration has told the ...,0
5578,WASHINGTON (Reuters) - The U.S. attorneyâs o...,1


In [17]:
# removing the shuffled index column

data.reset_index(inplace=True)
data.drop(['index'],axis=1, inplace=True)

In [18]:
# printing columns in dataset after removing

data.columns

Index(['Content', 'Label'], dtype='object')

In [19]:
# printing shuffled dataset after giving proper index

data.head()

Unnamed: 0,Content,Label
0,SANTIAGO (Reuters) - Billionaire former Presid...,1
1,"(Reuters) - Michael Flynn, President Donald Tr...",1
2,President Trump greeted the NHL s Stanley Cup ...,0
3,Content not found,1
4,Donald Trump has been extremely active on Twit...,0


In [20]:
# data preprocessing

def wordopt(Content):
    
    # remove digits
    Content = re.sub(r'\d', '', Content)
    
    # convert text into lowercase
    Content = Content.lower()
    
    # Remove content within brackets
    Content = re.sub('\[.*?\]', '', Content)
    
    # Remove non-word characters
    Content = re.sub("\\W", " ", Content)
    
    # remove URLs
    Content = re.sub('https?://\S+|www\.\S+', '', Content)
    
    # remove HTML tags
    Content = re.sub('<.*?>+', '', Content)
    
    # remove punctuation
    Content = re.sub('[%s]'% re.escape(string.punctuation), '',Content)
    
    # remove newline characters
    Content = re.sub('\n', '', Content)
    
    # Remove words containing numbers
    Content = re.sub('\w*\d\w*', '',Content)
    
    return Content

In [21]:
# applying the preprocessing function to wordopt

data['Content'] = data['Content'].apply(wordopt)

In [22]:
# printing data

data

Unnamed: 0,Content,Label
0,santiago reuters billionaire former presid...,1
1,reuters michael flynn president donald tr...,1
2,president trump greeted the nhl s stanley cup ...,0
3,content not found,1
4,donald trump has been extremely active on twit...,0
...,...,...
13995,surjewala said the party s mlas and the high c...,1
13996,washington reuters the top republican in t...,1
13997,we re not sure why fox news host sean hannity ...,0
13998,president trump s administration has told the ...,0


In [23]:
# dividing the columns into x and y variables

x = data['Content']
y = data['Label']

In [24]:
# dividint the data into training and testing sets

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)

In [25]:
# number of data in training set

x_train.shape

(9800,)

In [26]:
# number of data in testing set

x_test.shape

(4200,)

In [27]:
# applying feature extraction
# Define the TF-IDF Vectorizer with n-gram range

tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

In [28]:
# Fit and transform the training data

X_train_tfidf = tfidf_vectorizer.fit_transform(x_train)

In [29]:
# Transform the test data using the fitted vectorizer

X_test_tfidf = tfidf_vectorizer.transform(x_test)

In [30]:
# Convert the TF-IDF transformed training data to a DataFrame for better visualization

X_train_tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


In [31]:
# Print the first few rows of the DataFrame

print(X_train_tfidf_df.head())

   abc  abc news  ability  ability to  able  able to  abortion     about  \
0  0.0       0.0      0.0         0.0   0.0      0.0       0.0  0.033626   
1  0.0       0.0      0.0         0.0   0.0      0.0       0.0  0.022776   
2  0.0       0.0      0.0         0.0   0.0      0.0       0.0  0.072713   
3  0.0       0.0      0.0         0.0   0.0      0.0       0.0  0.000000   
4  0.0       0.0      0.0         0.0   0.0      0.0       0.0  0.000000   

   about her  about his  ...  you were  you will  young      your  yourself  \
0        0.0        0.0  ...       0.0       0.0    0.0  0.000000  0.000000   
1        0.0        0.0  ...       0.0       0.0    0.0  0.036967  0.060259   
2        0.0        0.0  ...       0.0       0.0    0.0  0.000000  0.000000   
3        0.0        0.0  ...       0.0       0.0    0.0  0.000000  0.000000   
4        0.0        0.0  ...       0.0       0.0    0.0  0.000000  0.000000   

   youtube  youtube com  zero  zone  zuma  
0      0.0          0.0 

In [32]:
# Save the DataFrame to a CSV file

X_train_tfidf_df.to_csv('tfidf_output_ngram.csv', index=False)

In [33]:
# model builiding
# using Logistic Regression
# Initialize the Logistic Regression model

LR = LogisticRegression()

# Train the model on the training data
LR.fit(X_train_tfidf,y_train)

In [34]:
# Predict the labels for the test data

pred_lr = LR.predict(X_test_tfidf)

In [35]:
# Evaluate the model's performance 

LR.score(X_test_tfidf,y_test)

0.9121428571428571

In [36]:
# Generate a classification report

print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.95      0.87      0.91      2102
           1       0.88      0.95      0.92      2098

    accuracy                           0.91      4200
   macro avg       0.91      0.91      0.91      4200
weighted avg       0.91      0.91      0.91      4200



In [37]:
# model builiding
# using Decision Tree Classifier
# Initialize the Decision Tree model

DT = DecisionTreeClassifier()

# Train the model on the training data
DT.fit(X_train_tfidf, y_train)

In [38]:
# Predict the labels for the test data

pred_dt = DT.predict(X_test_tfidf)

In [39]:
# Evaluate the model's performance

DT.score(X_test_tfidf,y_test)

0.9004761904761904

In [40]:
# Generate a classification report

print(classification_report(y_test,pred_dt))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      2102
           1       0.89      0.91      0.90      2098

    accuracy                           0.90      4200
   macro avg       0.90      0.90      0.90      4200
weighted avg       0.90      0.90      0.90      4200



In [41]:
# model builiding
# using Random Forest
# Initialize the Random Forest model

RF = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
RF.fit(X_train_tfidf, y_train)

In [42]:
# Predict the labels for the test data

pred_rf = RF.predict(X_test_tfidf)

In [43]:
# Evaluate the model's performance

RF.score(X_test_tfidf, y_test)

0.9280952380952381

In [44]:
# Generate a classification report

print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.98      0.87      0.92      2102
           1       0.88      0.99      0.93      2098

    accuracy                           0.93      4200
   macro avg       0.93      0.93      0.93      4200
weighted avg       0.93      0.93      0.93      4200



In [45]:
# function that checks whether the entered news is true or fake

def output_label(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "True News"
    
def manual_testing(news):
    testing_news = {"text" : [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt)
    new_x_test = new_def_test["text"]
    new_xv_test = tfidf_vectorizer.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_RF = RF.predict(new_xv_test)
    
    return print("\n\nLR Prediction: {} \nDT Prediction: {} \nRF Prediction: {}".format(output_label(pred_LR[0]),output_label(pred_DT[0]),output_label(pred_RF[0])))

In [57]:
# checking the input

news = str(input())
manual_testing(news)

B. Sai Sudharsan is one cricketer whose stock has risen meteorically in the last few years. The Tamil Nadu left-hander, who made his India debut last year, represented India-C in the recently concluded Duleep Trophy. In a chat at the Rural Development Trust (RDT) Stadium B-ground in Anantapur recently, he spoke at length about his career and ambitions.


LR Prediction: True News 
DT Prediction: Fake News 
RF Prediction: True News
