In [41]:
import pandas as pd
import numpy as np
import re
#NLP kit
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
#for training and testing data
from sklearn.model_selection import train_test_split
#multiple models
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1.Data Gathering

In [3]:
df = pd.read_csv("News_dataset.csv")
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


# 2. Data Analysis

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [5]:
df['label'].value_counts()

label
1    10413
0    10387
Name: count, dtype: int64

In [6]:
df.shape

(20800, 5)

In [7]:
df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [9]:
#Drop null columns since its text
df=df.dropna()
df.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [11]:
df.shape

(18285, 5)

In [12]:
df.reset_index(inplace=True)
df.head()

Unnamed: 0,index,id,title,author,text,label
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [13]:
df['title'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [14]:
df = df.drop(['id','text','author'],axis = 1)
df.head()

Unnamed: 0,index,title,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,2,Why the Truth Might Get You Fired,1
3,3,15 Civilians Killed In Single US Airstrike Hav...,1
4,4,Iranian woman jailed for fictional unpublished...,1


# 3.Data Preprocessing

# 1. Tokenization

In [16]:
sample_data = 'The quick brown fox jumps over the lazy dog'
sample_data = sample_data.split()
sample_data

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

# 2. Make Lowercase

In [17]:
sample_data = [data.lower() for data in sample_data]
sample_data

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

# 3. Stopwords

In [18]:
stopwords = stopwords.words('english')
print(stopwords[0:10])
print(len(stopwords))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']
198


In [19]:
sample_data = [data for data in sample_data if data not in stopwords]
print(sample_data)
len(sample_data)

['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']


6

# 4.Stemming

In [20]:
ps = PorterStemmer()
sample_data_stemming = [ps.stem(data) for data in sample_data]
print(sample_data_stemming)

['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']


# 5.Lemmatization

In [21]:
lm = WordNetLemmatizer()
sample_data_lemma = [lm.lemmatize(data) for data in sample_data]
print(sample_data_lemma)

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


In [33]:
corpus = []
for i in range(len(df)):
    review = re.sub('^a-zA-Z0-9',' ',df['title'][i])
    review = review.lower()
    review = review.split()
    review = [lm.lemmatize(x) for x in review if x not in stopwords]
    review = " ".join(review)
    corpus.append(review)

In [28]:
len(corpus)

18285

In [34]:
df['title'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [35]:
corpus[0]

'house dem aide: didn’t even see comey’s letter jason chaffetz tweeted'

# 4.Vectorization

In [36]:
#Converting text data into the vector
tf = TfidfVectorizer()
x = tf.fit_transform(corpus).toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
y= df['label']
y.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

# Data splitting into the train and test

In [48]:
x_train, x_test, y_train, y_test = train_test_split(df['title'],y, test_size = 0.3, random_state = 10, stratify = y )

In [57]:
len(x_train),len(y_train)

(12799, 12799)

In [58]:
len(x_test), len(y_test)

(5486, 5486)

# 5. Model Building

In [47]:
# Ensure the 'text' column contains strings
df['title'] = df['title'].astype(str)

In [42]:
# Initialize a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

In [49]:
# Fit and transform the training data
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)

# Transform the test data
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [51]:
# Initialize and train the RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train_tfidf, y_train)

In [52]:
# Predict on the test set
y_pred = rf.predict(x_test_tfidf)

In [54]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 93.78%
Confusion Matrix:
[[2836  273]
 [  68 2309]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.91      0.94      3109
           1       0.89      0.97      0.93      2377

    accuracy                           0.94      5486
   macro avg       0.94      0.94      0.94      5486
weighted avg       0.94      0.94      0.94      5486



In [55]:
# Define a function to train and evaluate models
def train_and_evaluate_model(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    print(f"Model: {model.__class__.__name__}")
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)
    print("-" * 50)


In [56]:
# Initialize models
models = [
    RandomForestClassifier(),
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    MultinomialNB(),
    XGBClassifier(use_label_encoder=False, eval_metric='logloss')
]

# Train and evaluate each model
for model in models:
    train_and_evaluate_model(model, x_train_tfidf, y_train, x_test_tfidf, y_test)

Model: RandomForestClassifier
Accuracy: 93.77%
Confusion Matrix:
[[2828  281]
 [  61 2316]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.91      0.94      3109
           1       0.89      0.97      0.93      2377

    accuracy                           0.94      5486
   macro avg       0.94      0.94      0.94      5486
weighted avg       0.94      0.94      0.94      5486

--------------------------------------------------
Model: LogisticRegression
Accuracy: 92.76%
Confusion Matrix:
[[2762  347]
 [  50 2327]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.89      0.93      3109
           1       0.87      0.98      0.92      2377

    accuracy                           0.93      5486
   macro avg       0.93      0.93      0.93      5486
weighted avg       0.93      0.93      0.93      5486

--------------------------------------------------
Model: SVC
Accurac

Parameters: { "use_label_encoder" } are not used.



Model: XGBClassifier
Accuracy: 92.73%
Confusion Matrix:
[[2788  321]
 [  78 2299]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.90      0.93      3109
           1       0.88      0.97      0.92      2377

    accuracy                           0.93      5486
   macro avg       0.93      0.93      0.93      5486
weighted avg       0.93      0.93      0.93      5486

--------------------------------------------------


# Predicton Pipeline

In [83]:
from sklearn.pipeline import Pipeline

# Create a pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.7)),  # TF-IDF Vectorizer
    ('classifier', RandomForestClassifier())  # Random Forest Classifier
])

# Train the pipeline
pipeline.fit(x_train, y_train)

In [84]:
import joblib

# Save the pipeline to a file
joblib.dump(pipeline, 'fake_news_classifier_pipeline.pkl')

['fake_news_classifier_pipeline.pkl']

In [87]:
# Load the pipeline from the file
loaded_pipeline = joblib.load('fake_news_classifier_pipeline.pkl')

# New data for prediction
new_data = [
    "FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart",
    "Scientists have discovered a new planet that could support life.",
    "Andhra Pradesh's Chief Minister is Y.S Jagan Mohan Reddy in 2025"
]

# Make predictions
predictions = loaded_pipeline.predict(new_data)

# Map predictions to labels
prediction_labels = ["The news is fake" if pred == 0 else "The news is real" for pred in predictions]

# Print the results
for text, label in zip(new_data, prediction_labels):
    print(f"Text: {text}\nPrediction: {label}\n")

Text: FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart
Prediction: The news is fake

Text: Scientists have discovered a new planet that could support life.
Prediction: The news is real

Text: Andhra Pradesh's Chief Minister is Y.S Jagan Mohan Reddy in 2025
Prediction: The news is real

