In [31]:
# The starting point of this project was based on a tutorial from the DataFlair website, with modifications made to the dataset and model.


# importing all the required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # used to convert text data into numerical features
# used to implement passive-aggressive algorithm, which is a classifier used for binary classification
from sklearn.linear_model import PassiveAggressiveClassifier 
# used for evaluating the performance of the model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# using kaggle dataset
import kagglehub
import os



In [2]:
# Fetch the Kaggle "fake and real news" dataset; the returned path is the local
# location of the downloaded dataset files.
dataset_path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset")

# The two CSV files used here: one with the fake articles, one with the real articles.
fake_news_file, real_news_file = (
    os.path.join(dataset_path, csv_name) for csv_name in ("Fake.csv", "True.csv")
)

# Load each CSV into its own pandas dataframe.
fake_data, real_data = (
    pd.read_csv(csv_path) for csv_path in (fake_news_file, real_news_file)
)


In [24]:
# Tag every row with its class so the two sources stay distinguishable after merging.
fake_data['label'], real_data['label'] = 'FAKE', 'REAL'

# Stack both frames into one, shuffle all rows (frac=1 keeps every row, random_state=7
# makes the order repeatable) and renumber the index from 0.
df = (
    pd.concat([fake_data, real_data], ignore_index=True)
    .sample(frac=1, random_state=7)
    .reset_index(drop=True)
)

# Quick look at the first rows and the column names of the merged dataframe.
print(df.head(), df.columns, sep='\n')

# Target column (FAKE / REAL) used later as the prediction target.
labels = df['label']



                                               title  \
0  Senate backs massive increase in military spen...   
1   Anonymous GOP Reps Admit Clinton Will Win, Bl...   
2   ‘A Gimmick’: It Just Dawned On Republicans Th...   
3  USA's Tillerson and Saudi crown prince discuss...   
4  U.S. Senate confirms Acosta to head Labor Depa...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The U.S. Senate passed ...  politicsNews   
1  Republicans are already predicting doom and gl...          News   
2  While on the campaign trail, reality show star...          News   
3  RIYADH (Reuters) - U.S. Secretary of State Rex...     worldnews   
4  WASHINGTON (Reuters) - R. Alexander Acosta was...  politicsNews   

                  date label  
0  September 18, 2017   REAL  
1      August 11, 2016  FAKE  
2      January 7, 2017  FAKE  
3   November 20, 2017   REAL  
4      April 27, 2017   REAL  
Index(['title', 'text', 'subject', 'date', 'label'], dty

In [28]:
# Hold out 20% of the articles for testing; the remaining 80% is used for training.
# - df['text'] is the feature set: the article text the model learns patterns from
# - labels (extracted previously) is the target variable (fake or real news) to predict
# - test_size=0.2 sends 20% of the data to the test set
# - random_state=7 fixes the seed so the same split is produced on every run
# - stratify=labels keeps the FAKE/REAL proportions the same in both subsets
# The function returns four subsets, in this order:
#   x_train - text data for training the model
#   x_test  - text data for testing the model
#   y_train - labels corresponding to x_train
#   y_test  - labels corresponding to x_test
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    labels,
    test_size=0.2,
    random_state=7,
    stratify=labels,
)


In [29]:
# Setting up the TfidfVectorizer, which converts raw text into TF-IDF weighted numerical features
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',  # common English words like 'and' or 'the' are not used as features
    max_df=0.8,            # terms that appear in more than 80% of the documents are ignored
                           # (very frequent terms, e.g. "news", "article", are less informative)
    max_features=5000,     # keep only the 5000 most frequent terms across the corpus
    ngram_range=(1, 2),    # use both unigrams (single words) and bigrams (two consecutive words)
)

# Training set: fit_transform learns the vocabulary and the IDF weights from x_train,
# stores them in the vectorizer object, and returns x_train as a numerical matrix.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)

# Test set: transform learns nothing new. It reuses the vocabulary and IDF weights learned
# from the training data, so terms in x_test that are in the vocabulary are scored with the
# training IDF values (no information from the test set reaches the vectorizer).
tfidf_test = tfidf_vectorizer.transform(x_test)



In [30]:
# Initializing a PassiveAggressiveClassifier
# max_iter=50 caps the number of passes over the training data and C=0.1 is the
# regularization (maximum step size) parameter.
# random_state=7 is set because the classifier shuffles the training data between passes
# by default; without a fixed seed the fitted model, and therefore the accuracy printed
# below, can change from run to run. 7 matches the seed used for the shuffle and the split.
pac = PassiveAggressiveClassifier(max_iter=50, C=0.1, random_state=7)
pac.fit(tfidf_train, y_train)

# Predicting on the test set and calculating the accuracy of the model
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
# NOTE(review): in the sample rows printed earlier, the REAL articles start with a
# "(Reuters)" dateline - if that holds for most REAL rows, the very high accuracy may come
# from that marker rather than from the article content. TODO confirm.
print(f'Accuracy of the model: {round(score*100, 2)}%')

Accuracy of the model: 99.42%


In [32]:
# Confusion matrix: shows how many test articles were correctly / incorrectly classified.
# Passing the class order explicitly fixes the layout of the result:
#   rows    = true label      (first row 'FAKE', second row 'REAL')
#   columns = predicted label (first column 'FAKE', second column 'REAL')
class_order = ['FAKE', 'REAL']
confusion_matrix(y_test, y_pred, labels=class_order)


array([[4674,   22],
       [  30, 4254]])

In [33]:
# Per-class precision, recall and F1 on the test set.
# labels is passed explicitly so each entry of target_names is tied to a known class,
# instead of relying on the implicit sorted order of the labels found in the data;
# this also matches the class order used for the confusion matrix.
print(classification_report(y_test, y_pred, labels=['FAKE', 'REAL'], target_names=['FAKE', 'REAL']))


              precision    recall  f1-score   support

        FAKE       0.99      1.00      0.99      4696
        REAL       0.99      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

