In [85]:
# importing all the required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # used to convert text data into numerical features
# used to implement passive-aggressive algorithm, which is a classifier used for binary classification
from sklearn.linear_model import PassiveAggressiveClassifier 
# used for evaluating the performance of the model
from sklearn.metrics import accuracy_score, confusion_matrix



from sklearn.linear_model import LogisticRegression



In [86]:
# Read the news dataset into a dataframe
df = pd.read_csv("news.csv")

# Show the dataset dimensions (rows, columns).
# A bare `df.shape` in the middle of a cell is evaluated and thrown away -- only the
# last expression of a cell is displayed -- so the shape has to be printed explicitly.
print(df.shape)

# Preview the first 5 records (last expression, so the notebook renders it as a table)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [87]:
# Pull the target column ('label') out of the dataframe as its own Series
labels = df['label']
# Peek at the first few targets to confirm what was extracted
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [88]:
# Hold out part of the dataset for evaluation.
#   df['text']      -> feature set: the article text the model learns patterns from
#   labels          -> target variable extracted earlier, which the model will predict
#   test_size=0.2   -> 20% of the rows are reserved for testing, the remaining 80% for training
#   random_state=7  -> fixed seed, so the same split is produced every time this seed is used
#
# The function returns four subsets, unpacked in this order:
#   x_train -- 80% of the text data, used to train the model
#   x_test  -- 20% of the text data, used to test the model
#   y_train -- labels matching x_train
#   y_test  -- labels matching x_test
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    labels,
    test_size=0.2,
    random_state=7,
)


In [96]:
# Set up the TF-IDF vectorizer that turns raw text into numerical features
tfidf_vectorizer = TfidfVectorizer(
    # drop common English words such as 'and' or 'the'; they do not help separate the classes
    stop_words='english',
    # ignore terms that occur in more than 70% of the documents, since terms with a
    # very high document frequency carry little information
    max_df=0.7,
    # use both unigrams (single words) and bigrams (two consecutive words) as features
    ngram_range=(1, 2),
)

# Training set: fit_transform first learns the parameters (the vocabulary and the IDF
# score of each term) from x_train, stores them in the vectorizer object, and then
# converts x_train into its numerical representation.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)

# Test set: transform only applies what was learned above and learns nothing new.
# Terms of x_test that are in the vocabulary get their TF-IDF score computed with the
# IDF values obtained from the training data.
tfidf_test = tfidf_vectorizer.transform(x_test)



In [97]:
# Initializing a PassiveAggressiveClassifier
# max_iter=50 caps the number of passes over the training data.
# random_state is pinned (same seed as the train/test split) because the classifier
# shuffles the training data by default; without a seed the fitted model -- and
# therefore the accuracy printed below -- changes from one run to the next.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pac.fit(tfidf_train, y_train)

# Predicting on the test set and calculating the accuracy of the model
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy of the model: {round(score*100, 2)}%')

Accuracy of the model: 93.53%


In [101]:
# Train a logistic-regression classifier on the same TF-IDF features
model = LogisticRegression(
    C=1.0,
    max_iter=1000,
    random_state=7,
)
model.fit(tfidf_train, y_train)

# Predict the held-out articles and report the share of correct predictions
y_pred = model.predict(tfidf_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Confusion matrix with a fixed class order: first row/column is 'FAKE', second is 'REAL'
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=['FAKE', 'REAL'])
print(cm)

NameError: name 'LogisticRegression' is not defined

In [83]:
# Confusion matrix of the current predictions (y_pred) against the true test labels.
# It shows how many news items were correctly / incorrectly predicted as fake or real.
# With labels=['FAKE', 'REAL'] the first row holds the items labeled 'FAKE'
# and the second row the items labeled 'REAL'.
confusion_matrix(y_true=y_test, y_pred=y_pred, labels=['FAKE', 'REAL'])


array([[588,  50],
       [ 41, 588]])