# Fake News Classification



In [21]:
# Basic libraries
import nltk
import pandas as pd
import csv
import re
import string
import gensim
# NLTK utils
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Bag of words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Classification stuff
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer

And download these if we haven't already:

In [22]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tobys\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tobys\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tobys\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Util function for part of speech tagging for lemmatisation:

In [23]:
# Function originally from: https://www.programcreek.com/python/?CodeExample=get%20wordnet%20pos
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

Now lets load and look at our data:

In [24]:
df = pd.read_csv('news.csv')
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


Delete the columns `Unnamed`

In [25]:
# Drop the unnecessary columns
df = df.drop('Unnamed: 0', axis=1)
df

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [26]:
# Create a new column with the news title and text
df['news'] = df['title'] + ' ' + df['text']
df

Unnamed: 0,title,text,label,news
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,You Can Smell Hillary’s Fear Daniel Greenfield...
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,Watch The Exact Moment Paul Ryan Committed Pol...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,Kerry to go to Paris in gesture of sympathy U....
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,Bernie supporters on Twitter erupt in anger ag...
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,The Battle of New York: Why This Primary Matte...
...,...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL,State Department says it can't find emails fro...
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE,Anti-Trump Protesters Are Tools of the Oligarc...
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL,"In Ethiopia, Obama seeks progress on peace, se..."


Because the dataset is so big, thus, I select half of dataset to test

In [None]:
# # Selecting half of the dataset
# df = df.iloc[ : (len(df)//2)]
# df

In [28]:
# Inorder to analyze the data, and check fake or real news, I make fake is 0 and real is 1
df['label'] = df['label'].replace(['FAKE'], 0)
df['label'] = df['label'].replace(['REAL'], 1)
df

Unnamed: 0,title,text,label,news
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,You Can Smell Hillary’s Fear Daniel Greenfield...
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Watch The Exact Moment Paul Ryan Committed Pol...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,Kerry to go to Paris in gesture of sympathy U....
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,Bernie supporters on Twitter erupt in anger ag...
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,The Battle of New York: Why This Primary Matte...
...,...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,1,State Department says it can't find emails fro...
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,0,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,0,Anti-Trump Protesters Are Tools of the Oligarc...
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",1,"In Ethiopia, Obama seeks progress on peace, se..."


#### Stop words

In [29]:
#remove English stop words
def remove_stopwords(news):
    # get a list of stop words from nltk and create a list of custom stopwords
    stop_words = stopwords.words('english')
    # convert the news to lowercase and split
    news = news.lower()
    news = news.split()
    # create a list of words from the text excluding the stop words
    return " ".join([word for word in news if word not in stop_words]) 

In [30]:
# Create a new column to store the filtered text
df['filtered_news'] = df['news'].apply(remove_stopwords)
df


Unnamed: 0,title,text,label,news,filtered_news
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,You Can Smell Hillary’s Fear Daniel Greenfield...,"smell hillary’s fear daniel greenfield, shillm..."
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Watch The Exact Moment Paul Ryan Committed Pol...,watch exact moment paul ryan committed politic...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,Kerry to go to Paris in gesture of sympathy U....,kerry go paris gesture sympathy u.s. secretary...
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,Bernie supporters on Twitter erupt in anger ag...,bernie supporters twitter erupt anger dnc: 'we...
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,The Battle of New York: Why This Primary Matte...,battle new york: primary matters primary day n...
...,...,...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,1,State Department says it can't find emails fro...,state department says can't find emails clinto...
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,0,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,‘p’ pbs stand ‘plutocratic’ ‘pentagon’ ‘p’ pbs...
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,0,Anti-Trump Protesters Are Tools of the Oligarc...,anti-trump protesters tools oligarchy : inform...
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",1,"In Ethiopia, Obama seeks progress on peace, se...","ethiopia, obama seeks progress peace, security..."


##### Split dataset

In [31]:
# Code from https://git.arts.ac.uk/tbroad/NLP-23-24/blob/main/Week-6-Classification/mb-classification-with-bow.ipynb
# Select 20% of the data for testing, and random state is 42
X_train, X_test, y_train, y_test = train_test_split(df['filtered_news'], df['label'], test_size=0.2, random_state=42)

In [32]:
X_train = X_train.apply(remove_stopwords)
X_test = X_test.apply(remove_stopwords)

#### Bag of words features

In [33]:
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)
# print(f'Our bag of words for the whole dataset is a matrix of the shape and size {x.shape}')

#### Train MultinomialNB classifier

In [34]:
# Code from https://git.arts.ac.uk/tbroad/NLP-23-24/blob/main/Week-6-Classification/mb-classification-with-bow.ipynb
classifier = MultinomialNB()
classifier.fit(X_train_bow.toarray(), y_train)

#### Test MultinomialNB classifier

In [35]:
# Predicting the Test set results
y_pred = classifier.predict(X_test_bow.toarray())

# Calculating Accuracy and creating Classification Report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.904498816101026
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.87      0.90       628
           1       0.88      0.93      0.91       639

    accuracy                           0.90      1267
   macro avg       0.91      0.90      0.90      1267
weighted avg       0.91      0.90      0.90      1267



#### Train SVC classifier

In [36]:
from sklearn.svm import SVC
# Code from https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
classifier = SVC(kernel='linear')
# Code from https://git.arts.ac.uk/tbroad/NLP-23-24/blob/main/Week-6-Classification/mb-classification-with-bow.ipynb
classifier.fit(X_train_bow.toarray(), y_train)

#### Test SVC classifier

In [37]:
# Code from https://git.arts.ac.uk/tbroad/NLP-23-24/blob/main/Week-6-Classification/mb-classification-with-bow.ipynb
y_pred = classifier.predict(X_test_bow.toarray())
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.9116022099447514
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       628
           1       0.91      0.92      0.91       639

    accuracy                           0.91      1267
   macro avg       0.91      0.91      0.91      1267
weighted avg       0.91      0.91      0.91      1267



#### Train logistic regression classifier

In [38]:
# Code from https://www.kaggle.com/code/gon213/fake-news-detect-by-any-classifier-acc-90
# Check the percentage of the label column
df['label'].value_counts(normalize=True)


label
1    0.500552
0    0.499448
Name: proportion, dtype: float64

In [39]:
from sklearn.linear_model import LogisticRegression
# Code from https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a
classifier = LogisticRegression()
# Code from https://git.arts.ac.uk/tbroad/NLP-23-24/blob/main/Week-6-Classification/mb-classification-with-bow.ipynb
classifier.fit(X_train_bow.toarray(), y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Test logistic regression classifier

In [40]:
# Code from https://git.arts.ac.uk/tbroad/NLP-23-24/blob/main/Week-6-Classification/mb-classification-with-bow.ipynb
y_pred = classifier.predict(X_test_bow.toarray())
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.9337016574585635
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       628
           1       0.94      0.93      0.93       639

    accuracy                           0.93      1267
   macro avg       0.93      0.93      0.93      1267
weighted avg       0.93      0.93      0.93      1267



### Citation List
#### Dataset
Kumar, R. (2023). Fake News Prediction Dataset. [online] www.kaggle.com. Available at: https://www.kaggle.com/datasets/rajatkumar30/fake-news [Accessed 5 Dec. 2023].

#### Code

Fiebrink, R. and Broad, T. (2023). Week 6-Classification. Build Software better, Together. [online] GitHub. Available at: https://git.arts.ac.uk/tbroad/NLP-23-24/blob/main/Week-6-Classification/mb-classification-with-bow.ipynb [Accessed 8 Dec. 2023].

Galarnyk, M. (2020). Logistic Regression Using Python (scikit-learn). [online] Medium. Available at: https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a.

scikit-learn developers (2019). sklearn.svm.SVC — scikit-learn 0.22 Documentation. [online] Scikit-learn.org. Available at: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html.

