In [None]:
# downloading spacy model
!python -m spacy download 'en_core_web_lg'

In [2]:
# imports
import pandas as pd
import numpy as np
import spacy
from gensim.parsing.preprocessing import strip_punctuation
from gensim.utils import deaccent
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# loading spacy model
# `nlp` is the shared spaCy pipeline used by the later preprocessing and vector
# cells; the 'en_core_web_lg' package must already be installed (download cell above).
nlp = spacy.load('en_core_web_lg')

In [4]:
# loading data: the news CSV is fetched straight from GitHub
DATA_URL = (
    'https://raw.githubusercontent.com/Rohit-Rannavre/Data-Science-2023/main/'
    'Beginner%20Data%20Science%20Projects/fake_real_news_dataset.csv'
)
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,It's primary day in New York and front-runners...,REAL


In [5]:
# getting info of data: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2422 entries, 0 to 2421
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2422 non-null   object
 1   label   2422 non-null   object
dtypes: object(2)
memory usage: 38.0+ KB


In [6]:
# looking at data: full text of the article stored under index label 1
df.loc[1, 'text']

'Google Pinterest Digg Linkedin Reddit Stumbleupon Print Delicious Pocket Tumblr \nThere are two fundamental truths in this world: Paul Ryan desperately wants to be president. And Paul Ryan will never be president. Today proved it. \nIn a particularly staggering example of political cowardice, Paul Ryan re-re-re-reversed course and announced that he was back on the Trump Train after all. This was an aboutface from where he was a few weeks ago. He had previously declared he would not be supporting or defending Trump after a tape was made public in which Trump bragged about assaulting women. Suddenly, Ryan was appearing at a pro-Trump rally and boldly declaring that he already sent in his vote to make him President of the United States. It was a surreal moment. The figurehead of the Republican Party dosed himself in gasoline, got up on a stage on a chilly afternoon in Wisconsin, and lit a match. . @SpeakerRyan says he voted for @realDonaldTrump : “Republicans, it is time to come home” ht

In [7]:
# getting value count of target feature (shows how balanced the two classes are)
df['label'].value_counts()

REAL    1212
FAKE    1210
Name: label, dtype: int64

In [8]:
# mapping values in target feature: FAKE -> 0, REAL -> 1
LABEL_MAP = {'FAKE': 0, 'REAL': 1}
# Series.map turns every value that is missing from the dict into NaN, so a plain
# re-run of this cell on already-encoded labels would wipe the whole column.
# Values that did not match the dict are therefore kept as they were, and the
# final cast fails loudly if anything other than FAKE/REAL/0/1 is present.
encoded_label = df['label'].map(LABEL_MAP)
df['label'] = encoded_label.where(encoded_label.notna(), df['label']).astype('int64')
df['label'].value_counts()

1    1212
0    1210
Name: label, dtype: int64

In [9]:
# data preprocessing; punctuation removal, deaccent
# Both cleaners are applied in a single pass per article: punctuation is stripped
# first, then accents are removed from what remains.
df['text'] = df['text'].map(lambda raw_text: deaccent(strip_punctuation(raw_text)))
df.head()

Unnamed: 0,text,label
0,Daniel Greenfield a Shillman Journalism Fello...,0
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,U S Secretary of State John F Kerry said Mon...,1
3,— Kaydee King KaydeeKing November 9 2016 Th...,0
4,It s primary day in New York and front runners...,1


In [10]:
# data preprocessing; punctuations if any, extra spaces, digits, lemmatization
def preprocess(text):
    """Return `text` as a single-space-joined string of token lemmas.

    The text is parsed with the module-level spaCy pipeline `nlp`; tokens flagged
    as punctuation, whitespace or digits are left out of the result.
    """
    def is_noise(tok):
        # a token is dropped as soon as any one of the three flags is set
        return tok.is_punct or tok.is_space or tok.is_digit

    lemmas = [tok.lemma_ for tok in nlp(text) if not is_noise(tok)]
    return " ".join(lemmas)

In [11]:
# lemmatization: run every cleaned article through preprocess(), keeping row order
df['processed_text'] = [preprocess(article) for article in df['text']]
df.head()

Unnamed: 0,text,label,processed_text
0,Daniel Greenfield a Shillman Journalism Fello...,0,Daniel Greenfield a Shillman Journalism Fellow...
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Google Pinterest Digg Linkedin Reddit Stumbleu...
2,U S Secretary of State John F Kerry said Mon...,1,U S Secretary of State John F Kerry say Monday...
3,— Kaydee King KaydeeKing November 9 2016 Th...,0,Kaydee King KaydeeKing November the lesson fro...
4,It s primary day in New York and front runners...,1,it s primary day in New York and front runner ...


In [12]:
# getting spacy document vectors, one per article
# nlp.pipe streams the texts through the pipeline in batches, which spaCy
# recommends over calling nlp() once per text; each doc.vector is unchanged.
# NOTE(review): the earlier comment labelled these "GloVe" vectors — which
# embeddings ship with en_core_web_lg depends on the installed model version; confirm.
df['vector'] = [doc.vector for doc in nlp.pipe(df['processed_text'])]
df.head()

Unnamed: 0,text,label,processed_text,vector
0,Daniel Greenfield a Shillman Journalism Fello...,0,Daniel Greenfield a Shillman Journalism Fellow...,"[-1.8394325, 1.9709957, -2.3779461, 0.77577513..."
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Google Pinterest Digg Linkedin Reddit Stumbleu...,"[-1.8273462, 1.986787, -2.36353, 0.902818, 3.4..."
2,U S Secretary of State John F Kerry said Mon...,1,U S Secretary of State John F Kerry say Monday...,"[-1.9870148, 1.7825794, -2.0632582, 0.5251661,..."
3,— Kaydee King KaydeeKing November 9 2016 Th...,0,Kaydee King KaydeeKing November the lesson fro...,"[-1.9716038, 1.3209466, -1.9036548, 0.77425534..."
4,It s primary day in New York and front runners...,1,it s primary day in New York and front runner ...,"[-1.9667718, 2.1365762, -2.5525568, 1.3276621,..."


In [13]:
# creating X & y
# X is still a Series holding one vector object per row at this point; it is
# stacked into a 2-D array only after the train/test split. y is the label column.
X = df['vector']
y = df['label']

In [14]:
# splitting data into training, testing sets
# 20% of the rows are held out, stratified on y, with a fixed seed for the shuffle
split_options = dict(test_size=0.2, stratify=y, random_state=18)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [15]:
# checking shape of training, testing sets
split_parts = {
    'X_train: ': X_train,
    'X_test: ': X_test,
    'y_train: ': y_train,
    'y_test: ': y_test,
}
for caption, part in split_parts.items():
    print(caption, part.shape)

X_train:  (1937,)
X_test:  (485,)
y_train:  (1937,)
y_test:  (485,)


In [16]:
# reshaping training, testing sets
# np.stack turns each Series of per-row vectors into one 2-D array (rows x dims)
X_train, X_test = (np.stack(part) for part in (X_train, X_test))
print('X_train after reshaping: ', X_train.shape)
print('X_test after reshaping: ', X_test.shape)

X_train after reshaping:  (1937, 300)
X_test after reshaping:  (485, 300)


In [17]:
# model training
# XGBoost classifier with library-default hyperparameters; only random_state is
# pinned. The fit call is the cell's last expression, so the fitted estimator
# is also what the cell displays.
model = XGBClassifier(random_state=20)
model.fit(X_train, y_train)

In [18]:
# model evaluation: accuracy on the held-out test set, shown as a percentage
test_predictions = model.predict(X_test)
accuracy_pct = accuracy_score(y_test, test_predictions) * 100
print('Accuracy: ', round(accuracy_pct, 2))

Accuracy:  82.68


In [19]:
# printing clf report: per-class precision, recall and f1 on the test set
clf_report = classification_report(y_test, model.predict(X_test))
print(clf_report)

              precision    recall  f1-score   support

           0       0.86      0.78      0.82       242
           1       0.80      0.88      0.84       243

    accuracy                           0.83       485
   macro avg       0.83      0.83      0.83       485
weighted avg       0.83      0.83      0.83       485

