In [1]:
# imports
import numpy as np
np.random.seed(9)
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim import downloader
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import strip_punctuation
from gensim.utils import simple_preprocess 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

In [2]:
# loading W2V-GoogleNews pretrained model
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
w2v_path = r'C:\Users\Rohit\Downloads\GoogleNews-vectors-negative300.bin'
# `limit` caps how many word vectors are read from the binary file.
model = KeyedVectors.load_word2vec_format(
    w2v_path,
    binary=True,
    limit=100000,
)

In [3]:
# loading dataset
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
dataset_path = r'C:\Users\Rohit\Downloads\fake_real_news_dataset.csv'
df = pd.read_csv(dataset_path)
# preview the first rows
df.head()

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,It's primary day in New York and front-runners...,REAL


In [4]:
# getting info of dataset: row count, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2422 entries, 0 to 2421
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2422 non-null   object
 1   label   2422 non-null   object
dtypes: object(2)
memory usage: 38.0+ KB


In [5]:
# checking whether dataset is imbalanced by counting the rows for each label value
df['label'].value_counts()

REAL    1212
FAKE    1210
Name: label, dtype: int64

In [6]:
# encoding target feature as integers: FAKE -> 0, REAL -> 1
label_to_id = {'FAKE': 0, 'REAL': 1}
df['label_encoded'] = df['label'].map(label_to_id)
# preview the frame with the new column
df.head()

Unnamed: 0,text,label,label_encoded
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0
2,U.S. Secretary of State John F. Kerry said Mon...,REAL,1
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0
4,It's primary day in New York and front-runners...,REAL,1


In [7]:
# removing punctuations
# As the preview shows, punctuation is replaced by spaces ("U.S." -> "U S"),
# while the raw text stays available in the 'text' column.
text_without_punctuation = df['text'].apply(strip_punctuation)
df['processed'] = text_without_punctuation
df.head()

Unnamed: 0,text,label,label_encoded,processed
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0,Daniel Greenfield a Shillman Journalism Fello...
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0,Google Pinterest Digg Linkedin Reddit Stumbleu...
2,U.S. Secretary of State John F. Kerry said Mon...,REAL,1,U S Secretary of State John F Kerry said Mon...
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0,— Kaydee King KaydeeKing November 9 2016 Th...
4,It's primary day in New York and front-runners...,REAL,1,It s primary day in New York and front runners...


In [8]:
# text preprocessing; lower casing, tokenization
# Each document string becomes a list of lowercase tokens.
# NOTE(review): the column is overwritten in place, so this cell presumably cannot be
# re-run on its own (simple_preprocess expects strings, not token lists) - re-run the
# punctuation cell first.
tokenized_text = df['processed'].apply(simple_preprocess)
df['processed'] = tokenized_text
df.head()

Unnamed: 0,text,label,label_encoded,processed
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0,"[daniel, greenfield, shillman, journalism, fel..."
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0,"[google, pinterest, digg, linkedin, reddit, st..."
2,U.S. Secretary of State John F. Kerry said Mon...,REAL,1,"[secretary, of, state, john, kerry, said, mond..."
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0,"[kaydee, king, kaydeeking, november, the, less..."
4,It's primary day in New York and front-runners...,REAL,1,"[it, primary, day, in, new, york, and, front, ..."


In [9]:
# removing stopwords
# The word list is stored under its own name: rebinding `stopwords` would shadow the
# imported NLTK corpus reader and make this cell fail when it is run a second time.
# A frozenset also makes the per-token membership test constant-time.
STOP_WORDS = frozenset(stopwords.words('english'))

def remove_stopwords(text):
    """Return the tokens of one document with English stopwords removed.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The tokens that are not in the NLTK English stopword list, in their
        original order.
    """
    return [word for word in text if word not in STOP_WORDS]

# the function is passed directly; no wrapping lambda is needed
df['processed'] = df['processed'].apply(remove_stopwords)
df.head()

Unnamed: 0,text,label,label_encoded,processed
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0,"[daniel, greenfield, shillman, journalism, fel..."
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0,"[google, pinterest, digg, linkedin, reddit, st..."
2,U.S. Secretary of State John F. Kerry said Mon...,REAL,1,"[secretary, state, john, kerry, said, monday, ..."
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0,"[kaydee, king, kaydeeking, november, lesson, t..."
4,It's primary day in New York and front-runners...,REAL,1,"[primary, day, new, york, front, runners, hill..."


In [10]:
# lemmatization
lemma = WordNetLemmatizer()

def lemmatization(text):
    """Lemmatize every token of one document.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        One lemmatized token per input token, in the same order.
    """
    return list(map(lemma.lemmatize, text))

# replace each token list by its lemmatized counterpart
df['processed'] = df['processed'].apply(lemmatization)
df.head()

Unnamed: 0,text,label,label_encoded,processed
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0,"[daniel, greenfield, shillman, journalism, fel..."
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0,"[google, pinterest, digg, linkedin, reddit, st..."
2,U.S. Secretary of State John F. Kerry said Mon...,REAL,1,"[secretary, state, john, kerry, said, monday, ..."
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0,"[kaydee, king, kaydeeking, november, lesson, t..."
4,It's primary day in New York and front-runners...,REAL,1,"[primary, day, new, york, front, runner, hilla..."


In [11]:
# getting mean vectors from pretrained model
# Each token list is reduced to one vector per document by the pretrained model.
document_vectors = df['processed'].apply(model.get_mean_vector)
df['vector'] = document_vectors
df.head()

Unnamed: 0,text,label,label_encoded,processed,vector
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0,"[daniel, greenfield, shillman, journalism, fel...","[0.026954504, 0.013023174, 0.008259787, 0.0193..."
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0,"[google, pinterest, digg, linkedin, reddit, st...","[0.018229537, 0.011773695, 0.009492101, 0.0252..."
2,U.S. Secretary of State John F. Kerry said Mon...,REAL,1,"[secretary, state, john, kerry, said, monday, ...","[0.00014865129, 0.016942285, 0.018758478, 0.01..."
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0,"[kaydee, king, kaydeeking, november, lesson, t...","[0.022834618, 0.0015494938, 0.01016998, 0.0222..."
4,It's primary day in New York and front-runners...,REAL,1,"[primary, day, new, york, front, runner, hilla...","[0.018750893, 0.018382516, 0.008258443, -0.004..."


In [12]:
# creating X & y
# y: the integer-encoded labels (pandas Series)
y = df['label_encoded']
# X: NumPy array with one element (the document vector) per row of the frame
X = df['vector'].to_numpy()

In [13]:
# splitting dataset into training, testing sets
# 20% of the rows are held out; stratifying on y keeps the label proportions
# the same in both sets, and the fixed random_state makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, shuffle=True, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [14]:
# checking shape of training sets
for description, features in (("Shape of X_train: ", X_train), ("Shape of X_test: ", X_test)):
    print(description, features.shape)

Shape of X_train:  (1937,)
Shape of X_test:  (485,)


In [15]:
# reshaping training sets
# np.stack joins the per-document vectors into a single 2-D array per set.
X_train, X_test = np.stack(X_train), np.stack(X_test)
for description, features in (
    ("Shape of X_train after reshaping: ", X_train),
    ("Shape of X_test after reshaping: ", X_test),
):
    print(description, features.shape)

Shape of X_train after reshaping:  (1937, 300)
Shape of X_test after reshaping:  (485, 300)


In [16]:
# model training
model = GradientBoostingClassifier(random_state=18)
model.fit(X_train, y_train)

In [17]:
# plotting clf report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87       242
           1       0.89      0.83      0.86       243

    accuracy                           0.86       485
   macro avg       0.86      0.86      0.86       485
weighted avg       0.86      0.86      0.86       485

