In [4]:
# Download the `en_core_web_lg` spaCy package so that `spacy.load("en_core_web_lg")` can find it.
!python -m spacy download en_core_web_lg -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [70]:
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd
import spacy

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

## Initializing the NLP Object

In [35]:
# Load the `en_core_web_lg` pipeline; `nlp` is reused later to turn each article into a vector.
nlp = spacy.load("en_core_web_lg")

# Display the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

## Loading the Dataset

In [8]:
zip_path = Path("/content/fake_news.zip")
dest_dir = Path("/content")

# The CSV files that the preprocessing cell reads from `dest_dir`.
expected_files = [dest_dir / "True.csv", dest_dir / "Fake.csv"]

# Extract only when the CSVs are missing, so re-running the cell is cheap and idempotent.
# (Testing `dest_dir.is_file()` cannot work as a guard: `dest_dir` is a directory, so that
# check is always False and the archive would be re-extracted on every run.)
if not all(csv_path.is_file() for csv_path in expected_files):
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        print(f"[INFO] Unzipping dataset `{zip_path}` to `{dest_dir}`...")
        zip_ref.extractall(dest_dir)

print(f"[INFO] Dataset succesfully downloaded to `{dest_dir}`..")

[INFO] Unzipping dataset `/content/fake_news.zip` to `/content`...
[INFO] Dataset succesfully downloaded to `/content`..


## Preprocessing the Dataset

In [44]:
# Embedding every article with spaCy is slow, so only a random subsample of each class is used.
SAMPLES_PER_CLASS = 2000

# A fixed `random_state` makes the subsample -- and every result derived from it --
# reproducible on "Restart & Run All". Only the article text is kept, and the label is a
# constant per source file: 0 for rows from True.csv (real news), 1 for rows from Fake.csv.
df_true = (
    pd.read_csv(dest_dir / "True.csv")
    .sample(n=SAMPLES_PER_CLASS, random_state=42)
    .loc[:, ["text"]]
    .assign(label=0)
)
df_fake = (
    pd.read_csv(dest_dir / "Fake.csv")
    .sample(n=SAMPLES_PER_CLASS, random_state=42)
    .loc[:, ["text"]]
    .assign(label=1)
)

# Stack both classes into a single frame with a fresh 0..n-1 index.
df = pd.concat([df_true, df_fake], ignore_index=True)

df.head(3)

Unnamed: 0,text,label
0,PARIS (Reuters) - France warned Iraq s Kurdish...,0
1,VIENNA (Reuters) - The United States is pushin...,0
2,WASHINGTON (Reuters) - Britain s top internal ...,0


In [45]:
# Count the rows per label in the real-news sample (sanity check on the sample size).
print(f"Real News:\n{df_true.value_counts('label')}")

Real News:
label
0    2000
dtype: int64


In [46]:
# Count the rows per label in the fake-news sample (sanity check on the sample size).
print(f"Fake News:\n{df_fake.value_counts('label')}")

Fake News:
label
1    2000
dtype: int64


In [47]:
# Count the rows per label in the combined frame to confirm that the classes are balanced.
print(f"News: \n{df.value_counts('label')}")

News: 
label
0    2000
1    2000
dtype: int64


## Converting Text Tokens into Vectors

In [48]:
# Represent each article by its `Doc.vector`, which spaCy documents as defaulting to the
# average of the token vectors.
# `nlp.pipe` processes the texts as a batched stream instead of one `nlp(...)` call per row.
# The trained components (tagger, parser, NER, ...) are disabled because only the tokenizer
# and the static word vectors of `en_core_web_lg` are needed for the document vector, and
# running them is what makes this step slow.
df["text_embedding"] = [
    doc.vector for doc in nlp.pipe(df["text"], disable=nlp.pipe_names)
]

In [50]:
# Preview the first rows, which now include the `text_embedding` column.
df.head(3)

Unnamed: 0,text,label,text_embedding
0,PARIS (Reuters) - France warned Iraq s Kurdish...,0,"[-2.4732926, 0.6465163, -1.6472088, 0.88335335..."
1,VIENNA (Reuters) - The United States is pushin...,0,"[-1.409202, 1.269841, -1.9191968, 0.56275773, ..."
2,WASHINGTON (Reuters) - Britain s top internal ...,0,"[-1.530828, 0.027151464, -1.8907039, 0.5134016..."


## Splitting the Dataset into Training and Testing Sets

In [58]:
# Fraction of the samples held out for evaluation.
test_prop = 0.2

# Stratify on the label so that both splits keep the same class balance, and fix
# `random_state` so that the split (and the metrics reported below) are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df["text_embedding"].values,
    df["label"].values,
    test_size=test_prop,
    stratify=df["label"],
    random_state=42,
)

print(len(x_train), len(y_train), len(x_test), len(y_test))

3200 3200 800 800


In [59]:
# Inspect the shape of the test features as returned by the split.
x_test.shape

(800,)

In [64]:
# Look at the first three test samples to see how the embeddings are stored.
x_test[:3]

array([array([-1.6990374 ,  0.18641396, -1.4847157 ,  1.1159774 ,  4.6885962 ,
               0.3971445 ,  0.2959907 ,  3.0855658 ,  0.26250097, -1.0339226 ,
               4.5892897 ,  1.3528904 , -3.0136588 ,  0.06644724,  0.23824485,
               2.157219  ,  1.2216797 ,  1.1496644 , -0.7026303 , -1.3353778 ,
               1.4537326 , -1.0766957 , -1.2396569 ,  0.17876521,  0.9904647 ,
              -1.3598564 , -2.1152465 , -1.0778198 , -0.01128398,  0.25166872,
               0.9824257 ,  0.17000568, -0.49195516, -1.5938494 , -2.719854  ,
              -1.3060285 , -1.0875258 ,  1.0797911 , -0.17999567,  0.25881505,
               0.03923338, -0.00992599, -0.3811454 ,  0.2707873 , -1.4720716 ,
               0.71864104, -0.11944588, -1.863133  ,  0.20205991,  1.5312563 ,
              -1.5083288 ,  1.5934111 ,  0.27874517, -4.0916333 , -0.14249617,
               0.8220975 ,  0.24579127,  0.27426612,  0.51664245,  0.14226678,
              -0.17370103, -1.1802952 ,  0.21101147,

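As you can see, the format of the inputs needs correcting: `x_test` is a 1-D array of shape `(800,)` whose elements are themselves arrays, whereas scikit-learn estimators expect a single 2-D array of shape `(n_samples, n_features)`.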

In [69]:
# Join the per-sample embedding arrays into a single 2-D array (one row per sample):
# https://numpy.org/doc/stable/reference/generated/numpy.stack.html
x_train = np.stack(x_train)
x_test = np.stack(x_test)

# Preview the first three rows of the stacked test features.
x_test[:3]

array([[-1.69903743e+00,  1.86413959e-01, -1.48471570e+00,
         1.11597741e+00,  4.68859625e+00,  3.97144496e-01,
         2.95990705e-01,  3.08556581e+00,  2.62500972e-01,
        -1.03392255e+00,  4.58928967e+00,  1.35289037e+00,
        -3.01365876e+00,  6.64472356e-02,  2.38244846e-01,
         2.15721893e+00,  1.22167969e+00,  1.14966440e+00,
        -7.02630281e-01, -1.33537781e+00,  1.45373261e+00,
        -1.07669568e+00, -1.23965693e+00,  1.78765208e-01,
         9.90464687e-01, -1.35985637e+00, -2.11524653e+00,
        -1.07781982e+00, -1.12839779e-02,  2.51668721e-01,
         9.82425690e-01,  1.70005679e-01, -4.91955161e-01,
        -1.59384942e+00, -2.71985412e+00, -1.30602849e+00,
        -1.08752584e+00,  1.07979107e+00, -1.79995671e-01,
         2.58815050e-01,  3.92333791e-02, -9.92598943e-03,
        -3.81145388e-01,  2.70787299e-01, -1.47207165e+00,
         7.18641043e-01, -1.19445875e-01, -1.86313295e+00,
         2.02059910e-01,  1.53125632e+00, -1.50832880e+0

## Naive Bayes

The multinomial Naive Bayes model (`MultinomialNB`) doesn't accept negative inputs, but many elements of the embeddings are negative. To use this model, we first need to rescale the embeddings into a non-negative range. One way to do that is with `MinMaxScaler`: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [73]:
# Learn the per-feature min/max on the training split only, then apply that same
# scaling to both splits so that no information from the test set leaks into the fit.
scaler = MinMaxScaler()
scaler.fit(x_train)

x_train_transformed = scaler.transform(x_train)
x_test_transformed = scaler.transform(x_test)

In [75]:
# Preview the first three scaled training vectors.
x_train_transformed[:3]

array([[0.18352261, 0.5127424 , 0.20652477, 0.70823985, 0.8019486 ,
        0.41418612, 0.31868377, 0.84970164, 0.7667904 , 0.40330094,
        0.69066554, 0.6691419 , 0.29070067, 0.6059663 , 0.2962703 ,
        0.71566564, 0.2981282 , 0.500262  , 0.27463293, 0.36874685,
        0.71196973, 0.45798212, 0.3742614 , 0.6350716 , 0.635614  ,
        0.29736978, 0.37792507, 0.34352607, 0.48819393, 0.34191057,
        0.6465447 , 0.5131383 , 0.51963025, 0.36597848, 0.20970091,
        0.60689956, 0.26103795, 0.52191126, 0.39699003, 0.2754802 ,
        0.3336203 , 0.5989372 , 0.7099486 , 0.3908512 , 0.53400004,
        0.56066906, 0.49912584, 0.55829114, 0.34140828, 0.80085087,
        0.2715939 , 0.6860648 , 0.37225515, 0.34896326, 0.5528092 ,
        0.53106487, 0.7104519 , 0.37570125, 0.45207506, 0.38405535,
        0.31356955, 0.2496711 , 0.7448099 , 0.31895572, 0.8013603 ,
        0.5878582 , 0.55752957, 0.4616373 , 0.5603721 , 0.5447116 ,
        0.74368924, 0.26407415, 0.3646064 , 0.53

In [76]:
# Fit a multinomial Naive Bayes classifier on the scaled training embeddings.
clf_nb = MultinomialNB().fit(x_train_transformed, y_train)

# Score it on the held-out split.
nb_test_pred = clf_nb.predict(x_test_transformed)
print(classification_report(y_test, nb_test_pred))

              precision    recall  f1-score   support

           0       0.82      0.93      0.87       400
           1       0.92      0.80      0.86       400

    accuracy                           0.87       800
   macro avg       0.87      0.87      0.87       800
weighted avg       0.87      0.87      0.87       800



## KNN

Because similar words have similar vector representations, articles with similar content end up close together in the embedding space, so KNN should work well for this task.

In [79]:
# Fit a 5-nearest-neighbours classifier (Euclidean distance) on the scaled training embeddings.
clf_knn = KNeighborsClassifier(metric="euclidean", n_neighbors=5).fit(
    x_train_transformed, y_train
)

# Score it on the held-out split.
knn_test_pred = clf_knn.predict(x_test_transformed)
print(classification_report(y_test, knn_test_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95       400
           1       0.96      0.93      0.94       400

    accuracy                           0.94       800
   macro avg       0.95      0.95      0.94       800
weighted avg       0.95      0.94      0.94       800



## Random Forest

In [78]:
# Random forests are randomized (bootstrap samples and feature subsets), so fix
# `random_state` to make the fitted model and the reported metrics reproducible.
clf_rf = RandomForestClassifier(random_state=42)

clf_rf.fit(x_train_transformed, y_train)

# Score it on the held-out split.
print(classification_report(y_test, clf_rf.predict(x_test_transformed)))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95       400
           1       0.96      0.93      0.95       400

    accuracy                           0.95       800
   macro avg       0.95      0.95      0.95       800
weighted avg       0.95      0.95      0.95       800

