In [35]:
!python -m spacy download en_core_web_sm -q

2023-04-07 22:16:27.779310: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [44]:
import pandas as pd

import spacy

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [45]:
# Creating the NLP Object
nlp = spacy.load("en_core_web_sm")

nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

## Loading the Dataset

In [25]:
from pathlib import Path
import zipfile


zip_path = Path("/content/fake_news.zip")
dest_dir = Path("/content")

if not dest_dir.is_file():
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        print(f"[INFO] Unzipping dataset `{zip_path}` to `{dest_dir}`...")
        zip_ref.extractall(dest_dir)

print(f"[INFO] Dataset succesfully downloaded to `{dest_dir}`..")

[INFO] Unzipping dataset `/content/fake_news.zip` to `/content`...
[INFO] Dataset succesfully downloaded to `/content`..


## Understanding the Dataset

In [26]:
df_true = pd.read_csv(dest_dir / "True.csv")

print(df_true.shape)
df_true.head(3)

(21417, 4)


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"


In [27]:
df_fake = pd.read_csv(dest_dir / "Fake.csv")

print(df_fake.shape)
df_fake.head(3)

(23481, 4)


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"


## Preprocessing the Dataset

In [31]:
df_true["label"] = df_true["text"].apply(lambda x: 0)
df_true = df_true[["text", "label"]].sample(min(df_true.shape[0], df_fake.shape[0]))

print(df_true.shape)
df_true.head(3)

(21417, 2)


Unnamed: 0,text,label
2583,ABOARD AIR FORCE ONE (Reuters) - U.S. Attorney...,0
19082,"RAQQA, Syria (Reuters) - U.S.-backed militias ...",0
11230,(Reuters) - Michigan would tap its $575 millio...,0


In [32]:
df_fake["label"] = df_fake["text"].apply(lambda x: 1)
df_fake = df_fake[["text", "label"]].sample(min(df_true.shape[0], df_fake.shape[0]))

print(df_fake.shape)
df_fake.head(3)

(21417, 2)


Unnamed: 0,text,label
3581,While Donald Trump may have bribed Carrier int...,1
21911,So a wannabe female bomber was a pre-school te...,1
2871,Donald Trump threw a petty hissy fit in the Wh...,1


In [33]:
df = pd.concat([df_true, df_fake], ignore_index=True)

df

Unnamed: 0,text,label
0,ABOARD AIR FORCE ONE (Reuters) - U.S. Attorney...,0
1,"RAQQA, Syria (Reuters) - U.S.-backed militias ...",0
2,(Reuters) - Michigan would tap its $575 millio...,0
3,WASHINGTON (Reuters) - U.S. House of Represent...,0
4,ABUJA (Reuters) - Nigeria has canceled its wee...,0
...,...,...
42829,In what can only be described as a proof of li...,1
42830,"While he s a lot wealthier, Donald Trump is no...",1
42831,The officials decision to keep information ci...,1
42832,"Jack Links, which is a very popular maker of b...",1


In [34]:
df.value_counts("label")

label
0    21417
1    21417
dtype: int64

## Creating the Training and Testing Sets (took 106 min)

In [42]:
test_prop = 0.2

x_train, x_test, y_train, y_test = train_test_split(
    df["text"].values,
    df["label"].values,
    test_size = test_prop,
    stratify=df["label"]
)

x_train_pre, x_test_pre, y_train_pre, y_test_pre = train_test_split(
    df["text"].apply(lambda x: " ".join(token.lemma_ for token in nlp(x) if (not token.is_stop) and (not token.is_punct))).values,
    df["label"].values,
    test_size = test_prop,
    stratify=df["label"]
)

print(len(x_train), len(y_train), len(x_test), len(y_test))
print(len(x_train_pre), len(y_train_pre), len(x_test_pre), len(y_test_pre))

34267 34267 8567 8567
34267 34267 8567 8567


## KNN (euclidean) Model

In [46]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 3))),
    ("KNN_euc", KNeighborsClassifier(n_neighbors=10, metric="euclidean"))
])

clf.fit(x_train, y_train)

print("KNN (euclidean):")
print(classification_report(y_test, clf.predict(x_test)))

KNN (euclidean):
              precision    recall  f1-score   support

           0       0.72      0.74      0.73      4284
           1       0.74      0.71      0.73      4283

    accuracy                           0.73      8567
   macro avg       0.73      0.73      0.73      8567
weighted avg       0.73      0.73      0.73      8567



In [47]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 3))),
    ("KNN_euc", KNeighborsClassifier(n_neighbors=10, metric="euclidean"))
])

clf.fit(x_train_pre, y_train_pre)

print("KNN (euclidean) Preprocessed:")
print(classification_report(y_test_pre, clf.predict(x_test_pre)))

KNN (euclidean) Preprocessed:
              precision    recall  f1-score   support

           0       0.82      0.38      0.52      4283
           1       0.60      0.92      0.72      4284

    accuracy                           0.65      8567
   macro avg       0.71      0.65      0.62      8567
weighted avg       0.71      0.65      0.62      8567



## KNN (cosine) Model

In [48]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 3))),
    ("KNN_cos", KNeighborsClassifier(n_neighbors=10, metric="cosine"))
])

clf.fit(x_train, y_train)

print("KNN (cosine):")
print(classification_report(y_test, clf.predict(x_test)))

KNN (cosine):
              precision    recall  f1-score   support

           0       0.88      0.47      0.62      4284
           1       0.64      0.93      0.76      4283

    accuracy                           0.70      8567
   macro avg       0.76      0.70      0.69      8567
weighted avg       0.76      0.70      0.69      8567



In [49]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 3))),
    ("KNN_cos", KNeighborsClassifier(n_neighbors=10, metric="cosine"))
])

clf.fit(x_train_pre, y_train_pre)

print("KNN (cosine) Preprocessed:")
print(classification_report(y_test_pre, clf.predict(x_test_pre)))

KNN (cosine) Preprocessed:
              precision    recall  f1-score   support

           0       0.78      0.99      0.87      4283
           1       0.99      0.72      0.83      4284

    accuracy                           0.86      8567
   macro avg       0.88      0.86      0.85      8567
weighted avg       0.88      0.86      0.85      8567



## Random Forest Model

In [50]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 3))),
    ("RFC", RandomForestClassifier())
])

clf.fit(x_train, y_train)

print("RFC:")
print(classification_report(y_test, clf.predict(x_test)))

RFC:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4284
           1       0.99      0.98      0.99      4283

    accuracy                           0.99      8567
   macro avg       0.99      0.99      0.99      8567
weighted avg       0.99      0.99      0.99      8567



In [51]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 3))),
    ("RFC", RandomForestClassifier())
])

clf.fit(x_train_pre, y_train_pre)

print("RFC Preprocessed:")
print(classification_report(y_test_pre, clf.predict(x_test_pre)))

RFC Preprocessed:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4283
           1       0.99      0.97      0.98      4284

    accuracy                           0.98      8567
   macro avg       0.98      0.98      0.98      8567
weighted avg       0.98      0.98      0.98      8567



## Naive Bayes Model

In [52]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 3))),
    ("Multi NB", MultinomialNB())
])

clf.fit(x_train, y_train)

print("NB:")
print(classification_report(y_test, clf.predict(x_test)))

NB:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      4284
           1       0.98      0.98      0.98      4283

    accuracy                           0.98      8567
   macro avg       0.98      0.98      0.98      8567
weighted avg       0.98      0.98      0.98      8567



In [53]:
clf = Pipeline([
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 3))),
    ("Multi NB", MultinomialNB())
])

clf.fit(x_train_pre, y_train_pre)

print("NB:")
print(classification_report(y_test_pre, clf.predict(x_test_pre)))

NB:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      4283
           1       0.98      0.95      0.96      4284

    accuracy                           0.97      8567
   macro avg       0.97      0.97      0.97      8567
weighted avg       0.97      0.97      0.97      8567

