In [1]:
!unzip /content/Fake\ news\ dataset.zip

Archive:  /content/Fake news dataset.zip
  inflating: Fake.csv                
  inflating: True.csv                


In [2]:
import pandas as pd
import numpy as np

In [15]:
# Read the two source files; every row of a file belongs to the same class,
# so the label is attached as a constant column right at load time.
fake = pd.read_csv("/content/Fake.csv").assign(label=0)  # 0 = fake
true = pd.read_csv("/content/True.csv").assign(label=1)  # 1 = real

# Stack both classes into one frame with a fresh 0..n-1 index.
df = pd.concat([fake, true], ignore_index=True)

# Model input: headline and body joined by a single space, with missing
# values treated as empty strings.
title_part = df["title"].fillna("")
body_part = df["text"].fillna("")
df["text_all"] = title_part + " " + body_part

print("Class counts:\n", df["label"].value_counts())

Class counts:
 label
0    23481
1    21417
Name: count, dtype: int64


In [16]:
# Peek at the first five rows of the merged frame.
df.head(n=5)

Unnamed: 0,title,text,subject,date,label,text_all
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0,Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0,Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0,Pope Francis Just Called Out Donald Trump Dur...


In [20]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")

# Keep the negation word as a content token instead of a stop word.
# The stop flag lives on each vocabulary entry, and every casing of a word is
# its own entry, so clearing it for "not" alone would still let "Not" / "NOT"
# be filtered out downstream. Clear it for each casing we expect to see.
for negation in ("not", "Not", "NOT"):
    nlp.vocab[negation].is_stop = False

def clean_texts(docs, batch_size=100, n_process=2):
    """Normalise raw documents into space-joined, lower-cased lemmas.

    Each document is run through the module-level spaCy pipeline ``nlp``.
    A token is kept only if it is purely alphabetic and is not flagged as a
    stop word; every kept token is replaced by its lower-cased lemma.

    Args:
        docs: Iterable of text strings to clean.
        batch_size: Number of texts handed to ``nlp.pipe`` per batch.
        n_process: Number of worker processes ``nlp.pipe`` may use.

    Returns:
        list[str]: One cleaned string per document yielded by ``nlp.pipe``.
    """
    cleaned = []
    # nlp.pipe streams the texts through the pipeline in batches (optionally
    # across several processes) instead of calling nlp() once per document.
    for doc in nlp.pipe(docs, batch_size=batch_size, n_process=n_process):
        # is_alpha already excludes punctuation tokens, so no separate
        # is_punct test is needed.
        lemmas = [tok.lemma_.lower() for tok in doc if tok.is_alpha and not tok.is_stop]
        cleaned.append(" ".join(lemmas))
    return cleaned

In [21]:
# Clean the combined title + body text and store the result next to the raw
# columns, then preview the first five rows.
cleaned_docs = clean_texts(df["text_all"])
df["clean_text"] = cleaned_docs
df.head(n=5)

Unnamed: 0,title,text,subject,date,label,text_all,clean_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0,Donald Trump Sends Out Embarrassing New Year’...,donald trump send embarrassing new year eve me...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0,Drunk Bragging Trump Staffer Started Russian ...,drunk bragging trump staffer start russian col...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0,Sheriff David Clarke Becomes An Internet Joke...,sheriff david clarke internet joke threaten po...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0,Trump Is So Obsessed He Even Has Obama’s Name...,trump obsessed obama code website images chris...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0,Pope Francis Just Called Out Donald Trump Dur...,pope francis call donald trump christmas speec...


In [22]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# fake/real ratio the same in both partitions, and the fixed seed makes the
# split repeatable.
features = df["clean_text"]
labels = df["label"]
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
    stratify=labels,
)

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary is capped at 50,000 terms drawn from single words and adjacent
# word pairs; the bigrams add some local context on top of the unigrams.
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1, 2))

# Fit on the training split only, then reuse the fitted vectorizer unchanged
# on the held-out split.
Xtr = tfidf.fit_transform(x_train)  # vectorized x_train
Xte = tfidf.transform(x_test)  # vectorized x_test

In [26]:
from sklearn.svm import LinearSVC

# Seed the solver: when LinearSVC solves the dual problem it shuffles the data
# with a random number generator, so an unseeded fit is not guaranteed to give
# the same model on a re-run. 42 matches the seed used for the train/test split.
svc_model = LinearSVC(random_state=42)
svc_model.fit(Xtr, y_train)

In [27]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Score the classifier on the held-out split.
pred = svc_model.predict(Xte)
acc, f1 = accuracy_score(y_test, pred), f1_score(y_test, pred)

# NOTE(review): scores this close to 1.0 deserve a leakage check (e.g. the same
# article landing in both train and test, or source-specific boilerplate that
# appears in only one class) — TODO confirm before trusting the numbers.
print(f"\nAccuracy: {acc:.4f}, F1: {f1:.4f}\n")
print(classification_report(y_test, pred))


Accuracy: 0.9965, F1: 0.9964

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4696
           1       1.00      1.00      1.00      4284

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

