In [1]:
from collections import Counter
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Seed the stdlib and NumPy global generators with the same value so that
# re-running the notebook from a fresh kernel gives identical results.
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(0)

In [2]:
def _read_headlines(path):
    """Return every line of the text file at `path`, stripped of surrounding whitespace."""
    # The context manager closes the file deterministically; a bare
    # open(...).readlines() leaves the handle open until garbage collection.
    with open(path, 'r') as handle:
        return [line.strip() for line in handle]


# Load one headline per line from each source file.
real_lines = _read_headlines('clean_real.txt')
print('# real headlines', len(real_lines))
fake_lines = _read_headlines('clean_fake.txt')
print('# fake headlines', len(fake_lines))

# Space-separated tokens per class, and the unique tokens across both classes.
real_words = ' '.join(real_lines).split(' ')
fake_words = ' '.join(fake_lines).split(' ')
all_words = list(set(real_words).union(set(fake_words)))

# create shuffled 70/15/15 train/validation/test datasets
real_label, fake_label = (1, 0)
X = real_lines + fake_lines
y = len(real_lines) * [real_label] + len(fake_lines) * [fake_label]
# First hold out 30% of the data, then split that hold-out in half into test
# and validation sets. The fixed random_state keeps both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=0)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, shuffle=True, random_state=0)

# real headlines 1968
# fake headlines 1298


In [3]:
# Bag-of-words features: the vocabulary is fitted on the training headlines
# only, and the fitted vectorizer is then reused to encode the test headlines.
vectorizer = CountVectorizer()
counts_train = vectorizer.fit_transform(X_train)
counts_test = vectorizer.transform(X_test)

# Show the shapes of the train and test count matrices.
tuple(matrix.shape for matrix in (counts_train, counts_test))

((2286, 4750), (490, 4750))

In [4]:
# Fit a multinomial Naive Bayes classifier on the training word counts;
# `nb` is bound to the value returned by fit(), as before.
nb = MultinomialNB().fit(counts_train, y_train)

In [5]:
# Predict a label for every row of the test count matrix.
preds = nb.predict(counts_test)
# Display the shape of the prediction array as a quick sanity check.
np.shape(preds)

(490,)

In [6]:
def get_accuracy(y_hat: np.ndarray, y: np.ndarray) -> float:
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_hat: Predicted labels (array or any array-like, e.g. a list).
        y: True labels, with the same shape as `y_hat`.

    Returns:
        Number of matching positions divided by `len(y)`.

    Raises:
        ValueError: If `y_hat` and `y` do not have the same shape.
    """
    # Coerce to arrays so `==` is always element-wise. With two plain lists,
    # `y_hat == y` would be a single bool and the result would be 0 or 1/len.
    y_hat = np.asarray(y_hat)
    y = np.asarray(y)
    # Reject mismatched shapes up front rather than letting NumPy broadcast
    # them into a comparison of the wrong size.
    if y_hat.shape != y.shape:
        raise ValueError(
            f'shape mismatch: y_hat has shape {y_hat.shape}, y has shape {y.shape}'
        )
    return np.sum(y_hat == y) / len(y)

In [8]:
# Score the test predictions with the hand-written metric, then cross-check
# the value against scikit-learn's accuracy_score before reporting it.
acc = get_accuracy(preds, y_test)
reference_acc = accuracy_score(y_test, preds)
assert abs(acc - reference_acc) < 1e-8
print('accuracy = ', acc)

accuracy =  0.8306122448979592
