# What Am I Reading?

In [None]:
import os
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix
from urllib.request import urlopen

#### 1. Get Data

In [21]:
def collect_files(file, file_list):
    """Append the path of every file found under a directory tree to a list.

    Args:
        file: Root directory to walk recursively. (The name is kept for
            backward compatibility even though it denotes a directory.)
        file_list: List that is extended in place with one joined
            ``root/filename`` path per file found.

    Returns:
        None; results are delivered through ``file_list``.

    Directories and file names are visited in sorted order so that the
    resulting list is identical from run to run and machine to machine.
    A root that does not exist yields no entries, because ``os.walk``
    ignores listing errors by default.
    """
    for root, subdirs, names in os.walk(file):
        # Sorting in place steers the order in which os.walk descends.
        subdirs.sort()
        file_list.extend(os.path.join(root, name) for name in sorted(names))
            
def score_model(model, x_test, y_test):
    """Print the four confusion-matrix counts for ``model`` on a test set.

    Args:
        model: Estimator exposing ``predict`` (expected to be fitted already).
        x_test: Feature matrix handed to ``model.predict``.
        y_test: True labels corresponding to ``x_test``.

    Returns:
        None; the counts are only printed.

    Note:
        The four-way unpacking only succeeds when the flattened confusion
        matrix has exactly four entries, i.e. a two-class problem.
        Which class is treated as "positive" follows the label ordering used
        by ``confusion_matrix`` (presumably sorted, which would make 'real'
        the positive class for 'fake'/'real' labels -- TODO confirm).
    """
    matrix = confusion_matrix(y_test, model.predict(x_test))
    tn, fp, fn, tp = matrix.ravel()

    print(f"True negatives: {tn} False positives: {fp}\nFalse negatives: {fn} True positives: {tp}\n")

In [22]:
# Seed shared by both splits so the train/test partition is the same on every run.
SPLIT_SEED = 42

real_arts = []
fake_arts = []

collect_files("NeuralNews/dataset/fake_arts", fake_arts)
collect_files("NeuralNews/dataset/real_arts", real_arts)

# Directory listing order is not guaranteed; sort so the seeded split always
# sees the same sequence of paths.
real_arts.sort()
fake_arts.sort()

# Split each class separately (75 % train / 25 % test) so both partitions keep
# the dataset's real/fake proportions.
real_arts_train, real_arts_test = train_test_split(
    real_arts, test_size=0.25, random_state=SPLIT_SEED
)
fake_arts_train, fake_arts_test = train_test_split(
    fake_arts, test_size=0.25, random_state=SPLIT_SEED
)

# Labels are generated in the same order as the concatenated path lists:
# all real articles first, then all fake ones.
training_set = real_arts_train + fake_arts_train
training_labels = ['real'] * len(real_arts_train) + ['fake'] * len(fake_arts_train)

testing_set = real_arts_test + fake_arts_test
testing_labels = ['real'] * len(real_arts_test) + ['fake'] * len(fake_arts_test)
     

In [23]:
# Fit the TF-IDF vocabulary and weights on the training articles only.
# `training_set` holds file paths, so the vectorizer is told to open each
# file itself rather than treat the strings as document text.
tfidf_vectorizer = TfidfVectorizer(
    input='filename',
    decode_error='ignore',  # skip undecodable bytes instead of raising
)
tfidf_vector = tfidf_vectorizer.fit_transform(training_set)

In [24]:
# Random forest on the TF-IDF features. `random_state` fixes the bootstrap
# samples and per-split feature sub-sampling so re-running the cell rebuilds
# the same forest and therefore the same scores.
rfc = RandomForestClassifier(n_estimators=50, max_depth=25, random_state=42)
rfc.fit(tfidf_vector, training_labels)

In [25]:
# Map the held-out articles onto the vocabulary fitted on the training set
# (transform only, so nothing is re-learned from the test data).
test_arts_vector = tfidf_vectorizer.transform(testing_set)

In [26]:
# Report confusion-matrix counts of the TF-IDF random forest on the test split.
print("Evaluating All Articles (RFC):")
score_model(rfc, test_arts_vector, testing_labels)

Evaluating All Articles (RFC):
True negatives: 6405 False positives: 1595
False negatives: 1340 True positives: 6660



In [27]:
# Bag-of-n-grams features restricted to tri-grams (ngram_range lower and upper
# bound are both 3), learned from the training articles only.
count_vectorizer = CountVectorizer(
    input='filename',
    ngram_range=(3, 3),
    decode_error='ignore',  # skip undecodable bytes instead of raising
)
count_vector = count_vectorizer.fit_transform(training_set)

In [28]:
# Same forest configuration as the TF-IDF model, trained on tri-gram counts.
# `random_state` makes the fitted forest (and its scores) repeatable.
rfc_trigram = RandomForestClassifier(n_estimators=50, max_depth=25, random_state=42)
rfc_trigram.fit(count_vector, training_labels)

In [29]:
# Vectorise the held-out articles with the count vectorizer fitted on the
# training set. This rebinds `test_arts_vector`, so the TF-IDF test matrix
# computed earlier is no longer reachable under this name.
test_arts_vector = count_vectorizer.transform(testing_set)

In [30]:
# Report confusion-matrix counts of the tri-gram random forest on the test split.
print("Evaluating All Articles (tri-gram approach):")
score_model(rfc_trigram, test_arts_vector, testing_labels)

Evaluating All Articles (tri-gram approach):
True negatives: 4819 False positives: 3181
False negatives: 1508 True positives: 6492



In [31]:
# Bag-of-n-grams features restricted to 5-grams (ngram_range lower and upper
# bound are both 5). Rebinds `count_vectorizer` / `count_vector`, replacing
# the tri-gram versions.
count_vectorizer = CountVectorizer(
    input='filename',
    ngram_range=(5, 5),
    decode_error='ignore',  # skip undecodable bytes instead of raising
)
count_vector = count_vectorizer.fit_transform(training_set)

In [32]:
# NOTE: despite its name, `rfc_sevengram` is trained on the 5-gram counts
# (ngram_range=(5, 5)); the name is kept because the evaluation cell uses it.
# `random_state` makes the fitted forest (and its scores) repeatable.
rfc_sevengram = RandomForestClassifier(n_estimators=50, max_depth=25, random_state=42)
rfc_sevengram.fit(count_vector, training_labels)

In [33]:
# Vectorise the held-out articles with the count vectorizer fitted on the
# training set. This rebinds `test_arts_vector` once more, so after this cell
# it no longer holds the features produced by any earlier vectorizer.
test_arts_vector = count_vectorizer.transform(testing_set)

In [34]:
# Report confusion-matrix counts of the 5-gram random forest on the test split.
print("Evaluating All Articles (5-gram approach):")
score_model(rfc_sevengram, test_arts_vector, testing_labels)

Evaluating All Articles (5-gram approach):
True negatives: 2355 False positives: 5645
False negatives: 389 True positives: 7611



In [None]:
# Bagged decision trees on the TF-IDF features, fitted in parallel on all
# cores (n_jobs=-1). `random_state` fixes the bootstrap draws so the ensemble
# is repeatable.
# NOTE(review): the base trees have no depth limit, unlike the random forests
# above (max_depth=25); fitting may be slow on the full TF-IDF matrix --
# consider bounding the depth if training time is a problem.
bgc = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    n_jobs=-1,
    random_state=42,
)
bgc.fit(tfidf_vector, training_labels)

KeyboardInterrupt: 

In [None]:
# `bgc` was fitted on the TF-IDF matrix, so the test file paths have to pass
# through the same fitted vectorizer before prediction; raw paths are not
# valid features. `test_arts_vector` is deliberately not reused here because
# the n-gram cells rebind it to CountVectorizer features.
bgc_test_vector = tfidf_vectorizer.transform(testing_set)

print("Evaluating All Articles (Bagging Classifier):")
score_model(bgc, bgc_test_vector, testing_labels)