# Training different Naïve Bayes models

In [1]:
import numpy as np
import os
import json
import pandas as pd

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix

from pymongo import MongoClient

In [2]:
# Connect to the MongoDB server on this machine and select the `news`
# collection of the `TFE` database.
client = MongoClient(host='localhost', port=27017)
db = client['TFE']
collection = db['news']

In [3]:
# Load every article labelled 'fake' or 'reliable' as (content, label) pairs.
# The second argument to `find` is a projection: only the two fields that are
# actually read are transferred, instead of the complete documents.
cursor = collection.find(
    {'type': {'$in': ['fake', 'reliable']}},
    {'_id': 0, 'content': 1, 'type': 1},
)
news = [(article['content'], article['type']) for article in cursor]

# Parallel lists: texts[i] is the body of article i, tags[i] its label.
texts = [content for content, label in news]
tags = [label for content, label in news]

In [7]:
# The full-corpus TF-IDF fit below is left disabled in this cell; the
# cross-validation cell fits its own vectorizer inside each fold.
#vectorizer = TfidfVectorizer()
#X = vectorizer.fit_transform(texts)

# Labels as a NumPy array so they can be indexed with arrays of fold indices.
y = np.asarray(tags)

In [11]:
# 3-fold cross-validation of a TF-IDF + multinomial Naive Bayes classifier.
#
# The vectorizer is fitted on the training part of each fold only, so the
# vocabulary and IDF weights never see the held-out articles.
RANDOM_STATE = 42  # fixed seed: the shuffled fold assignment is the same on every run
N_SPLITS = 3

train_accuracy = []
test_accuracy = []
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
for train_index, test_index in kf.split(y):
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform([texts[i] for i in train_index])
    X_test = vectorizer.transform([texts[i] for i in test_index])
    y_train, y_test = y[train_index], y[test_index]

    # A fresh estimator per fold makes it explicit that no state is shared
    # between folds.
    model = MultinomialNB()
    model.fit(X_train, y_train)

    train_accuracy.append(model.score(X_train, y_train))
    test_accuracy.append(model.score(X_test, y_test))

# Mean accuracy over the folds, followed by the individual test scores.
print("Train accuracy : {}".format(np.mean(train_accuracy)))
print("Test accuracy : {}".format(np.mean(test_accuracy)))
print(test_accuracy)

Train accuracy : 0.8981404616526975
Test accuracy : 0.89230703483496
[0.8921489126428308, 0.8924940036432207, 0.8922781882188283]


In [9]:
# Load the FakeNewsNet articles (BuzzFeed + PolitiFact) into one DataFrame.
# Every JSON file becomes one row; a `type` column holds the label
# ('fake' or 'reliable') derived from the directory the file was found in.
fake_directories = ['../../../Data/FakeNewsNet-master/Data/BuzzFeed/FakeNewsContent', '../../../Data/FakeNewsNet-master/Data/PolitiFact/FakeNewsContent']
real_directories = ['../../../Data/FakeNewsNet-master/Data/BuzzFeed/RealNewsContent', '../../../Data/FakeNewsNet-master/Data/PolitiFact/RealNewsContent']


def list_files(directories):
    """Return the sorted paths of all files found under ``directories``.

    Each directory is walked recursively. Directories that do not exist
    contribute nothing. Sorting makes the row order of the resulting
    DataFrame independent of the file-system listing order.
    """
    paths = []
    for directory in directories:
        for root, _subdirs, files in os.walk(directory):
            paths.extend(os.path.join(root, name) for name in files)
    return sorted(paths)


def load_articles(file_names, label):
    """Parse each JSON file in ``file_names`` and tag it with ``label``.

    Returns a list of dicts, one per file, each with an added (or
    overwritten) ``'type'`` key set to ``label``.
    """
    records = []
    for file_name in file_names:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(file_name, encoding='utf-8') as f:
            record = json.load(f)
        record['type'] = label
        records.append(record)
    return records


fake_files_list = list_files(fake_directories)
real_files_list = list_files(real_directories)

records = load_articles(fake_files_list, 'fake') + load_articles(real_files_list, 'reliable')
if not records:
    print("Warning: no article files found under {}".format(fake_directories + real_directories))

# Build the frame once from all records: appending row by row is quadratic and
# `DataFrame.append` no longer exists in pandas >= 2.0.
data = pd.DataFrame(records)

In [10]:
# Article bodies and labels of the external dataset as NumPy arrays.
new_text, newy = (data[column].values for column in ('text', 'type'))

In [11]:
# Sanity check: texts and labels must have the same number of entries.
for column_values in (new_text, newy):
    print(len(column_values))

422
422


In [None]:
# Drop the references to the matrices left over from earlier cells
# (presumably so their memory can be released before the full-corpus fit).
X_train = X_test = None

In [12]:
# Fit a TF-IDF vectorizer on the complete corpus (`texts`) and build the
# document-term matrix `X` from it. The fitted `vectorizer` is reused later
# to transform the external articles.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

In [13]:
# Train a multinomial Naive Bayes classifier on the whole corpus (`X`, `y`).
# `fit` is the last expression of the cell, so the fitted estimator's
# parameters are displayed as the cell output.
model = MultinomialNB()
model.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
# Encode the external articles with the already-fitted vectorizer (transform
# only, no re-fitting) and report the model's accuracy on them.
X_test = vectorizer.transform(new_text)
external_accuracy = model.score(X_test, newy)
print("Test accuracy : {}".format(external_accuracy))

Test accuracy : 0.6113744075829384


In [15]:
# Per-class precision / recall / F1 on the external dataset, returned as a
# nested dict (displayed as the cell output).
external_predictions = model.predict(X_test)
classification_report(
    newy,
    external_predictions,
    labels=['fake', 'reliable'],
    output_dict=True,
)

{'fake': {'precision': 0.7526881720430108,
  'recall': 0.33175355450236965,
  'f1-score': 0.46052631578947373,
  'support': 211},
 'reliable': {'precision': 0.5714285714285714,
  'recall': 0.8909952606635071,
  'f1-score': 0.6962962962962962,
  'support': 211},
 'micro avg': {'precision': 0.6113744075829384,
  'recall': 0.6113744075829384,
  'f1-score': 0.6113744075829384,
  'support': 422},
 'macro avg': {'precision': 0.6620583717357911,
  'recall': 0.6113744075829384,
  'f1-score': 0.5784113060428849,
  'support': 422},
 'weighted avg': {'precision': 0.6620583717357911,
  'recall': 0.6113744075829384,
  'f1-score': 0.578411306042885,
  'support': 422}}

In [16]:
# Confusion matrix on the external dataset. Rows are the true labels and
# columns the predicted labels, both in the order given by `label_order`.
label_order = ['reliable', 'fake']
confusion_matrix(newy, model.predict(X_test), labels=label_order)

array([[188,  23],
       [141,  70]])