# Naive-Bayes using gensim

In [6]:
import numpy as np
import os
import json
import pandas as pd

import sys
sys.path.append('../..')

import utils.dbUtils
import utils.gensimUtils

import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer


from pymongo import MongoClient
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

## Training on the full corpus

In [37]:
# Iterator over the tokenized documents of 'news_copy', restricted to entries
# whose 'type' field is 'fake' or 'reliable'.
# NOTE(review): the filter dict looks like a MongoDB query -- confirm in utils.dbUtils.
corpus = utils.dbUtils.TokenizedIterator(
    'news_copy',
    filters={'type': {'$in': ['fake', 'reliable']}},
)

In [38]:
# Collect the label of every document, in the order corpus.iterTags() yields them.
# The TF-IDF vectorizer is deliberately NOT fitted here on the whole corpus:
# it is fitted per fold inside the cross-validation loop instead.
y = np.array(list(corpus.iterTags()))

In [40]:
# 10-fold cross-validation of a multinomial Naive Bayes classifier on TF-IDF
# features, reporting the mean train/test accuracy over the folds.
model = MultinomialNB()

train_accuracy = []
test_accuracy = []
# Fixed random_state: with shuffle=True the fold assignment is otherwise
# different on every run, so the reported scores could not be reproduced.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in kf.split(y):
    # The vectorizer is re-created and fitted on the training fold only, so the
    # vocabulary and IDF weights never see the held-out documents.
    vectorizer = TfidfVectorizer()
    # corpus[i] is joined with spaces to rebuild one string per document,
    # which is the input format TfidfVectorizer is given here.
    X_train = vectorizer.fit_transform([' '.join(corpus[i]) for i in train_index])
    X_test = vectorizer.transform([' '.join(corpus[i]) for i in test_index])
    y_train = y[train_index]
    y_test = y[test_index]
    model.fit(X_train, y_train)
    train_accuracy.append(model.score(X_train, y_train))
    test_accuracy.append(model.score(X_test, y_test))
print("Train accuracy : {}".format(np.mean(train_accuracy)))
print("Test accuracy : {}".format(np.mean(test_accuracy)))
# Per-fold test accuracies, to eyeball the spread across folds.
print(test_accuracy)

Train accuracy : 0.910612013212434
Test accuracy : 0.9056206386979625
[0.9053369706121169, 0.90554708756535, 0.905760765822875, 0.9051945184404336, 0.9057287140842463, 0.9057497462561656, 0.9057996046938158, 0.9052155487099129, 0.9061592977082925, 0.9057141330864154]


## Training on the corpus excluding news from nytimes.com and beforeitsnews.com

In [4]:
# Same 'fake' / 'reliable' selection as before, additionally dropping every
# document whose 'domain' is nytimes.com or beforeitsnews.com.
corpus = utils.dbUtils.TokenizedIterator(
    'news_copy',
    filters={
        'type': {'$in': ['fake', 'reliable']},
        'domain': {'$nin': ['nytimes.com', 'beforeitsnews.com']},
    },
)
# Labels of the filtered documents, in the order corpus.iterTags() yields them.
y = np.array(list(corpus.iterTags()))

In [None]:
# 10-fold cross-validation of a multinomial Naive Bayes classifier on TF-IDF
# features, reporting mean accuracy and mean recall of the 'fake' class on
# both the training and the test folds.
model = MultinomialNB()

train_accuracy = []
test_accuracy = []
train_recall = []
test_recall = []
# Fixed random_state: with shuffle=True the fold assignment is otherwise
# different on every run, so the reported scores could not be reproduced.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in kf.split(y):
    # The vectorizer is re-created and fitted on the training fold only, so the
    # vocabulary and IDF weights never see the held-out documents.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform([' '.join(corpus[i]) for i in train_index])
    X_test = vectorizer.transform([' '.join(corpus[i]) for i in test_index])
    y_train = y[train_index]
    y_test = y[test_index]
    model.fit(X_train, y_train)

    # Predict once per split and derive both metrics from the same predictions,
    # instead of predicting a second time inside model.score().
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Accuracy is the fraction of exact label matches, which is what
    # model.score() reports for a classifier.
    train_accuracy.append(float(np.mean(train_pred == y_train)))
    test_accuracy.append(float(np.mean(test_pred == y_test)))
    train_recall.append(recall_score(y_train, train_pred, pos_label = 'fake'))
    test_recall.append(recall_score(y_test, test_pred, pos_label = 'fake'))
print("Train accuracy : {}".format(np.mean(train_accuracy)))
print("Test accuracy : {}".format(np.mean(test_accuracy)))
print("Train recall : {}".format(np.mean(train_recall)))
print("Test recall : {}".format(np.mean(test_recall)))
# Per-fold values, to eyeball the spread across folds.
print(test_accuracy)
print(train_recall)
print(test_recall)