In [1]:
# Import dependencies 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from path import Path
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [2]:
# Read CSV
df = pd.read_csv('articles_1.csv',sep='\t')
#df = pd.read_csv('articles_1.csv',error_bad_lines=False)
df.head(5)

Unnamed: 0,articlesid,fakeid,trueid,title,text,subject,label
0,1,1.0,,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,US News,1
1,2,2.0,,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,US News,1
2,3,3.0,,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",US News,1
3,4,4.0,,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",US News,1
4,5,5.0,,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,US News,1


In [3]:
# Take sample of total dataframe to confirm working code
sample_df = df.sample(frac=.10, replace = True, random_state=1)
sample_df

Unnamed: 0,articlesid,fakeid,trueid,title,text,subject,label
33003,32986,,15677.0,Zimbabwe court upholds charge against U.S. cit...,HARARE (Reuters) - A Zimbabwean court on Satur...,World News,0
12172,11632,16009.0,,UNREAL! THREE REPUBLICAN SENATORS Kill Republi...,It s clear that some Republicans never wanted ...,US News,1
5192,4764,4580.0,,It Looks Like The Trump Campaign Just Got Thi...,While Donald Trump accuses Hillary Clinton of ...,US News,1
32511,32476,,15161.0,Pope says world should condemn 'very possessio...,"VATICAN CITY (Reuters) - Pope Francis, in some...",World News,0
7813,7216,6993.0,,Donald Trump Could Be Running The Greatest Sc...,Donald Trump likes to promote himself a shrewd...,US News,1
...,...,...,...,...,...,...,...
33186,33170,,15852.0,Poland to ban Ukrainians with 'anti-Polish views',WARSAW (Reuters) - Poland plans to bar Ukraini...,World News,0
1083,948,867.0,,WATCH: Anderson Cooper Throws MAJOR Shade At ...,An interview with CNN anchor Anderson Cooper a...,US News,1
16878,16263,21452.0,,MUSLIM CLOCK BOY’S LIE EXPOSED [Video] Expert ...,Everything about this muslim boy s story is a ...,US News,1
14222,13650,18611.0,,CROOKED HILLARY CLINTON’S Latest Speech To Be ...,Hillary Clinton always goes back to the black ...,US News,1


In [4]:
# Drop unused columns in model testing
sample_df.drop(['articlesid','fakeid','trueid','subject'],axis=1,inplace=True)
sample_df

Unnamed: 0,title,text,label
33003,Zimbabwe court upholds charge against U.S. cit...,HARARE (Reuters) - A Zimbabwean court on Satur...,0
12172,UNREAL! THREE REPUBLICAN SENATORS Kill Republi...,It s clear that some Republicans never wanted ...,1
5192,It Looks Like The Trump Campaign Just Got Thi...,While Donald Trump accuses Hillary Clinton of ...,1
32511,Pope says world should condemn 'very possessio...,"VATICAN CITY (Reuters) - Pope Francis, in some...",0
7813,Donald Trump Could Be Running The Greatest Sc...,Donald Trump likes to promote himself a shrewd...,1
...,...,...,...
33186,Poland to ban Ukrainians with 'anti-Polish views',WARSAW (Reuters) - Poland plans to bar Ukraini...,0
1083,WATCH: Anderson Cooper Throws MAJOR Shade At ...,An interview with CNN anchor Anderson Cooper a...,1
16878,MUSLIM CLOCK BOY’S LIE EXPOSED [Video] Expert ...,Everything about this muslim boy s story is a ...,1
14222,CROOKED HILLARY CLINTON’S Latest Speech To Be ...,Hillary Clinton always goes back to the black ...,1


In [5]:
# Defining the feature set X and y; model test on TEXT
X = sample_df.text
y = sample_df.label

# Split data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=0.20)

In [6]:
# Implement CountVectorizer which tokenizes and counts occurance of words in text
cv = CountVectorizer()
X_train_count=cv.fit_transform(X_train.values)
X_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [7]:
# Usng Multinomial algorithm of Naive Bayes for model training
model=MultinomialNB()
model.fit(X_train_count,y_train)

MultinomialNB()

In [8]:
# Testing on sample df text
article_text = cv.transform(X)
model.predict(article_text)

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [9]:
# Finding the accuracy of the Multinomial Navie Bayes model: MultinomialNB is suitable for classificaion with discrete features
X_test_count = cv.transform(X_test)
model.score(X_test_count,y_test)

0.9417852522639069

In [11]:
# Save model to disk
import pickle
filename = 'nb_model.sav'
pickle.dump(model, open(filename, 'wb'))