# Niave Bayes for FakeOrReal CSV
* Make sure you have the file fake_or_real_news.csv in the same directory as this ipython notebook file

In [34]:
import pandas as pd #To prepare the data
import numpy as np #For the log function
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### loading the data
Things I changed: 
* Changed name of movie_data to FoR_data
* FoR => Fake or Real

* Added new parameter to the .read_csv fucntion.
* To mimic the "movies" implementation, I extracted only two columns,
* these columns being LABEL and TEXT.


### NOTICE ! 
* In this case, the first column is text and the second is the label.
* In our "movies" implementation it was the other way around.
* I couldn't figure out how to make the label be the first column.
* It was easier for me to later SWAP the indexes in the loops that iterated the data.

In [35]:
#import the data using pandas
FoR_data = pd.read_csv('fake_or_real_news.csv', usecols=['label', 'text'])
print("There are {} rows in the data set".format(len(movie_data)))
FoR_data.head()

There are 6335 rows in the data set


Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,It's primary day in New York and front-runners...,REAL


### Divide the data into training and testing data
Things I changed:
* Previously we took 400 rows to train. For this implementation we should take more.
* And please do because training on 6000ish columns takes over an hour.
* Learned that the hard way.

In [36]:
#First randomly select 20% of the data
FoR_sample = FoR_data.sample(1200)
#The training data and the testing data MUST be seperated 
training_frame = FoR_data[~FoR_data.index.isin(FoR_sample.index)]
testing_frame = FoR_sample

In [37]:
def preprocessing(s):
    stop_words = set(stopwords.words("english"))
    s = s.lower()
    s = word_tokenize(s)
    s = [word for word in s if not word in stop_words]
    return s

### Define the dictionaries C, D
Changes:
* We use "label" instead of "class".
* Which now are FAKE or REAL
* Also, because I couldn't quite figure out how to how to properly immitate the movie data structure,
* I changed the indexes from 0 to 1 and viceversa.
* The rest remains the same.

In [38]:
C = set(training_frame['label']) # FAKE, REAL
D = dict()
for i in range(len(training_frame)):
    D[training_frame.iloc[i,0]] = training_frame.iloc[i,1]

### The Naive Bayes algorithm

In [39]:
def train_NB(C,D):
    V = set([word for doc in D.keys() for word in preprocessing(doc)])
    N = len(D)
    prior = dict()
    cond_prob = dict()
    N_c = dict()
    T = dict()
    text_c = dict()
    for c in C:
        text_c[c] = []
        for doc in D.items():
            if doc[1] == c:
                for word in doc[0].split():
                    text_c[c].append(word)
        N_c[c] = len([doc for doc in D.items() if doc[1] == c])
        prior[c] = float(N_c[c])/N
        cond_prob[c] = dict()
        T[c] = dict()
        for term in V:
            T[c][term] = text_c[c].count(term)
        for term in V:
            cond_prob[c][term] = float(T[c][term] + 1)/(sum(T[c].values()) + len(V))
 
    return V, prior, cond_prob

In [40]:
def test_NB(C,V,prior,cond_prob,d):
    W = []
    for word in d.split():
        if word in V:
            W.append(word)
    score = dict()
    for c in C:
        score[c] = np.log(prior[c])
        for term in W:
            score[c] += np.log(cond_prob[c][term])
    max_category = sorted(score.items(),key=lambda x: x[1],reverse= True)[0][0]
    return max_category

### Training
* on my computer this took WAY MORE than 10 minutes.
* Started: 3:17 pm
* Ended: 4:37 pm

In [41]:
V, prior, cond_prob = train_NB(C,D)

### Accuracy Display
Changes: 
* Only swapped the indexes from 1 to 0 and viceversa again to fit the "new" structure

In [42]:
correct = 0
incorrect = 0
for i in range(len(testing_frame)): 
    if test_NB(C,V,prior,cond_prob,testing_frame.iloc[i,0]) == testing_frame.iloc[i,1]:
        correct += 1
    else:
        incorrect += 1
    print (test_NB(C,V,prior,cond_prob,testing_frame.iloc[i,0]), testing_frame.iloc[i,1], correct/(correct+incorrect))
accuracy = correct/(correct + incorrect)
print("Accuracy = {} %".format(accuracy))

REAL REAL 1.0
FAKE FAKE 1.0
REAL REAL 1.0
REAL REAL 1.0
REAL REAL 1.0
REAL REAL 1.0
FAKE FAKE 1.0
FAKE FAKE 1.0
FAKE FAKE 1.0
REAL FAKE 0.9
REAL REAL 0.9090909090909091
FAKE FAKE 0.9166666666666666
REAL REAL 0.9230769230769231
REAL FAKE 0.8571428571428571
FAKE FAKE 0.8666666666666667
REAL REAL 0.875
REAL FAKE 0.8235294117647058
REAL REAL 0.8333333333333334
REAL REAL 0.8421052631578947
REAL REAL 0.85
FAKE FAKE 0.8571428571428571
FAKE FAKE 0.8636363636363636
REAL REAL 0.8695652173913043
REAL REAL 0.875
FAKE FAKE 0.88
FAKE REAL 0.8461538461538461
REAL REAL 0.8518518518518519
FAKE FAKE 0.8571428571428571
FAKE FAKE 0.8620689655172413
FAKE FAKE 0.8666666666666667
FAKE FAKE 0.8709677419354839
FAKE FAKE 0.875
REAL REAL 0.8787878787878788
REAL REAL 0.8823529411764706
REAL REAL 0.8857142857142857
REAL REAL 0.8888888888888888
REAL REAL 0.8918918918918919
FAKE FAKE 0.8947368421052632
REAL REAL 0.8974358974358975
FAKE FAKE 0.9
FAKE FAKE 0.9024390243902439
FAKE FAKE 0.9047619047619048
FAKE FAKE 0.90

REAL REAL 0.8904899135446686
REAL REAL 0.8908045977011494
REAL REAL 0.8911174785100286
REAL FAKE 0.8885714285714286
REAL REAL 0.8888888888888888
REAL REAL 0.8892045454545454
REAL REAL 0.8895184135977338
FAKE FAKE 0.8898305084745762
FAKE FAKE 0.8901408450704226
FAKE REAL 0.8876404494382022
FAKE FAKE 0.8879551820728291
REAL REAL 0.888268156424581
REAL REAL 0.8885793871866295
REAL REAL 0.8888888888888888
REAL REAL 0.889196675900277
FAKE FAKE 0.8895027624309392
REAL REAL 0.8898071625344353
REAL REAL 0.8901098901098901
REAL REAL 0.8904109589041096
FAKE FAKE 0.8907103825136612
REAL FAKE 0.888283378746594
REAL REAL 0.8885869565217391
FAKE FAKE 0.8888888888888888
REAL REAL 0.8891891891891892
FAKE FAKE 0.889487870619946
FAKE FAKE 0.8897849462365591
REAL REAL 0.8900804289544236
FAKE FAKE 0.8903743315508021
FAKE FAKE 0.8906666666666667
REAL REAL 0.8909574468085106
FAKE FAKE 0.8912466843501327
FAKE FAKE 0.8915343915343915
REAL REAL 0.8918205804749341
FAKE FAKE 0.8921052631578947
REAL FAKE 0.889763

REAL REAL 0.8850072780203785
REAL FAKE 0.8837209302325582
FAKE FAKE 0.8838896952104499
FAKE FAKE 0.8840579710144928
FAKE FAKE 0.8842257597684515
REAL FAKE 0.8829479768786127
FAKE FAKE 0.8831168831168831
FAKE FAKE 0.8832853025936599
REAL FAKE 0.8820143884892087
FAKE REAL 0.8807471264367817
FAKE FAKE 0.8809182209469153
FAKE FAKE 0.8810888252148997
REAL REAL 0.8812589413447782
FAKE FAKE 0.8814285714285715
REAL REAL 0.8815977175463623
REAL FAKE 0.8803418803418803
REAL REAL 0.8805120910384068
REAL FAKE 0.8792613636363636
FAKE FAKE 0.8794326241134752
FAKE FAKE 0.8796033994334278
REAL FAKE 0.8783592644978784
FAKE FAKE 0.8785310734463276
REAL REAL 0.8787023977433004
FAKE FAKE 0.8788732394366198
FAKE FAKE 0.8790436005625879
REAL REAL 0.8792134831460674
FAKE FAKE 0.879382889200561
REAL REAL 0.8795518207282913
FAKE FAKE 0.8797202797202798
REAL REAL 0.8798882681564246
REAL REAL 0.8800557880055788
REAL REAL 0.8802228412256268
FAKE FAKE 0.8803894297635605
REAL REAL 0.8805555555555555
FAKE FAKE 0.880

FAKE FAKE 0.8846153846153846
FAKE FAKE 0.8847262247838616
REAL FAKE 0.8838771593090211
REAL REAL 0.8839884947267498
REAL REAL 0.8840996168582376
REAL REAL 0.8842105263157894
FAKE FAKE 0.884321223709369
REAL REAL 0.8844317096466093
FAKE FAKE 0.8845419847328244
FAKE FAKE 0.8846520495710201
FAKE FAKE 0.8847619047619047
REAL REAL 0.884871550903901
FAKE FAKE 0.8849809885931559
REAL REAL 0.8850902184235517
FAKE FAKE 0.8851992409867173
REAL REAL 0.885308056872038
REAL REAL 0.8854166666666666
REAL REAL 0.8855250709555346
REAL REAL 0.8856332703213611
REAL REAL 0.8857412653446648
REAL FAKE 0.8849056603773585
REAL REAL 0.8850141376060321
REAL REAL 0.8851224105461394
REAL REAL 0.8852304797742239
FAKE FAKE 0.8853383458646616
REAL REAL 0.8854460093896713
REAL REAL 0.8855534709193246
REAL REAL 0.8856607310215557
REAL REAL 0.8857677902621723
REAL FAKE 0.8849391955098223
REAL REAL 0.8850467289719626
REAL REAL 0.8851540616246498
REAL FAKE 0.8843283582089553
REAL REAL 0.8844361602982292
REAL REAL 0.88454

# About 80% accuracy