In [180]:
#http://csmining.org/index.php/r52-and-r8-of-reuters-21578.html
#https://web.stanford.edu/class/cs124/lec/naivebayes.pdf

import numpy
import math
import nltk
import re
from urllib.request import urlopen
from nltk import FreqDist
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn import metrics
import matplotlib.pyplot as plt
from pandas_ml import ConfusionMatrix

def initialize_train():
    train_url = "http://csmining.org/tl_files/Project_Datasets/r8%20r52/r8-train-all-terms.txt"
    train_doc = urlopen(train_url).read() 
    train_doc = train_doc.decode("utf-8")
    return train_doc #returns all docs as one String

def initialize_test():
    test_url = "http://csmining.org/tl_files/Project_Datasets/r8%20r52/r8-test-all-terms.txt"
    test_doc = urlopen(test_url).read() 
    test_doc = test_doc.decode("utf-8")
    return test_doc #returns all docs as one String



def CountAllCategories(doc):
    pattern='[A-z]*-*[A-z]*\t'
    f = re.findall(pattern,doc)
    return Counter(f) #returns dictionary 



test_doc = initialize_test()      # all docs as one String
train_doc = initialize_train()    # all docs as one String

train_vocab = sorted(list(set(word_tokenize(train_doc))))

pattern1 = r"([\w+]*[-]*[\w+]*[\t])(.*)(\b)"
test_docs = re.findall(pattern1,test_doc) #note that categories are accompanied with "/t" i.e. "earn" is NOT a category but "earn\t" is
train_docs = re.findall(pattern1,train_doc) #all docs is in one list now, list of tuples, each tuple is one doc = [(class, document),(class, document),(class, document)]

def CalculateClassProb():
    probabilities = {}
    totalDocs = sum(Dict4CountOfCategories.values())
    for category in Dict4CountOfCategories.keys():
        probabilities[category] = Dict4CountOfCategories[category]/totalDocs
    return probabilities


LikelihoodProbabilities = {}

def appendGivenCategoryDocsIntoOne(docs,cat):
    s=""
    for doc in docs:
        if (doc[0] == cat):
            s=s+doc[1]
    return s

trainDocumentCategoryWise={}

for c_j in Dict4CountOfCategories.keys():
    trainDocumentCategoryWise[c_j] = appendGivenCategoryDocsIntoOne(train_docs,c_j)


#CountAllCategories(test_doc)
#CountAllCategories(train_doc)


In [183]:
Dict4CountOfCategories = dict(CountAllCategories(train_doc))
print(Dict4CountOfCategories)

{'earn\t': 2840, 'acq\t': 1596, 'trade\t': 251, 'ship\t': 108, 'grain\t': 41, 'crude\t': 253, 'interest\t': 190, 'money-fx\t': 206}


In [4]:

Class_probabilities = CalculateClassProb()
print(Class_probabilities)

{'earn\t': 0.5177757520510483, 'acq\t': 0.290975387420237, 'trade\t': 0.04576116681859617, 'ship\t': 0.019690063810391976, 'grain\t': 0.00747493163172288, 'crude\t': 0.046125797629899726, 'interest\t': 0.03463992707383774, 'money-fx\t': 0.03755697356426618}


In [None]:

def CalculateLikelihoodProb(vocab,DocumentCategoryWise):
    aplha_smooth = 1
    probabilities = {}
    for w_k in train_vocab: #take each word from vocab
        for c_j in Dict4CountOfCategories.keys():
            n_k = len(re.findall(w_k+" ",DocumentCategoryWise[c_j]))
            n=len(DocumentCategoryWise[c_j])  #total number of words in c_j
            probabilities[(w_k,c_j)] = (n_k+aplha_smooth)/(n+aplha_smooth*len(vocab))
        #P(word|class)=(word_count_in_class + 1)/(total_words_in_class+total_unique_words_in_all_classes
    return probabilities

probabilities = CalculateLikelihoodProb(train_vocab,trainDocumentCategoryWise)

#print(probabilities)


  

def CalculateLikelihoodProbONDemand(vocab,DocumentCategoryWise,w_k, c_j):
    aplha_smooth=1
    n_k = len(re.findall(w_k+" ",DocumentCategoryWise[c_j])) #number of times word occur in all docs with c_j
    n=len(DocumentCategoryWise[c_j])  #total number of words in c_j
    LikelihoodProbabilities[(w_k,c_j)] = (n_k+aplha_smooth)/(n+aplha_smooth*len(vocab))
    return LikelihoodProbabilities[(w_k,c_j)]




In [195]:
def get_true_labels(docs):
    true_labels = list()
    for doc in docs:
        true_label = doc[0]
        true_labels.append(true_label)
    return true_labels


def predict_model(docs):
    predicted_labels=list()
    for doc in docs:
        d={}
        for c_j in Dict4CountOfCategories.keys():
            p=1
            for w_k in set(word_tokenize(doc[1])):
                if ((w_k,c_j) in probabilities):
                    p=p*probabilities[(w_k,c_j)]
                else: p = p*CalculateLikelihoodProbONDemand(train_vocab,trainDocumentCategoryWise,w_k,c_j)
                #else: p = p*(1/(Dict4CountOfCategories[c_j]+len(train_vocab)))
            d[c_j]=p*Class_probabilities[c_j]
        predicted_label = max(d, key=d.get)
        predicted_labels.append(predicted_label)
    return predicted_labels

def Calculate_accuracy(true_labels,predicted_labels):
    CorrectClassify=0
    for i in range(len(true_labels)):
        if (predicted_labels[i]==true_labels[i]):
                CorrectClassify=CorrectClassify+1
    return CorrectClassify/len(true_labels)

test_predicted_labels=predict_model(test_docs)
train_predicted_labels=predict_model(train_docs)


test_true_labels=get_true_labels(test_docs)
train_true_labels=get_true_labels(train_docs)


print("Test Accuracy - ", Calculate_accuracy(test_true_labels,test_predicted_labels))
print("Train Accuracy - ", Calculate_accuracy(train_true_labels,train_predicted_labels))

#using built in method of accuracy-
#print("Test Accuracy", metrics.accuracy_score(test_true_labels,test_predicted_labels))



Test Accuracy -  0.8158976701690269
Train Accuracy -  0.8211485870556062


 Confusion Matrix

In [218]:
def make_confusionMetrix(true_labels,predicted_labels ):
    true_labels=[s.strip() for s in true_labels]
    predicted_labels=[s.strip() for s in predicted_labels]
    cm =ConfusionMatrix(true_labels, predicted_labels)
    print(cm, end="\n\n\n")

print("Confusion Matrix for Train - ",end="\n\n")
make_confusionMetrix(train_true_labels,train_predicted_labels )

print("------------------------------------------------------------------------------------- ",end="\n\n")

print("Confusion Matrix for Test - ",end="\n\n")
make_confusionMetrix(test_true_labels,test_predicted_labels )

Confusion Matrix for Train - 

Predicted   acq  crude  earn  grain  interest  money-fx  ship  trade  __all__
Actual                                                                       
acq        1172      3   406      8         4         0     3      0     1596
crude         1    115   135      0         0         0     1      1      253
earn         23      4  2800      6         3         1     3      0     2840
grain         0      0    19     22         0         0     0      0       41
interest      0      0    45      0       145         0     0      0      190
money-fx      1      0    75      1        19       110     0      0      206
ship          0      0    45      1         0         0    62      0      108
trade         0      1   164      0         2         5     1     78      251
__all__    1197    123  3689     38       173       116    70     79     5485


------------------------------------------------------------------------------------- 

Confusion Matrix for 