In [72]:
import collections 
import numpy as np
import pandas as pd
from collections import Counter
import operator

## The data sample
<table align="left">
    <tr>
    <td>Type</td>
    <td>No</td>
    <td>Doc</td>
    <td>Class</td>
    </tr>
<tr>
    <td>Training</td>
    <td>1</td>
    <td>Chinese Beijing Chinese</td>
    <td>c</td>
</tr>  
    <tr>
    <td>Training</td>
    <td>2</td>
    <td>Chinese Chinese Shanghai</td>
    <td>c</td>
</tr> 
    <tr>
    <td>Training</td>
    <td>3</td>
    <td>Chinese Macao</td>
    <td>c</td>
</tr> 
    <tr>
    <td>Training</td>
    <td>4</td>
    <td>Tokyo Japan Chinese</td>
    <td>j</td>
</tr> 
     <tr>
    <td>Test</td>
    <td>5</td>
    <td>Chinese Chinese Chinese Tokyo Japan</td>
    <td>?</td>
</tr> 
</table>


In [73]:
# Labelled training corpus: one row per document, with its raw text ('Doc')
# and its class label ('class').
trainingdata = pd.DataFrame(
    {
        'Doc': [
            'Chinese Beijing Chinese',
            'Chinese Chinese Shanghai',
            'Chinese Macao',
            'Tokyo Japan Chinese',
        ],
        'class': ['c', 'c', 'c', 'j'],
    }
)

trainingdata

Unnamed: 0,Doc,class
0,Chinese Beijing Chinese,c
1,Chinese Chinese Shanghai,c
2,Chinese Macao,c
3,Tokyo Japan Chinese,j


In [74]:
# Unlabelled document to classify, stored in the same 'Doc' column layout
# as the training frame.
testdata = pd.DataFrame({'Doc': ['Chinese Chinese Chinese Tokyo Japan']})

testdata

Unnamed: 0,Doc
0,Chinese Chinese Chinese Tokyo Japan


### Calculate prior probability of each class

In [75]:
def priorprob(df,thisClass):
    """Return the prior probability P(thisClass) estimated from ``df``.

    The prior is the number of rows whose 'class' value equals ``thisClass``
    divided by the number of rows that have a (non-missing) 'class' value,
    i.e. the share of labelled documents that belong to the class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'class' column holding one label per document.
    thisClass : object
        The label whose prior is wanted.

    Returns
    -------
    float
        Matching documents divided by labelled documents.
    """
    labels = df['class']
    # Missing labels never compare equal, so summing the mask counts exactly
    # the non-missing rows that carry this label.
    matching_docs = (labels == thisClass).sum()
    labelled_docs = labels.count()
    return matching_docs / labelled_docs

In [76]:
# Distinct class labels, in order of first appearance in the training data.
# (A bare `value_counts()` expression used to sit here; because it was not the
# last line of the cell its result was never displayed, so it was dropped.)
classlist = trainingdata['class'].unique().tolist()

# Report the prior P(class) for every label.
for thisClass in classlist:
    p = priorprob(trainingdata,thisClass)
    print(f"prior probability of class {thisClass} is {p!s}")

prior probability of class c is 0.75
prior probability of class j is 0.25


### Calculate likelihood & Posterior Probabilities

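For each word w<sub>k</sub> in the vocabulary, count the number of occurrences of w<sub>k</sub> in the training documents of each class c, then estimate the likelihood with add-one (Laplace) smoothing: $P(w_k \mid c) = \dfrac{\mathrm{count}(w_k, c) + 1}{\mathrm{count}(c) + |V|}$, where $\mathrm{count}(c)$ is the total number of tokens in class c and $|V|$ is the vocabulary size.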
    

In [77]:
def condProb(wordOccurrence,totTokens,types,alpha=1):
    """Return the smoothed conditional probability P(word | class).

    Computes ``(wordOccurrence + alpha) / (totTokens + alpha * types)``.
    With the default ``alpha=1`` this is add-one (Laplace) smoothing and is
    identical to the original formula ``(n + 1) / (N + |V|)``.

    Parameters
    ----------
    wordOccurrence : int
        Number of times the word occurs in the class.
    totTokens : int
        Total number of tokens in the class.
    types : int
        Vocabulary size (number of distinct words).
    alpha : float, optional
        Additive smoothing strength; must be non-negative. Defaults to 1.

    Returns
    -------
    float
        The smoothed probability estimate.

    Raises
    ------
    ValueError
        If ``alpha`` is negative (the estimate could fall outside [0, 1]).
    ZeroDivisionError
        If ``totTokens + alpha * types`` is zero.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")
    condProbVal = (wordOccurrence + alpha) / (totTokens + alpha * types)
    return condProbVal

In [78]:
def posteriorProb(df, train=None, classes=None):
    """Classify the first document of ``df`` with multinomial Naive Bayes.

    For every class the function prints the smoothed likelihood
    P(word | class) of each distinct word found in ``df['Doc']``, then the
    (unnormalised) posterior of the first test document, i.e. the class prior
    multiplied by the likelihood of each of that document's tokens. Finally
    it prints and returns the class with the largest posterior.

    Text is tokenised on runs of whitespace and compared case-sensitively
    everywhere (vocabulary, per-class counts, class totals and test tokens),
    so the vocabulary size always agrees with the tokens being counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Test data with a 'Doc' column. Only the first row is classified.
    train : pandas.DataFrame, optional
        Training data with 'Doc' and 'class' columns. Defaults to the
        notebook-level ``trainingdata`` frame.
    classes : list, optional
        Class labels to score. Defaults to the distinct values of
        ``train['class']`` in order of first appearance.

    Returns
    -------
    object
        The label with the highest posterior (the first one on ties).

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if train is None:
        train = trainingdata
    if classes is None:
        classes = train['class'].unique().tolist()
    if len(df) == 0:
        raise ValueError("posteriorProb needs at least one test document")

    def _tokens(docs):
        # Single tokenizer for every count: splitting on any whitespace run
        # avoids the empty-string tokens that split(" ") yields on double spaces.
        return " ".join(docs).split()

    # Vocabulary size |V| over the whole training corpus, case-sensitive so it
    # matches the case-sensitive counts below.
    types = len(set(_tokens(train['Doc'])))

    # Distinct test words in order of first appearance.
    testdataWords = list(dict.fromkeys(_tokens(df['Doc'])))

    # Positional access: works whatever the index labels of df are.
    firstDocTokens = df['Doc'].iloc[0].split()

    ppdict = {}  # posterior value per class

    for i in classes:
        # Tokens of all training documents that belong to this class.
        classTokens = _tokens(train.loc[train['class'] == i, 'Doc'])
        wordCountDict = Counter(classTokens)
        totalWords_i = len(classTokens)

        # Likelihood P(word | class) for every distinct test word.
        llhdict = {}
        for word in testdataWords:
            likelihoodVal = condProb(wordCountDict[word], totalWords_i, types)
            print("P(" + word + "|"+i+")" + ": " + str(likelihoodVal))
            llhdict[word] = likelihoodVal

        print("\n")

        # Start from the prior and multiply in one likelihood per token, so a
        # repeated word contributes once per occurrence.
        postProbVal = priorprob(train, i)
        for token in firstDocTokens:
            postProbVal = postProbVal * llhdict[token]
        print("Posterior probability of class " + i + " given the test data : " + str(postProbVal))
        ppdict[i] = postProbVal
        print("\n")

    bestClass = max(ppdict, key=ppdict.get)
    print("The maximum of the posterior probabilities belong to class " + bestClass)
    return bestClass

In [79]:
# Score the test document against every class: prints the smoothed
# likelihoods and the posterior per class, then the winning class.
posteriorProb(testdata)

P(Chinese|c): 0.42857142857142855
P(Tokyo|c): 0.07142857142857142
P(Japan|c): 0.07142857142857142


Posterior probability of class c given the test data : 0.00030121377997263036


P(Chinese|j): 0.2222222222222222
P(Tokyo|j): 0.2222222222222222
P(Japan|j): 0.2222222222222222


Posterior probability of class j given the test data : 0.00013548070246744226


The maximum of the posterior probabilities belong to class c
