In [1]:
# -p makes this cell safe to re-run: no error if the directory already exists.
mkdir -p NaiveBayes

In [1]:
%%writefile NaiveBayes/chineseExample.txt
D1	1	Chinese Beijing	Chinese
D2	1	Chinese Chinese	Shanghai
D3	1	Chinese	Macao
D4	0	Tokyo Japan	Chinese
D5	0	Chinese Chinese	Chinese Tokyo Japan

Overwriting NaiveBayes/chineseExample.txt


In [27]:
%%writefile MRNaiveBayesTrainer.py

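"""A multinomial Naive Bayes trainer implemented as an MRJob.
The reducer accumulates per-class word counts in memory and reducer_final
emits the class priors and the add-one smoothed word likelihoods."""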
from mrjob.job import MRJob

class MRNaiveBayesTrainer(MRJob):
    """Train a two-class multinomial Naive Bayes model with add-one smoothing.

    Input: one document per line, three tab-separated fields: document ID,
    class label, text.  Label "1" is class 1; any other label is counted as
    class 0.  The text is tokenised on whitespace (so further tabs inside the
    text simply separate words).

    Output records (key, value):
      "defaultPrior 0 class" / "defaultPrior 1 class"
          smoothing denominator: tokens seen in the class + vocabulary size
      "count 0 class" / "count 1 class"
          number of tokens seen in the class
      "vocabularySize"
          number of distinct words
      "TomsPriors"
          "docs0,docs1,P(class 0),P(class 1)"
      each word
          "count0,count1,P(word|class 0),P(word|class 1)"

    NOTE: the reducer keeps every word's counts in ``self.modelStats`` and
    reducer_final computes the totals from that in-memory dict, so the result
    is only a correct model when a single reducer task sees all keys.
    NOTE: a document that literally contains the token "TomsPriors" would be
    mixed into the class-prior counts.
    """

    # Sentinel key under which per-class document counts are shuffled.
    PRIORS_KEY = "TomsPriors"
    # Documents excluded from training (D5 is the held-out test document of
    # the Chinese example data set).
    HELD_OUT_DOC_IDS = ("D5",)

    def __init__(self, *args, **kwargs):
        super(MRNaiveBayesTrainer, self).__init__(*args, **kwargs)
        # word (or PRIORS_KEY) -> [count in class 0, count in class 1]
        self.modelStats = {}

    def mapper(self, _, line):
        """Emit one "class0,class1" indicator per document and per token.

        Yields (PRIORS_KEY, indicator) once per training document and
        (word, indicator) once per token, where indicator is "0,1" for
        class 1 and "1,0" for class 0.
        """
        fields = line.split("\t", 2)
        if len(fields) != 3:
            # Blank or truncated line: skip it (and count it) instead of
            # failing the whole job on the tuple unpacking below.
            self.increment_counter("MRNaiveBayesTrainer", "malformed_lines", 1)
            return
        docID, docClass, text = fields
        if docID in self.HELD_OUT_DOC_IDS:
            return
        indicator = "0,1" if docClass == "1" else "1,0"
        yield (self.PRIORS_KEY, indicator)
        for word in text.split():
            yield (word, indicator)

    def reducer(self, word, values):
        """Sum the per-class indicators for one key and remember the totals.

        Nothing is yielded here; the totals are emitted by reducer_final once
        every key has been seen.
        """
        w0Total = 0
        w1Total = 0
        for value in values:
            w0, w1 = value.split(",")
            w0Total += float(w0)
            w1Total += float(w1)
        self.modelStats[word] = [w0Total, w1Total]

    def reducer_final(self):
        """Emit the model: summary statistics, class priors, word likelihoods.

        Raises:
            ValueError: if word counts were collected but the class-prior
                record was not, i.e. this reducer did not see every key.
        """
        if not self.modelStats:
            # No training documents reached this reducer: nothing to emit.
            return
        if self.PRIORS_KEY not in self.modelStats:
            raise ValueError(
                "no %r record reached this reducer; MRNaiveBayesTrainer must "
                "be run with a single reducer" % self.PRIORS_KEY)

        # Separate the document counts from the word counts; everything left
        # in modelStats afterwards is a vocabulary word.
        classCount0, classCount1 = self.modelStats.pop(self.PRIORS_KEY)
        vocabularySize = len(self.modelStats)
        class0Total = sum(counts[0] for counts in self.modelStats.values())
        class1Total = sum(counts[1] for counts in self.modelStats.values())

        # Add-one (Laplace) smoothing denominators.
        denominator0 = class0Total + vocabularySize
        denominator1 = class1Total + vocabularySize

        yield ("defaultPrior 0 class", denominator0)
        yield ("defaultPrior 1 class", denominator1)
        yield ("count 0 class", class0Total)
        yield ("count 1 class", class1Total)
        yield ("vocabularySize", vocabularySize)

        # Class priors: fraction of training documents in each class.
        total = classCount0 + classCount1
        yield (self.PRIORS_KEY, ','.join(str(j) for j in [
            classCount0, classCount1, classCount0 / total, classCount1 / total]))

        # Smoothed likelihoods Pr(word | class) for every vocabulary word.
        for word, (count0, count1) in self.modelStats.items():
            yield (word, ','.join(str(j) for j in [
                count0,
                count1,
                (count0 + 1) / denominator0,
                (count1 + 1) / denominator1]))
 

# Run the job only when this file is executed as a script (as in
# `python MRNaiveBayesTrainer.py NaiveBayes/chineseExample.txt`), not when the
# class is imported from another module or notebook cell.
if __name__ == '__main__':
    MRNaiveBayesTrainer.run()

Overwriting MRNaiveBayesTrainer.py


In [28]:
!python MRNaiveBayesTrainer.py NaiveBayes/chineseExample.txt

no configs found; falling back on auto-configuration
no configs found; falling back on auto-configuration
creating tmp directory /var/folders/j4/95k348x940xcz40fkdmgy_n40000gn/T/MRNaiveBayesTrainer.jshanahan.20160616.135016.303473
writing to /var/folders/j4/95k348x940xcz40fkdmgy_n40000gn/T/MRNaiveBayesTrainer.jshanahan.20160616.135016.303473/step-0-mapper_part-00000
Counters from step 1:
  (no counters found)
writing to /var/folders/j4/95k348x940xcz40fkdmgy_n40000gn/T/MRNaiveBayesTrainer.jshanahan.20160616.135016.303473/step-0-mapper-sorted
> sort /var/folders/j4/95k348x940xcz40fkdmgy_n40000gn/T/MRNaiveBayesTrainer.jshanahan.20160616.135016.303473/step-0-mapper_part-00000
writing to /var/folders/j4/95k348x940xcz40fkdmgy_n40000gn/T/MRNaiveBayesTrainer.jshanahan.20160616.135016.303473/step-0-reducer_part-00000
Counters from step 1:
  (no counters found)
Moving /var/folders/j4/95k348x940xcz40fkdmgy_n40000gn/T/MRNaiveBayesTrainer.jshanahan.20160616.135016.303473/step-0-reducer_part-00000 -

In [29]:
%reload_ext autoreload
%autoreload 2

from numpy import random
from MRNaiveBayesTrainer import MRNaiveBayesTrainer 

# STEP 1: Train a multinomial Naive Bayes model

# The trainer only reads the training documents, so no side file is shipped
# with the job (the previous '--file=NaiveBayes/model.txt' argument made the
# run depend on a file the trainer never opens).
mr_job = MRNaiveBayesTrainer(args=['NaiveBayes/chineseExample.txt'])
modelStats = {}
with mr_job.make_runner() as runner:
    runner.run()
    # stream_output() gives the job's raw output lines; parse each one back
    # into the (key, value) pair the job emitted.
    for line in runner.stream_output():
        key, value = mr_job.parse_output_line(line)
        # Formatted so the output is identical under Python 2 and Python 3.
        print("%s %s" % (key, value))
        modelStats[key] = value

# Persist the trained model, one tab-separated "key value" line per entry.
with open('NaiveBayes/model1.txt', 'w') as f:
    for k, v in modelStats.items():
        f.write("%s\t%s\n" % (k, v))


# STEP 2: Classify data with newly trained model

print(modelStats)

defaultPrior 0 class 9.0
defaultPrior 1 class 14.0
count 0 class 3.0
count 1 class 8.0
vocabularySize 6
TomsPriors 1.0,3.0,0.25,0.75
Beijing 0.0,1.0,0.111111111111,0.142857142857
Chinese 1.0,5.0,0.222222222222,0.428571428571
Tokyo 1.0,0.0,0.222222222222,0.0714285714286
Shanghai 0.0,1.0,0.111111111111,0.142857142857
Japan 1.0,0.0,0.222222222222,0.0714285714286
Macao 0.0,1.0,0.111111111111,0.142857142857
{'defaultPrior 0 class': 9.0, 'Shanghai': '0.0,1.0,0.111111111111,0.142857142857', 'Chinese': '1.0,5.0,0.222222222222,0.428571428571', 'count 1 class': 8.0, 'defaultPrior 1 class': 14.0, 'Tokyo': '1.0,0.0,0.222222222222,0.0714285714286', 'vocabularySize': 6, 'TomsPriors': '1.0,3.0,0.25,0.75', 'count 0 class': 3.0, 'Japan': '1.0,0.0,0.222222222222,0.0714285714286', 'Macao': '0.0,1.0,0.111111111111,0.142857142857', 'Beijing': '0.0,1.0,0.111111111111,0.142857142857'}


In [26]:
!cat NaiveBayes/model1.txt

defaultPrior 0 class	9.0
Shanghai	0.0,1.0,0.111111111111,0.142857142857
Chinese	1.0,5.0,0.222222222222,0.428571428571
count 1 class	8.0
defaultPrior 1 class	14.0
Tokyo	1.0,0.0,0.222222222222,0.0714285714286
vocabularySize	6
TomsPriors	1.0,3.0,0.25,0.75
count 0 class	3.0
Japan	1.0,0.0,0.222222222222,0.0714285714286
Macao	0.0,1.0,0.111111111111,0.142857142857
Beijing	0.0,1.0,0.111111111111,0.142857142857


In [14]:
# Load the numeric columns of a saved model: one list of floats per line.
# A line may be bare comma-separated numbers, or a key, a tab, and then the
# numbers (the layout the training cell writes); the key is dropped.
# Blank lines are skipped; a non-numeric token still raises ValueError.
# NOTE(review): the training cell writes NaiveBayes/model1.txt, while this
# reads NaiveBayes/model.txt -- confirm which file is intended.
with open("NaiveBayes/model.txt") as model_file:
    model_rows = [
        [float(tok) for tok in line.rstrip('\n').split('\t')[-1].split(',')]
        for line in model_file
        if line.strip()
    ]
model_rows

ValueError: could not convert string to float: d

In [12]:
ls NaiveBayes


MRNaiveBayesTrainer.py  chineseExample.txt      model.txt.txt
