In [48]:
%%writefile EnronNaiveBayesTrainer.py
#HW 1.3

from mrjob.job import MRJob
import sys, re, string, operator

regex = re.compile('[%s]' % re.escape(string.punctuation))

class EnronNaiveBayesTrainer(MRJob):
    
    def __init__(self, *args, **kwargs):
        super(EnronNaiveBayesTrainer, self).__init__(*args, **kwargs)
        self.modelStats = {}


    def jobconf(self):
        orig_jobconf = super(EnronNaiveBayesTrainer, self).jobconf()        
        custom_jobconf = {
            'mapred.output.key.comparator.class': 'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
            'mapred.text.key.comparator.options': '-k1rn',
            'mapred.reduce.tasks': '1',
        }
        combined_jobconf = orig_jobconf
        combined_jobconf.update(custom_jobconf)
        self.jobconf = combined_jobconf
        return combined_jobconf

    def mapper(self, _, line):
        # Don't actually yield anything for each line. Instead, collect them
        # and yield the sums when all lines have been processed. The results
        # will be collected by the reducer.
        docID, docClass,text = line.split("\t",2) 
        text = text.strip()
        text = regex.sub(' ', text.lower())
        text = re.sub( '\s+', ' ', text )
        words = text.split()
        if docID != "D5":  #skip doc d5 in chinese dataset
            if docClass == "1":
                yield("TomsPriors", "0,1")
                for word in words:
                    yield(word, "0,1")
            else:
                yield("TomsPriors", "1,0")
                for word in words:
                    yield(word, "1,0")
        

    def reducer(self, word, values):
        #aggregate counts for Pr(Word|Class)
        #yield("number of values for "+word, str(values))
        w0Total=0
        w1Total=0
        for value in values:
            w0, w1 =  value.split(",")
            w0Total += float(w0)
            w1Total += float(w1)  
        self.modelStats[word] =  [w0Total, w1Total]

        #yield("JIMI "+word, [w0Total, w1Total])
    def reducer_final(self):       
        class0Total = 0
        class1Total = 0
        for k in self.modelStats.keys():
            if k != "TomsPriors":
                class0Total += self.modelStats[k][0]
                class1Total += self.modelStats[k][1]
        vocabularySize = len(self.modelStats.keys()) -1  #ignore TomsPriors
        #some yields to see some model internal parameters
        #yield ("defaultPrior 0 class", class0Total+vocabularySize)
        #yield ("defaultPrior 1 class", class1Total+vocabularySize)
        #yield ("count 0 class", class0Total)
        #yield ("count 1 class", class1Total)
        #yield ("vocabularySize", vocabularySize)
        
        #calculate priors 
        classCount0, classCount1 = self.modelStats.get("TomsPriors")
        del self.modelStats["TomsPriors"]
        total = classCount0 + classCount1
        yield("TomsPriors", ','.join(str(j) for j in [classCount0, classCount1, classCount0/total, classCount1/total])) 
        for k in self.modelStats.keys():
            yield(k, ','.join(str(j) for j in [self.modelStats[k][0],
                      self.modelStats[k][1],
                      self.modelStats[k][0] / class0Total,   
                      self.modelStats[k][1] / class1Total]))                         
                      # smoothing (self.modelStats[k][0] + 1) /(class0Total + vocabularySize), 
                      # smoothing (self.modelStats[k][1] +1)/(class1Total+vocabularySize)]))
                            
        #print(self.modelStats["assistance"])
        #print(vocabularySize)
        #print(class0Total, class1Total)
# The if __name__ == "__main__": 
# ... trick exists in Python so that our Python files 
# can act as either reusable modules, or as standalone programs.

if __name__ == '__main__':
    EnronNaiveBayesTrainer.run(),
                                               

Overwriting EnronNaiveBayesTrainer.py


In [49]:
!python EnronNaiveBayesTrainer.py chineseExample.txt

"TomsPriors"	"1.0,3.0,0.25,0.75"
"beijing"	"0.0,1.0,0.0,0.125"
"chinese"	"1.0,5.0,0.333333333333,0.625"
"tokyo"	"1.0,0.0,0.333333333333,0.0"
"shanghai"	"0.0,1.0,0.0,0.125"
"japan"	"1.0,0.0,0.333333333333,0.0"
"macao"	"0.0,1.0,0.0,0.125"


No configs found; falling back on auto-configuration
Creating temp directory c:\users\z030757\appdata\local\temp\EnronNaiveBayesTrainer.Z030757.20160712.160443.895000
Running step 1 of 1...
Streaming final output from c:\users\z030757\appdata\local\temp\EnronNaiveBayesTrainer.Z030757.20160712.160443.895000\output...
Removing temp directory c:\users\z030757\appdata\local\temp\EnronNaiveBayesTrainer.Z030757.20160712.160443.895000...


In [126]:
%%writefile MRNaiveBayesTrainer.py

"""An implementation of a multinomial Naive Bayes learner as an MRJob.
   This is meant as an example of why mapper_final is useful.
   
   This learning algorithm implementation can be further optimised. HOW?
   
   Use a cool pattern to do this!

"""
from mrjob.job import MRJob

class MRNaiveBayesTrainer(MRJob):

    def __init__(self, *args, **kwargs):
        super(MRNaiveBayesTrainer, self).__init__(*args, **kwargs)
        self.modelStats = {}
        self.classTotalFreq = [0, 0]
        self.vocab=0

    def jobconf(self):
        orig_jobconf = super(MRNaiveBayesTrainer, self).jobconf()        
        custom_jobconf = {
            'mapred.output.key.comparator.class': 'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
            'mapred.text.key.comparator.options': '-k1rn',
            'mapred.reduce.tasks': '1',
        }
        combined_jobconf = orig_jobconf
        combined_jobconf.update(custom_jobconf)
        self.jobconf = combined_jobconf
        return combined_jobconf

    def mapper(self, _, line):
        # Don't actually yield anything for each line. Instead, collect them
        # and yield the sums when all lines have been processed. The results
        # will be collected by the reducer.
        docID, docClass,text = line.split("\t",2)   
        words = text.split()
        vocab = {}
        if docID != "D5":  #skip doc d5 in chinese dataset
            if docClass == "1":
                yield("TomsPriors", "0,1")
                yield("*classTotalFreq", ("0, " + str(len(words))))
                for word in words:
                    vocab[word] = 1
                    yield(word, "0,1")
            else:
                yield("TomsPriors", "1,0")
                yield("*classTotalFreq", (str(len(words)) + ", 0"))
                for word in words:
                    vocab[word] = 1
                    yield(word, "1,0")
        for k in vocab.keys():
            yield "*!"+k, "1,0"
        

    def reducer(self, word, values):    
        #aggregate counts for Pr(Word|Class)
        #yield("number of values for "+word, str(values))
        w0Total=0
        w1Total=0
        c0Total=0
        c1Total=1
        for value in values:
            w0, w1 =  value.split(",")
            w0Total += float(w0)
            w1Total += float(w1)  
        if word == "*classTotalFreq":
            self.modelStats[word] = [w0Total, w1Total]
        elif word.startswith("*!"):
            self.vocab += 1
        elif word == "TomsPriors":
            yield("TomsPriors", ','.join(str(j) for j in [w0Total,w1Total,w0Total/(w0Total+w1Total),w1Total/(w0Total+w1Total)]))
        else:
            yield(word, ','.join(str(j) for j in [w0Total,w1Total,(w0Total+1)/(self.modelStats["*classTotalFreq"][0] + self.vocab),(w1Total+1)/(self.modelStats["*classTotalFreq"][1] + self.vocab)]))
        #yield("JIMI "+word, [w0Total, w1Total])"""


if __name__ == '__main__':
    MRNaiveBayesTrainer.run()

Overwriting MRNaiveBayesTrainer.py


In [127]:
!python MRNaiveBayesTrainer.py chineseExample.txt

"Beijing"	"0.0,1.0,0.111111111111,0.142857142857"
"Chinese"	"1.0,5.0,0.222222222222,0.428571428571"
"Japan"	"1.0,0.0,0.222222222222,0.0714285714286"
"Macao"	"0.0,1.0,0.111111111111,0.142857142857"
"Shanghai"	"0.0,1.0,0.111111111111,0.142857142857"
"Tokyo"	"1.0,0.0,0.222222222222,0.0714285714286"
"TomsPriors"	"1.0,3.0,0.25,0.75"


No configs found; falling back on auto-configuration
Creating temp directory c:\users\z030757\appdata\local\temp\MRNaiveBayesTrainer.Z030757.20160712.183607.015000
Running step 1 of 1...
Streaming final output from c:\users\z030757\appdata\local\temp\MRNaiveBayesTrainer.Z030757.20160712.183607.015000\output...
Removing temp directory c:\users\z030757\appdata\local\temp\MRNaiveBayesTrainer.Z030757.20160712.183607.015000...


In [128]:
#------------------------------------------------------------------------------------
# We have two ways to run the Naive Bayes algorithm
# 1. Run using the command line (shown Above)
# 2. Run using a MRJob Runner from python (very sweet way to do business). See Here
#------------------------------------------------------------------------------------
#HW 1.3
%reload_ext autoreload
%autoreload 2


from MRNaiveBayesTrainer import MRNaiveBayesTrainer 

# STEP 1: Train a mulitnomial Naive Bayes      
trainingData = 'chineseExample.txt'

# create an instance of the Trainer class
# and initiatialize it
mr_job = MRNaiveBayesTrainer(args=[trainingData])
modelStats={}
with mr_job.make_runner() as runner: 
    runner.run()
    # stream_output: get access to the output reducer/reducer_final of 
    # the last step in MRNaiveBayesTrainer
    for line in runner.stream_output():
        key,value =  mr_job.parse_output_line(line)
        print key, value
        modelStats[key] = value            
    # Store model locally
    with open('StatelessModel1.txt', 'w') as f:
        for k in modelStats.keys():
            f.writelines( k + "\t"+ str(modelStats[k]) +"\n")
print modelStats

Beijing 0.0,1.0,0.111111111111,0.142857142857
Chinese 1.0,5.0,0.222222222222,0.428571428571
Japan 1.0,0.0,0.222222222222,0.0714285714286
Macao 0.0,1.0,0.111111111111,0.142857142857
Shanghai 0.0,1.0,0.111111111111,0.142857142857
Tokyo 1.0,0.0,0.222222222222,0.0714285714286
TomsPriors 1.0,3.0,0.25,0.75
{'Beijing': '0.0,1.0,0.111111111111,0.142857142857', 'Chinese': '1.0,5.0,0.222222222222,0.428571428571', 'Tokyo': '1.0,0.0,0.222222222222,0.0714285714286', 'Shanghai': '0.0,1.0,0.111111111111,0.142857142857', 'TomsPriors': '1.0,3.0,0.25,0.75', 'Japan': '1.0,0.0,0.222222222222,0.0714285714286', 'Macao': '0.0,1.0,0.111111111111,0.142857142857'}


#### Classifier for MR Naive Bayes.  Hasn't been updated for stateless trainer above

In [129]:
%%writefile MRNaiveBayesClassifier.py
#HW 1.3
 
from mrjob.job import MRJob
import sys, re, string, operator, math, os


regex = re.compile('[%s]' % re.escape(string.punctuation))

class MRNaiveBayesClassifier(MRJob):


    def __init__(self, *args, **kwargs):
        super(MRNaiveBayesClassifier, self).__init__(*args, **kwargs)
        self.zeroProb = 0

    def jobconf(self):
        orig_jobconf = super(MRNaiveBayesClassifier, self).jobconf()        
        custom_jobconf = {
            'mapred.output.key.comparator.class': 'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
            'mapred.text.key.comparator.options': '-k1rn',
            'mapred.reduce.tasks': '1',
        }
        combined_jobconf = orig_jobconf
        combined_jobconf.update(custom_jobconf)
        self.jobconf = combined_jobconf
        return combined_jobconf    

    #load model from file; it has been sent from the master node to each worker node
    def mapper_init(self):
        self.modelStats = {}
        
        recordStrs = [s.split('\n')[0].split('\t') for s in open("StatelessModel1.txt").readlines()]
        for word, statsStr in recordStrs:
            self.modelStats[word] = map(float, statsStr.split(","))
        
        self.prC0 = math.log(self.modelStats["TomsPriors"][2])
        self.prC1 = math.log(self.modelStats["TomsPriors"][3])
        

    
    def mapper(self, _, line):
        
        docID, docClass,text = line.split("\t",2)
        text = text.strip()
        text = regex.sub(' ', text.lower())
        text = re.sub( '\s+', ' ', text )
        words = text.split()
        
        for word in words:
            p0 = self.modelStats[word][2]
            if self.modelStats[word][2] == 0.0:
                self.zeroProb += 1
            p1 = self.modelStats[word][3]
            if self.modelStats[word][3] == 0.0:
                self.zeroProb += 1
            wordGivenHam = math.log(p0) if p0>0.0 else math.log(1)
            wordGivenSpam = math.log(p1) if p1>0.0 else math.log(1)
            prHAMGivenDoc = self.prC0 + wordGivenHam
            prSPAMGivenDoc = self.prC1 + wordGivenSpam
        
        predictedClass = 1 #SPAM
        if(prHAMGivenDoc > prSPAMGivenDoc):
            predictedClass = 0 #HAM
        if int(docClass) == predictedClass:
            yield (docID, 0)  #no error
        else: 
            yield (docID, 1) # error    
        yield("zero", self.zeroProb)
    
    def combiner(self, word, values):
        for value in values:
            yield ("t", value)
            
    def reducer(self, word, values):
        zero = 0
        numberOfRecords = 0
        numberWrong = 0
        for value in values:
            if value > 1:
                zero = value
            else:    
                numberOfRecords += 1
                numberWrong += value
        #print (numberOfRecords, numberWrong)
        print ('Error rate: %.4f' %(1.0*numberWrong/float(numberOfRecords)))
        print ('Number Wrong %d, Total Records %d'  %(numberWrong, numberOfRecords))
        print ('number of word|class with 0 probability: %d' %(zero))



if __name__ == '__main__':
    MRNaiveBayesClassifier.run()



Overwriting MRNaiveBayesClassifier.py


In [130]:
!python MRNaiveBayesClassifier.py --jobconf mapred.reduce.tasks=1 chineseExample.txt --file=StatelessModel1.txt

No configs found; falling back on auto-configuration
Creating temp directory c:\users\z030757\appdata\local\temp\MRNaiveBayesClassifier.Z030757.20160712.183624.513000
Running step 1 of 1...
Traceback (most recent call last):
  File "MRNaiveBayesClassifier.py", line 92, in <module>
    MRNaiveBayesClassifier.run()
  File "C:\Anaconda2\lib\site-packages\mrjob\job.py", line 430, in run
    mr_job.execute()
  File "C:\Anaconda2\lib\site-packages\mrjob\job.py", line 448, in execute
    super(MRJob, self).execute()
  File "C:\Anaconda2\lib\site-packages\mrjob\launch.py", line 160, in execute
    self.run_job()
  File "C:\Anaconda2\lib\site-packages\mrjob\launch.py", line 230, in run_job
    runner.run()
  File "C:\Anaconda2\lib\site-packages\mrjob\runner.py", line 473, in run
    self._run()
  File "C:\Anaconda2\lib\site-packages\mrjob\sim.py", line 172, in _run
    self._invoke_step(step_num, 'mapper')
  File "C:\Anaconda2\lib\site-packages\mrjob\sim.py", line 259, in _invoke_step
    worki

In [161]:
import numpy as np

cube = 1
square = 0
n1 = -12
c = 1

x = -1
y = (x**3)-(12*x)+1

p = np.poly1d([cube,square,n1,c])
p2 = np.polyder(p)
print p2

m = p2(x)

b = -m*x+y


tanline = m*x + b
print tanline


print ("y = "+str(m)+" * "+str(x)+" + "+str(b))

   2
3 x - 12
12
y = -9 * -1 + 3


In [174]:
import numpy as np

cube = 1.0
square = 0.0
n1 = 2.0
c = -4.0
x = 1.0
y = x**3+2*x-4
p = np.poly1d([cube,square,n1,c])
p2 = np.polyder(p)

yprime = p2(x)

if y>0.001:
    nr = -1.0*(y/yprime)+x
    x = nr
else:
    print nr






1.2


In [10]:
#basic Newton-Raphson in python:
import numpy as np
from sympy import *
def f(x):
    return x**3 + 2*x - 4
def f2(x):
    return 3*x + 2

def nr_meth(a):
    nr = a
    i = 0
    cnt = 0
    while i != nr:
        i = nr
        cnt += 1
        n = f(nr)*1.0
        d = f2(nr)*1.0
        nr = -1.0* (n/d) + nr
        print cnt, nr

nr_meth(1)

1 1.2
2 1.17714285714
3 1.17978019902
4 1.17947792181
5 1.17951259166
6 1.17950861551
7 1.17950907152
8 1.17950901922
9 1.17950902522
10 1.17950902453
11 1.17950902461
12 1.1795090246
13 1.1795090246
14 1.1795090246
15 1.1795090246
16 1.1795090246
17 1.1795090246


In [16]:
from sympy import *
import numpy as np
x = Symbol('x')
y = x**2 + 1
yprime = y.diff(x)
yprime


2*x