# Machine Learning 3/5: Naive Bayes Classifier
https://www.youtube.com/watch?v=j3IGd5CjsVA

In [6]:
import math
import random
import re
import sys
from collections import Counter, defaultdict

In [7]:
def tokenize(text):
    '''Break up text into lowercase alphanumeric words.

    The text is lowercased before matching so that mixed-case words are
    kept whole ("Hello" -> "hello") instead of losing their capital
    letters.  Every maximal run of the characters a-z and 0-9 becomes one
    token; every other character acts as a separator.

    Returns the tokens in order of appearance (an empty list when the text
    contains no alphanumeric characters).
    '''
    return re.findall(r'[a-z0-9]+', text.lower())

In [8]:
def tokenize_title_body(title, body):
    '''Tokenize title and body into separate, prefixed feature spaces.

    Title tokens are tagged with "title:" and body tokens with "body:", so
    the same word occurring in both places yields two distinct features.
    Title tokens come first in the returned list.
    '''
    features = ["title:" + word for word in tokenize(title)]
    features.extend("body:" + word for word in tokenize(body))
    return features

Note: the tokens could also be built from bigrams (pairs of adjacent words) instead of single words.

In [9]:
def read_training_file(filename):
    '''Build class priors and per-class token counts from a training file.

    Each line is split on tabs; field 1 is used as the class label and
    fields 2 and 3 are passed to tokenize_title_body as title and body.

    Returns a (priors, likelihood) tuple: priors counts the lines seen per
    label, and likelihood[label][token] counts how often each prefixed
    token occurred in lines carrying that label.
    '''
    priors = Counter()
    likelihood = defaultdict(Counter)

    with open(filename) as training:
        for record in training:
            fields = record.split('\t')
            label = fields[1]
            priors[label] += 1
            # Title and body are tokenized into separate feature spaces.
            # Alternatives: tokenize field 2 only, or fields 2 and 3 joined
            # into a single text.
            for token in tokenize_title_body(fields[2], fields[3]):
                likelihood[label][token] += 1

    return (priors, likelihood)

In [10]:
# def classify(line, priors, likelihood):
#     '''Return a random category'''
#     categories = priors.keys()
#     return categories[int(random.random() * len(categories))]

In [11]:
def classify_random(line, priors, likelihood):
    '''Return a category chosen uniformly at random (baseline classifier).

    line and likelihood are ignored; they are accepted only so that this
    function has the same signature as the other classifiers.

    The keys of priors are copied into a list before choosing, because a
    dictionary key view cannot be indexed on Python 3.

    Raises IndexError if priors is empty.
    '''
    return random.choice(list(priors))

In [12]:
def classify_max_prior(line, priors, likelihood):
    '''Return the category with the highest prior count.

    Baseline classifier: line and likelihood are not consulted.
    '''
    most_frequent = max(priors, key=priors.get)
    return most_frequent

In [13]:
def classify_bayesian(line, priors, likelihood):
    '''Return the class that maximizes the (unnormalized) log posterior.

    line is a list of fields; line[2] is tokenized as the title and line[3]
    as the body.  priors maps each class to its count and likelihood[c]
    maps tokens to their counts within class c.

    Scores are summed in log space so that long texts do not underflow to
    zero, and each per-token probability is floored at 1E-9 so that a token
    never seen for a class does not produce log(0).  Title tokens are
    weighted double.

    Returns '' if priors is empty.
    Raises IndexError if line has fewer than four fields.
    '''
    # Tokenize once: the tokens do not depend on the class being scored.
    words = tokenize_title_body(line[2], line[3])

    # Start from -inf rather than a fixed floor so that the best class is
    # still returned when every score is extremely negative (long texts).
    max_class = (float('-inf'), '')
    for c in priors:
        p = math.log(priors[c])
        n = float(sum(likelihood[c].values()))
        for word in words:
            count = likelihood[c][word]
            if word.startswith('title'):
                count = 2 * count
            # A class without any training tokens gets frequency 0 (floored
            # below) instead of dividing by zero.
            frequency = count / n if n else 0.0
            p = p + math.log(max(1E-9, frequency))

        if p > max_class[0]:
            max_class = (p, c)

    return max_class[1]

In [14]:
def read_testing_file(filename):
    '''Read a tab-separated testing file into a list of field lists.

    Only the line terminator is removed before splitting, so empty leading
    or trailing fields (for example an empty body) are kept and every field
    stays at its expected position.  The file is closed as soon as it has
    been read.
    '''
    with open(filename) as f:
        return [line.rstrip('\r\n').split('\t') for line in f]

In [15]:
def main():
    '''Train on sys.argv[1], classify sys.argv[2], and print the accuracy.

    Every testing line is classified with classify_bayesian and the result
    is compared with the label stored in field 1 of that line.

    Exits with a usage message when fewer than two file arguments are given.
    '''
    if len(sys.argv) < 3:
        sys.exit("Usage: <script> TRAINING_FILE TESTING_FILE")
    training_file = sys.argv[1]
    testing_file = sys.argv[2]

    (priors, likelihood) = read_training_file(training_file)
    lines = read_testing_file(testing_file)

    # The counter has to exist before the loop increments it and before it
    # is printed (it may legitimately stay at zero).
    num_correct = 0
    for line in lines:
        # classify_random and classify_max_prior share this signature and
        # can be swapped in here as baselines.
        if classify_bayesian(line, priors, likelihood) == line[1]:
            num_correct += 1

    # An empty testing file reports an accuracy of 0 instead of dividing by zero.
    accuracy = float(num_correct) / len(lines) if lines else 0.0
    print("Classified %d correctly out of %d for an accuracy of %f"%(num_correct, len(lines), accuracy))

In [None]:
# Run the classifier only when this file is executed as a script.
if __name__ == '__main__':
    main()