In [None]:
## Naive Bayes for NLP

import re
import collections
import numpy as np
import math

def tokenize(message):
    """Return the set of distinct lowercase tokens found in `message`.

    A token is a maximal run of ASCII letters, digits and apostrophes;
    duplicates collapse because the result is a set.
    """
    lowered = message.lower()
    return {token for token in re.findall("[a-z0-9']+", lowered)}

def count_words(training_set):
    """Count, per word, how many spam and non-spam messages contain it.

    Args:
        training_set: iterable of (message, is_spam) pairs.

    Returns:
        A defaultdict mapping word -> [spam_count, non_spam_count].
        Looking up an unseen word yields (and stores) [0, 0].
    """
    counts = collections.defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        # index 0 accumulates spam messages, index 1 non-spam messages
        column = 0 if is_spam else 1
        for word in tokenize(message):
            counts[word][column] += 1
    return counts

def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Turn word counts into a list of (w, p(w | spam), p(w | ~spam)) triplets.

    Args:
        counts: mapping word -> (spam_count, non_spam_count).
        total_spams: number of spam messages the counts were taken from.
        total_non_spams: number of non-spam messages the counts were taken from.
        k: smoothing pseudo-count added to every count (and 2 * k to each
            total), which keeps the estimates away from exactly 0 or 1.

    Returns:
        A list with one (word, p_spam, p_non_spam) tuple per entry of `counts`.
    """
    return [(word,
             (spam + k) / (total_spams + 2 * k),
             (non_spam + k) / (total_non_spams + 2 * k))
            for word, (spam, non_spam) in counts.items()]

def spam_probability(word_probs, message):
    """Estimate the probability that `message` is spam.

    Every vocabulary word contributes: p(w | class) when the word occurs in
    the message and 1 - p(w | class) when it does not. The two classes are
    weighted equally (no prior term is applied).

    Args:
        word_probs: iterable of (word, p(w | spam), p(w | ~spam)) triplets.
        message: raw message text; it is tokenized with `tokenize`.

    Returns:
        A float in [0, 1].
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    # e^a / (e^a + e^b) == 1 / (1 + e^(b - a)). Exponentiating a and b
    # separately underflows to 0.0 for large vocabularies and ends in 0 / 0,
    # so only the difference of the log-likelihoods is exponentiated.
    log_odds_against_spam = log_prob_if_not_spam - log_prob_if_spam
    try:
        return 1.0 / (1.0 + math.exp(log_odds_against_spam))
    except OverflowError:
        # non-spam is overwhelmingly more likely than spam
        return 0.0

class NaiveBayesClassifier:
    """Naive Bayes spam classifier built on the module-level helper functions."""

    def __init__(self, k=0.5):
        """Store the smoothing constant `k`; the model starts with no words."""
        self.k = k
        self.word_probs = []

    def train(self, training_set):
        """Fit word probabilities from a sized collection of (message, is_spam) pairs."""
        spam_total = sum(1 for _, is_spam in training_set if is_spam)
        non_spam_total = len(training_set) - spam_total

        tallies = count_words(training_set)
        self.word_probs = word_probabilities(tallies, spam_total, non_spam_total, self.k)

    def classify(self, message):
        """Return the estimated probability that `message` is spam."""
        return spam_probability(self.word_probs, message)


from sklearn.model_selection import train_test_split
import pickle
import numpy as np

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
authors_file = "data/email_authors.pkl"
# `with` closes the file even when unpickling raises
with open(authors_file, "rb") as authors_file_handler:
    authors = pickle.load(authors_file_handler)

words_file = "data/word_data.pkl"
with open(words_file, "rb") as words_file_handler:
    word_data = pickle.load(words_file_handler)

# Hold out 10% of the documents for testing; fixed seed for reproducibility.
features_train, features_test, labels_train, labels_test = train_test_split(word_data, authors, test_size=0.1, random_state=42)

# NOTE(review): the labels come from the authors pickle but are consumed
# below as truthy/falsy is_spam flags — confirm the label encoding.
train_data = list(zip(features_train, labels_train))
test_data = list(zip(features_test, labels_test))

# Train on the first 100 training pairs only.
classifier = NaiveBayesClassifier()
classifier.train(train_data[:100])

# (text, true label, predicted spam probability) for the first 100 test pairs.
classified = [(subject, is_spam, classifier.classify(subject)) for subject, is_spam in test_data[:100]]

from numpy import linalg as LA

# First 30 training documents, used as the corpus for the TF-IDF demo below.
text = features_train[:30]

def vector_counts(text):
    """Build a document-by-term count matrix from a sequence of documents.

    Each document is tokenized with `tokenize`; rows of the returned array
    correspond to documents and columns to the distinct tokens encountered.
    """
    doc_total = len(text)
    per_word = collections.defaultdict(lambda: [0] * doc_total)
    for doc_index, document in enumerate(text):
        for token in tokenize(document):
            per_word[token][doc_index] += 1
    # one list per word -> transpose so that rows are documents
    return np.array(list(per_word.values())).T

def Tfidf_transformer(vector):
    """Convert a document-by-term count matrix into L2-normalised TF-IDF rows.

    idf(t) = ln(n_documents / document_frequency(t)) + 1, and every row is
    scaled to unit Euclidean length. Rows that are entirely zero (documents
    with no terms) are returned as zeros instead of NaN.

    Args:
        vector: 2-D array with one row per document and one column per term.
            Every column is expected to contain at least one non-zero entry.

    Returns:
        A float array with the same shape as `vector`.
    """
    vector = np.asarray(vector, dtype=float)
    n_d = vector.shape[0]
    df_t = np.count_nonzero(vector, axis=0)
    # float() forces true division even under Python 2 integer semantics
    idf_t = np.log(float(n_d) / df_t) + 1
    tf_idf = vector * idf_t[None, :]
    norms = LA.norm(tf_idf, axis=1)[:, None]
    # avoid 0 / 0 for empty documents; their rows stay all-zero
    norms[norms == 0] = 1.0
    return tf_idf / norms
    
# Display the TF-IDF matrix for the sample corpus (last expression of the cell).
Tfidf_transformer(vector_counts(text))

In [None]:
## from decision tree
# coding: utf-8

# In[164]:

from __future__ import print_function
import pandas as pd
from random import randrange


# In[123]:

# Column names; used by Question.__repr__ to print readable questions.
header = ["color", "diameter", "label"]
# Toy training set: each row is [color, diameter, label]; the label is last.
training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon']
]

# Show the training rows as a table.
pd.DataFrame(training_data, columns=header)


# In[124]:

def is_numeric(value):
    """Return True when `value` is an int or a float instance."""
    return isinstance(value, (int, float))


# In[125]:

## Ask question
class Question:
    """A test on one column of a row, used to split a dataset.

    Numeric row values are compared with `>=` against `value`; all other
    values are compared with `==`.
    """

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        """Return whether `example` satisfies this question."""
        candidate = example[self.column]
        if not is_numeric(candidate):
            return candidate == self.value
        return candidate >= self.value

    def __repr__(self):
        # the column name is looked up in the module-level `header` list
        operator = ">=" if is_numeric(self.value) else "=="
        return "Is %s %s %s?" % (header[self.column], operator, str(self.value))


# In[126]:

# Show the repr of a categorical question and a numeric question.
Question(0, 'red'), Question(1, 2)


# In[127]:

## Partition a dataset.
def partition(rows, question):
    """Split `rows` into (rows matching `question`, rows not matching it).

    The relative order of the rows is preserved within each list.
    """
    matched, unmatched = [], []
    for row in rows:
        bucket = matched if question.match(row) else unmatched
        bucket.append(row)
    return matched, unmatched


# In[128]:

# Split the training rows on "diameter >= 2" and print both halves.
q = Question(1, 2)
true_rows, false_rows = partition(training_data, q)
print("True rows: %s" %true_rows, "\nFalse rows: %s" %false_rows)


# In[129]:

## Count labels for each class
def class_counts(rows):
    """Return a dict mapping each label (last item of a row) to its frequency."""
    tally = {}
    for row in rows:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    return tally


# In[220]:

# Label frequencies of the whole training set.
A = class_counts(training_data)
print(A)


# In[222]:

# The label with the highest count.
max(set(A), key=A.get)


# In[131]:

## Gini impurity defined from -- https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity
## Gini_impurity = 1 - Sum(p**2)
def gini(rows):
    """Return the Gini impurity of `rows`: 1 minus the sum of squared label shares.

    With no rows there are no labels to subtract, so the result is 1.
    """
    label_counts = class_counts(rows)
    row_total = float(len(rows))
    impurity = 1
    for count in label_counts.values():
        share = count / row_total
        impurity -= share ** 2
    return impurity


# In[132]:

# Gini impurity of the full training set.
gini(training_data)


# In[133]:

## Information gain is used to decide which question to split on at each step in building the tree.
## Information gain = impurity(parent) - weighted average impurity(children), measured with Gini impurity.

def info_gain(left, right, current_uncertainty):
    """Return how much a split into `left` / `right` reduces the impurity.

    `current_uncertainty` is the impurity before the split; each child's
    Gini impurity is weighted by its share of the rows.
    """
    left_weight = float(len(left)) / (len(left) + len(right))
    weighted_child_impurity = left_weight * gini(left) + (1 - left_weight) * gini(right)
    return current_uncertainty - weighted_child_impurity


# In[134]:

# Gain obtained by splitting the training set on "color == Green".
current_uncertainty = gini(training_data)
true_rows, false_rows = partition(training_data, Question(0, 'Green'))
info_gain(true_rows, false_rows, current_uncertainty)


# In[135]:

## Find best split (question) with largest information gain
def find_best_split(rows):
    """Try every (column, value) question and return the best (gain, question).

    The last item of each row is the label and is not used as a feature.
    Questions that leave one side empty are ignored. When no question
    qualifies the result is (0, None).
    """
    parent_impurity = gini(rows)
    feature_count = len(rows[0]) - 1
    top_gain, top_question = 0, None

    for column in range(feature_count):
        # distinct values seen in this column
        for candidate in {row[column] for row in rows}:
            question = Question(column, candidate)
            matched, unmatched = partition(rows, question)
            if not matched or not unmatched:
                # the question does not actually divide the rows
                continue
            gain = info_gain(matched, unmatched, parent_impurity)
            if gain >= top_gain:
                top_gain, top_question = gain, question

    return top_gain, top_question


# In[138]:

# Best first split for the toy training set.
best_gain, best_question = find_best_split(training_data)
best_gain, best_question


# In[139]:

## Define leaf and node
class Leaf:
    """Terminal tree node holding the label counts of the rows that reached it."""

    def __init__(self, rows):
        label_tally = class_counts(rows)
        self.predictions = label_tally
        
class Decision_Node:
    """Internal tree node: a question plus the subtrees for its two answers."""

    def __init__(self, question, true_branch, false_branch):
        self.question, self.true_branch, self.false_branch = (
            question, true_branch, false_branch)


# In[140]:

## Build the tree, split until the information gain = 0
## This is the main part
def build_tree(rows):
    """Recursively grow a decision tree for `rows`.

    Splitting stops (a Leaf is returned) as soon as the best available
    question yields zero information gain.
    """
    gain, question = find_best_split(rows)
    if gain != 0:
        matched, unmatched = partition(rows, question)
        # the true branch is built first, then the false branch
        return Decision_Node(question, build_tree(matched), build_tree(unmatched))
    return Leaf(rows)


# In[141]:

## Print the whole tree
def print_tree(node, spacing=""):
    """Print the tree rooted at `node`, indenting each level by two spaces."""
    if not isinstance(node, Leaf):
        child_indent = spacing + "  "
        print (spacing + str(node.question))
        print (spacing + '--> True:')
        print_tree(node.true_branch, child_indent)
        print (spacing + '--> False:')
        print_tree(node.false_branch, child_indent)
    else:
        print (spacing + "Predict", node.predictions)


# In[156]:

# Fit a tree on the toy training set.
my_tree = build_tree(training_data)


# In[151]:

# Print the fitted tree.
print_tree(my_tree)


# In[157]:

def classify(row, node):
    """Walk the tree from `node` following `row`'s answers; return the leaf's label counts."""
    while not isinstance(node, Leaf):
        node = node.true_branch if node.question.match(row) else node.false_branch
    return node.predictions


# In[210]:

# Label counts of the leaf reached by one example row.
classify(['Green', 3, 'Apple'], my_tree)


# In[158]:

def print_leaf(counts):
    """Convert a leaf's label counts into percentage strings such as "50%".

    Percentages are truncated to whole numbers.
    """
    total = sum(counts.values()) * 1.0
    return {label: str(int(count / total * 100)) + "%"
            for label, count in counts.items()}


# In[159]:

# Held-out rows in the same [color, diameter, label] layout as training_data.
testing_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 4, 'Apple'],
    ['Red', 2, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon']]


# In[160]:

# Compare each row's true label with the tree's predicted label shares.
for row in testing_data:
    print ("Actual: %s. Predicted: %s" %
           (row[-1], print_leaf(classify(row, my_tree))))


# In[244]:

## Bagging
def subsample(dataset, n_sample):
    """Draw rows from `dataset` uniformly at random, with replacement.

    Args:
        dataset: non-empty sequence of rows (may be empty if n_sample <= 0).
        n_sample: number of rows to draw; rows are drawn until the sample
            holds at least this many.

    Returns:
        A new list of sampled rows (the same row may appear more than once).
    """
    sample = []
    while len(sample) < n_sample:
        sample.append(dataset[randrange(len(dataset))])
    return sample

def bagging_classify(row, trees):
    """Classify `row` by pooling the predictions of every tree in `trees`.

    The label counts of the leaf each tree reaches are added together and
    the label with the largest pooled count is returned.

    Raises:
        ValueError: if `trees` is empty (there is nothing to vote on).
    """
    votes = {}
    for tree in trees:
        for label, count in classify(row, tree).items():
            # accumulate rather than overwrite, so every tree's counts matter
            votes[label] = votes.get(label, 0) + count
    # pick by pooled count, not by the alphabetical order of the labels
    return max(votes, key=votes.get)


# In[245]:

def bagging(train, test, sample_size, n_trees):
    """Fit `n_trees` trees on random subsamples of `train` and label each row of `test`.

    Every fitted tree is printed as it is built. Returns one predicted
    label per test row.
    """
    forest = []
    for _ in range(n_trees):
        tree = build_tree(subsample(train, sample_size))
        print_tree(tree)
        forest.append(tree)
    return [bagging_classify(row, forest) for row in test]


# In[249]:

# Two trees, each fit on a random subsample of 3 training rows.
bagging(training_data, testing_data, 3, 2)


# In[ ]:


## Neural network from scratch
## Create an XOR ("or, but not and") gate with a neural network

import math

def dot(v, w):
    """Return the dot product v_1 * w_1 + ... + v_n * w_n.

    Pairs are formed with zip, so extra items of the longer input are ignored.
    """
    total = 0
    for v_i, w_i in zip(v, w):
        total += v_i * w_i
    return total

def step_function(x):
    """Threshold activation: 1 when `x` is non-negative, otherwise 0.

    NOTE(review): perceptron_output called `step_function`, but no definition
    exists in this notebook; a threshold at 0 is assumed here — confirm.
    """
    return 1 if x >= 0 else 0


def perceptron_output(weights, bias, x):
    """returns 1 if the perceptron 'fires', 0 if not

    The perceptron fires when the weighted sum of `x` plus `bias` passes
    `step_function`.
    """
    calculation = dot(weights, x) + bias
    return step_function(calculation)

def sigmoid(t):
    """Return the logistic function 1 / (1 + e^-t) without overflowing.

    For negative `t` the algebraically equal form e^t / (1 + e^t) is used,
    so the exponential is never taken of a large positive number (which
    raises OverflowError in math.exp).
    """
    if t >= 0:
        return 1 / (1 + math.exp(-t))
    exp_t = math.exp(t)
    return exp_t / (1 + exp_t)

def neuron_output(weights, inputs):
    """Return the sigmoid of the dot product of `weights` and `inputs`."""
    weighted_sum = dot(weights, inputs)
    return sigmoid(weighted_sum)

def feed_forward(neural_network, input_vector):
    """Propagate `input_vector` through the network and return every layer's outputs.

    `neural_network` is a list of layers, each a list of neurons, each a
    list of weights whose last entry multiplies a constant bias input of 1.
    The result has one output list per layer; the last one is the
    network's final output.
    """
    layer_outputs = []
    current = input_vector
    for layer in neural_network:
        biased = current + [1]  # append the bias input
        current = [neuron_output(neuron, biased) for neuron in layer]
        layer_outputs.append(current)  # this layer's output feeds the next
    return layer_outputs

# Hand-picked weights; the last weight of every neuron is its bias weight.
xor_network = [# hidden layer
               [[20, 20, -30],
                [20, 20, -10]],
               # output layer
               [[-60, 60, -30]]]
# Print the network's final-layer output for every pair of binary inputs.
for x in [0, 1]:
    for y in [0, 1]:
        # print() call: the statement form is a SyntaxError on Python 3
        print(x, y, feed_forward(xor_network, [x, y])[-1])

## Recognize a digit by neural network.

import random

## backpropagation
def backpropagate(network, input_vector, targets):
    """Run one gradient step on a two-layer network, updating its weights in place.

    Args:
        network: [hidden_layer, output_layer]; each layer is a list of
            neurons and each neuron a mutable list of weights whose last
            entry is the bias weight.
        input_vector: list of input values (without the bias input).
        targets: desired value for each output neuron.

    Returns:
        None; `network` is modified in place.
    """
    hidden_outputs, outputs = feed_forward(network, input_vector)

    # the output * (1 - output) is from the derivative of sigmoid
    output_deltas = [output * (1 - output) * (output - target)
                     for output, target in zip(outputs, targets)]

    # adjust weights for output layer, one neuron at a time
    for i, output_neuron in enumerate(network[-1]):
        for j, hidden_output in enumerate(hidden_outputs + [1]):
            output_neuron[j] -= output_deltas[i] * hidden_output

    # back-propagate errors to hidden layer; reads the output layer from
    # `network` itself (not a module-level global), and — as before — sees
    # the output weights that were just updated above
    hidden_deltas = [hidden_output * (1 - hidden_output) *
                     dot(output_deltas, [n[i] for n in network[-1]])
                     for i, hidden_output in enumerate(hidden_outputs)]

    # adjust weights for hidden layer, one neuron at a time
    for i, hidden_neuron in enumerate(network[0]):
        for j, input_value in enumerate(input_vector + [1]):
            hidden_neuron[j] -= hidden_deltas[i] * input_value
            
            
random.seed(0)    # fixed seed so the random initial weights are reproducible
input_size = 25   # number of values in each input vector
num_hidden = 5    # neurons in the hidden layer
output_size = 10  # neurons in the output layer

# One flat 25-value list per training example; inputs[j] is paired with the
# one-hot target for class j in the training loop. Each list reads as a
# 5x5 bitmap, written here one row of five values per line.
inputs =  [[1,1,1,1,1,  # digit 0
            1,0,0,0,1,  
            1,0,0,0,1,  
            1,0,0,0,1,  
            1,1,1,1,1], 
           
           [0,0,1,0,0,  # digit 1
            0,0,1,0,0,  
            0,0,1,0,0,  
            0,0,1,0,0,  
            0,0,1,0,0], 
           
           [1,1,1,1,1,  # digit 2
            0,0,0,0,1,  
            1,1,1,1,1,  
            1,0,0,0,0,  
            1,1,1,1,1], 
           
           [1,1,1,1,1,  # digit 3
            0,0,0,0,1,  
            1,1,1,1,1,  
            0,0,0,0,1,  
            1,1,1,1,1], 
           
           [1,0,0,0,1,  # digit 4
            1,0,0,0,1,  
            1,1,1,1,1,  
            0,0,0,0,1,  
            0,0,0,0,1], 
           
           [1,1,1,1,1,  # digit 5
            1,0,0,0,0,  
            1,1,1,1,1,  
            0,0,0,0,1,  
            1,1,1,1,1],
           
           [1,1,1,1,1,  # digit 6
            1,0,0,0,0,  
            1,1,1,1,1,  
            1,0,0,0,1,  
            1,1,1,1,1], 
           
           [1,1,1,1,1,  # digit 7
            0,0,0,0,1,  
            0,0,0,0,1,  
            0,0,0,0,1,  
            0,0,0,0,1],
           
           [1,1,1,1,1,  # digit 8
            1,0,0,0,1,  
            1,1,1,1,1,  
            1,0,0,0,1,  
            1,1,1,1,1],

           [1,1,1,1,1,  # digit 9
            1,0,0,0,1,  
            1,1,1,1,1,  
            0,0,0,0,1,  
            1,1,1,1,1]]

# One-hot targets: targets[j] has a 1 at position j and 0 elsewhere.

targets = [[1 if i == j else 0 for i in range(10)]
              for j in range(10)]

# Each hidden neuron has one random weight per input plus one bias weight.
hidden_layer = [[random.random() for _ in range(input_size + 1)]
                for _ in range(num_hidden)]

# Each output neuron has one random weight per hidden neuron plus one bias weight.
output_layer = [[random.random() for _ in range(num_hidden + 1)]
                for _ in range(output_size)]

network = [hidden_layer, output_layer]

# 10000 passes over the data, updating the weights after every single example.
for _ in range(10000):
    for input_vector, target_vector in zip(inputs, targets):
        backpropagate(network, input_vector, target_vector)
# backpropagate takes one example at a time, not the whole data set
        
def predict(input):
    """Return the final-layer outputs of the module-level `network` for `input`."""
    layer_outputs = feed_forward(network, input)
    return layer_outputs[-1]

# Output activations for the third training example (index 2).
print(predict(inputs[2]))

# Reference: Data Science from Scratch, Joel Grus