# Settings

In [20]:
import pandas as pd
import numpy as np

# 0. Bigram Model (Chapter 3)

In [6]:
class Bigram:
    def __init__(self):
        # Pseudowords
        self.pseudo_init = '<s>'; self.pseudo_termin = '</s>'
        self.corpus_sequence = []
        self.token_count = {self.pseudo_init:0, self.pseudo_termin:0}
        self.bigram_count = {self.pseudo_init:{self.pseudo_init:0, self.pseudo_termin:0}, 
                             self.pseudo_termin:{self.pseudo_init:0, self.pseudo_termin:0}}

    """ 
        Preprocessing
            - preprocess any paragraph into a sequence of tokens
    """
    def preprocess_corpus(self, paragraph):
        corpus_sequence = []
        for sentence in (paragraph.split('.')): # Divide by sentences
            if (sentence == '') : corpus_sequence.append(self.pseudo_termin); break # Ignore the void sentence after the last period
            corpus_sequence.append(self.pseudo_init) # Add a pseudo_init before every sentence.
            for word in ((sentence.strip()).split(' ')): # Remove spaces and divide by words
                corpus_sequence.append(word) # Add all word to the sequence
            corpus_sequence.append(self.pseudo_termin) # Add a pseudo_termin after every sentence
        return corpus_sequence[:-1] # In case the given paragraph doesn't end with a full sentence
    
    def preprocess_target(self, target):
        target_sequence = []
        for token in target.split(' '):
            target_sequence.append(token)
        return target_sequence
                
                
            
    """
        Construct Bigram Count
            - from the corpus sequence, construct bigram count table
            - self.corpus_sequence should have been constructed in advance
    """
    def count_bigram(self):
        prev_token = self.pseudo_init; self.token_count[self.pseudo_init] += 1;
        for token in self.corpus_sequence[1:]:
            if token in self.token_count:
                self.token_count[token] += 1
            else: 
                self.token_count[token] = 1
                # Add a new row(of token) to the bigram_count graph
                self.bigram_count[token] = {token:0}
                for existing_token in self.bigram_count.keys():
                    self.bigram_count[token][existing_token] = 0
                    # Add a new column to the bigram_count graph
                    self.bigram_count[existing_token][token] = 0
            self.bigram_count[prev_token][token] += 1
            prev_token = token
    
    def display_bigram_count(self):
        display(pd.DataFrame(self.bigram_count).transpose())
    
    def train(self, corpus, show_table=False):
        self.corpus_sequence = self.preprocess_corpus(corpus)
        self.count_bigram()
        if (show_table == True):
            self.display_bigram_count()
    
    def calculate_bigram(self, target):
        # 1. Input Processing
        target_sequence = self.preprocess_target(target)
        
        # 2. Bigram Count
        #V = len(self.token_count.keys()) # Number of tokens (including pseudowords)
        
        n = len(target_sequence)
        
        p_total = 1
        for k in range(1,n):
            w_k = target_sequence[k]; w_kminus1 = target_sequence[k-1]
            p_k_given_kminus1 = self.bigram_count[w_kminus1][w_k] / self.token_count[w_kminus1]
            p_total *= p_k_given_kminus1
        
        return p_total

## 0.1 Train

In [7]:
corpus = "I am Sam. Sam I am. I do not like green eggs and ham."

bigram_model = Bigram()
bigram_model.train(corpus, show_table=True)

Unnamed: 0,<s>,</s>,I,am,Sam,do,not,like,green,eggs,and,ham
<s>,0,0,2,0,1,0,0,0,0,0,0,0
</s>,2,0,0,0,0,0,0,0,0,0,0,0
I,0,0,0,2,0,1,0,0,0,0,0,0
am,0,1,0,0,1,0,0,0,0,0,0,0
Sam,0,1,1,0,0,0,0,0,0,0,0,0
do,0,0,0,0,0,0,1,0,0,0,0,0
not,0,0,0,0,0,0,0,1,0,0,0,0
like,0,0,0,0,0,0,0,0,1,0,0,0
green,0,0,0,0,0,0,0,0,0,1,0,0
eggs,0,0,0,0,0,0,0,0,0,0,1,0


## 0.2 Test

In [9]:
targets = ["<s> I", "<s> Sam", "I am",
           "Sam </s>", "am Sam", "I do"]

for target in targets:
    print("P(" + target.split()[-1] + "|" + target.split()[0] + ") = " + str(round(bigram_model.calculate_bigram(target),2)))

P(I|<s>) = 0.67
P(Sam|<s>) = 0.33
P(am|I) = 0.67
P(</s>|Sam) = 0.5
P(Sam|am) = 0.5
P(do|I) = 0.33


# 1. Naive Bayes Classifier (Chapter 4)

In [175]:
class NaiveBayes:
    def __init__(self):
        import pandas as pd; import numpy as np; import math
        self.classes = set(); self.features = set()
        self.class_cnt = {}; self.feature_cnt = {}
        self.data_cnt = None; self.X_shape = None;
        
    def fit(self, X_train, y_train):
        self._count(X_train, y_train)
    
    def predict(self, X_test):
        # 0. Validate Input
        if (X_test.shape[1:] != self.X_shape):
            raise Exception("Match the data size of X_test")
        y_predict = []
        for X in X_test:
            y_predict.append(self._predict_single(X))
        return y_predict
    
    def _predict_single(self, X):
        probability = {}
        for label in self.classes:
            probability[label] = self._log_probability(X, label)
        return max(probability, key=probability.get)
    
    """
        Count the number of..
        
        1. Classes throughout the whole dataset
            - update self.class_cnt
        
        2. Feature occurence per class
            - update self.feature_cnt
    """
    def _count(self, X_train, y_train):
        # 0. Validate Input
        self.data_cnt = len(y_train);
        if (len(X_train) != self.data_cnt): 
            raise Exception("Match the data size of X_train and y_train")
        else:
            self.X_shape = X_train.shape[1:]
        
        for i in range(self.data_cnt):
            X = X_train[i]; y = y_train[i]
            
            ### [Update Class Count] ###################################################
            
            # 1. Check if a new class appears
            if (y not in self.classes):
                # 1.1 Add the class to class set and self.class_cnt
                self.classes.add(y) 
                self.class_cnt[y] = 0
                # 1.2 Create a column for the class in count table
                self.feature_cnt[y] = dict(zip(list(self.features), np.zeros(len(self.features), dtype=int))) 
            
            # 2. Update the class count
            self.class_cnt[y] += 1
            
            ############################################################################
            
            ### [Update Feature Count] #################################################
            
            for x_i in X:
                # 1. Check if a new feature appears
                if (x_i not in self.features):
                    # 1.1 Add the feature to the feature set
                    self.features.add(x_i)
                    # 1.2 Add a row for the feature in count table
                    for _class in self.feature_cnt.keys():
                        self.feature_cnt[_class][x_i] = 0
                        
                # 2. Update the feature count
                self.feature_cnt[y][x_i] += 1
            
            ############################################################################
        return
    
    """
        Compute P(X|y)
        
        1. term1: P(y)
        2. term2: P(X|y) -> with Laplace Smoothing
            
    """
    def _log_probability(self, X, y):
        term1 = math.log(self.class_cnt[y] / self.data_cnt, 10)
        
        term2 = 0
        features_sum = sum(self.feature_cnt[y].values()) + len(self.features)
        for x_i in X:
            try:
                term2 += math.log(self.feature_cnt[y][x_i] + 1, 10)
            except:
                term2 = 0
        
        return term1 + term2

In [176]:
X_train = np.array([["A", "B", "C"], ["A", "B", "B"], ["D", "F", "A"], ["C", "B", "B"]])
y_train = np.array(["Pass", "Pass", "Fail", "Fail"])

X_test1 = np.array([["A", "C", "B"],
                    ["A", "C", "X"],  # Unseen data -> smoothed by Laplace
                    ["F", "D", "B"],
                    ["X", "Y", "Z"]]) # Unseen data -> smoothed by Laplace

In [177]:
classifier = NaiveBayes()
classifier.fit(X_train, y_train)

classifier.predict(X_test1)

['Pass', 'Fail', 'Fail', 'Fail']

# 2. Logistic Regression Classifier (Chapter 5)

In [None]:
class LogisticRegression:
    def __init__(self):
        self.classes = set()
    
    def fit(self, X_train, y_train):
        