# **``IMDB sentiment analysis with scikit-learn & custom algorithms``**

## **`0. Import modules`**

In [1]:
import tensorflow as tf
import numpy as np
import math
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt 
from sklearn.model_selection import learning_curve
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd 
import random
from tensorflow.keras.utils import plot_model

## **`1. Fetch data`**

In [2]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data()

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size= 0.1)

word_index = tf.keras.datasets.imdb.get_word_index()

index2word = dict((i + 3, word) for (word, i) in word_index.items())

index2word[0] = '[pad]'
index2word[1] = '[bos]'
index2word[2] = '[oov]'

x_train = np.array([' '.join([index2word[idx] for idx in text]) for text in x_train])
x_val = np.array([' '.join([index2word[idx] for idx in text]) for text in x_val])
x_test = np.array([' '.join([index2word[idx] for idx in text]) for text in x_test])

## **`2. Create Vocabulary`**

In [3]:
def create_voc(x_train, y_train, n, k, m):
    '''
    Creates the vocabulary

    Args:
        x_train(numpy.array): the training data
        y_train(numpy.array): the category of every training data
        n(int): the most frequent n words
        m(int): the final m words that we are going to keep
        k(int): the least frequent words
    '''

    voc_dict = dict()
    for text in x_train:
        tokens = set(text.split())
        for token in tokens:
            if token in voc_dict:
                voc_dict[token] += 1
            else:
                voc_dict[token] = 1
    
    #Skip the most n frequent & k least words:
    voc = sorted(voc_dict.items(), key = lambda x:x[1])
    voc = voc[k:len(voc) - n]
    return np.array([x[0] for x in voc[len(voc) - m:]])

## **`3. Information Gain & Entropy`**

In [4]:
def IG(class_, feature):
  classes = set(class_)

  Hc = 0
  for c in classes:
    pc = list(class_).count(c)/len(class_)
    Hc += - pc * math.log(pc, 2)
  #print('Overall Entropy:', Hc)
  feature_values = set(feature)

  Hc_feature = 0
  for feat in feature_values:
    
    #pf --> P(X=x)
    pf = list(feature).count(feat)/len(feature)
    indices = [i for i in range(len(feature)) if feature[i] == feat]
    clasess_of_feat = [class_[i] for i in indices]
    for c in classes:
        #pcf --> P(C=c|X=x)
        pcf = clasess_of_feat.count(c)/len(clasess_of_feat)
        if pcf != 0:
            # - P(X=x) * P(C=c|X=x) * log2(P(C=c|X=x))
            temp_H = - pf * pcf * math.log(pcf, 2)
            #sum for all values of C (class) and X (values of specific feature)
            Hc_feature += temp_H
  ig = Hc - Hc_feature
  return ig    

## **`4. Create binary vectors`** 

In [5]:
x_train_binary = list()
x_test_binary = list()
x_val_binary = list()

vocabulary = create_voc(x_train, y_train, 50, 1000, 3000)

for text in tqdm(x_train):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_train_binary.append(binary_vector)

x_train_binary = np.array(x_train_binary)

for text in tqdm(x_test):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_test_binary.append(binary_vector)

x_test_binary = np.array(x_test_binary)

for text in tqdm(x_val):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_val_binary.append(binary_vector)
  
x_val_binary = np.array(x_val_binary)

100%|██████████| 22500/22500 [09:05<00:00, 41.24it/s]
100%|██████████| 25000/25000 [10:37<00:00, 39.20it/s] 
100%|██████████| 2500/2500 [00:51<00:00, 48.72it/s]


##  **`5.0 Random Forest`🌳🌳🌳🌳🌳🌳**

In [6]:
class Random_Forest:

    def __init__(self, num_trees, m, max_depth):
        '''
        Initialize the variables
        
        Args:
            num_trees(int): number of trees
            m(int): number of features
            max_depth(int): max_depth of every tree
        '''
        self.m = m
        self.num_trees = num_trees
        self.max_depth = max_depth
        self.ls_trees = []

    def subsample(self, x_data, y_data):
        '''
        Shuffle the samples.There can be also duplicate samples

        Args:
            x_data(np.array): the training data
            y_data(np.array): the category of every training data
        
        Returns:
            x_sample(np.array)
            y_sample(np.array)
        '''
        x_sample = list()
        y_sample = list()
        num_non_selected = [x for x in range(len(y_data))]
        num_selected = []

        for i in range(len(x_data)):
            random_choice = random.choice(num_non_selected)
            num_selected.append(random_choice)
            x_sample.append(x_data[random_choice])
            y_sample.append(y_data[random_choice])

        x_sample = np.array(x_sample)
        y_sample = np.array(y_sample)

        return x_sample, y_sample

    def subfeature(self, x_data):
        '''
        Shuffle the features.No duplicate feature

        Args:
            x_data(np.array): training examples

        Returns:
            feat_selected(list): the subset of features
        '''
        ls_feature = [x for x in range(len(x_data[0]))]
        feat_selected = []
        
        for i in range(self.m):
            random_f = random.choice(ls_feature)
            ls_feature.remove(random_f)
            feat_selected.append(random_f)
        
        return feat_selected
    
    def fit(self, x_train_b, y_train_b):
        '''
        Create num_trees ID3 Trees

        Args:
            x_train_b(np.array): training examples
            y_train_b(np.array): category of every training example
            max_depth(int): the max depth of every tree
        '''
        self.ls_trees.clear()
        for i in range(self.num_trees):
            id3 = ID3(self.max_depth)
            random_x, random_y = self.subsample(x_train_b, y_train_b)
            id3.tree = id3.fitting_tree(random_x, random_y, self.subfeature(random_x), 0)
            self.ls_trees.append(id3)

    def predict(self, x_test_b):
        '''
        Predict every category for every example in the given dataset with the help of the trained trees

        Args:
            x_test_b(np.array): training examples

        Returns:
            y_test_b(np.array): the category of every given example in the x_test_b
        '''
        y_test_b = []
        sum = 0
        for i in range(len(x_test_b)):
            sum += 1
            num_0 = 0
            num_1 = 0
            for k in range(self.num_trees):
                id3 = self.ls_trees[k]
                pre_category = id3.predict_sample(x_test_b[i], id3.tree)
                if pre_category == 1:
                    num_1 += 1 
                else:
                    num_0 += 1

            if num_1 > num_0:
                y_test_b.append(1) 
            else:
                y_test_b.append(0)

        y_test_b = np.array(y_test_b)
        return y_test_b

### Find the most optimized number of trees for Random Forest
- ##### Every tree will have depth equal to 5
- ##### Every tree will have 500 words to compare
Comparing 4 different random forest classifiers in order to find the most optimized number of trees.The best_acc variable stores the highest accuracy score and the best_num stores the most optimized number of trees which we will train the final algorithm.Every model has different number of trees with the same depth and number of features.

In [None]:
num_of_trees = [x for x in range(1,5,2)]

best_acc = 0.0
best_num = 1
for num_tree in num_of_trees:
    rf = Random_Forest(num_trees=num_tree, max_depth= 5, m= 500)
    rf.fit(x_train_binary, y_train)
    acc = accuracy_score(y_val, rf.predict(x_val_binary))
    if acc > best_acc:
        best_acc = acc
        best_num = num_tree

print("The best number of trees is: ", best_num)


### Find the most optimized number of features for Random Forest
- ##### Every tree will have a depth equal to 3
- ##### Every estimator will have the same number of trees
Comparing different random forest classifiers in order to find the most optimized number of features.The best_acc variable stores the highest accuracy score and the best_m stores the most optimized number of trees which we will train the final algorithm.Every model has the same number of trees with the same depth, but different number of features.

In [None]:
num_of_m = [100, 200, 300, 500]

best_acc = 0.0
best_m = 0

for num_m in num_of_m:
    rf = Random_Forest(num_trees= 3, max_depth= 3, m= num_m)
    rf.fit(x_train_binary, y_train)
    acc = accuracy_score(y_val, rf.predict(x_val_binary))
    print("finished for: ", num_m)
    if acc > best_acc:
        best_acc = acc
        best_m = num_m

print("The best number of features is: ", best_m)

## **`5.1 ID3`🌳**

In [7]:
class ID3_Tree:
    def __init__(self):
        self.tag = None
        self.feature = 'None'
        self.child_nodes = []
        self.decision = -1

    def is_leaf(self):
        if self.decision != -1:
            return True 
        else:
            return False

    def _create_child(self, node):
        self.child_nodes.append(node)

class ID3:
    def __init__(self, labels, max_depth = 10, dc=0):
        '''
        Initialization

        Args:
            labels(set): the labels in a set.Every label is a index of a column
            max_depth(int): the max_depth to counter overfitting
            dc(int): pre-decision
        '''
        self.max_depth = max_depth
        self.tree = None
        self.labels = labels
        self.dc = dc 
        
    def most_IG(self, x_train_i, y_train_i, labels):
        '''
        Finds the feature that is the best classifier

        Args:
            x_train_i(numpy.array): the training data 
            y_train_i(numpy.array): the category for every example in the training data
            labels(set): every index(column) of every feature 

        Returns:
            max_feature(int): the index with the most valuable feature in the set labels.It presents the specific column
        '''
        max_gain = -1
        max_feature = -1

        for f in labels:
            #The training examples of a specific column (a column with a constant feature)
            x_feature = [x_train_i[example][f] for example in range(len(x_train_i))]
            feature_ig = IG(y_train_i, x_feature)
            
            if (feature_ig > max_gain):
                max_gain = feature_ig
                max_feature = f

        return max_feature

    def fit(self, x_train_b, y_train_b):
        self.tree = self.fitting_tree(x_train_b, y_train_b, self.labels, self.dc)

    def fitting_tree(self, x_train_b, y_train_b, labels, dc, depth = 0):
        '''
        Creates a decision tree with the given training data

        Args: 
            x_train_b(numpy.array): the training data with every example
            y_train_b(numpy.array): the category of every example in the training data
            labels(set): every index(column) of every feature 
            dc(int): the pre-decided category.It is 0 or 1

        Returns:
            tree(ID3_Tree): the tree that will decide the category for every case
        '''
        uniques, counts = np.unique(y_train_b, return_counts=True)
        #If there is no available training example
        if len(y_train_b) == 0:
            #Return the pre-decided category because there are no more available examples
            node = ID3_Tree()
            node.decision = dc 
            return node 

        #If there is only one category
        if len(set(y_train_b)) == 1:
            #Return the category that has been left
            node = ID3_Tree()
            node.decision = y_train_b[0]
            return node

        #If there is no feature to create new nodes
        if len(labels) == 0:
            #Return the category with the highest frequency
            node = ID3_Tree()
            node.decision = 0 if counts[0] > counts[1] else 1
            return node

        #Pruning

        #Reach Max Depth
        if depth == self.max_depth:
            node = ID3_Tree()
            if counts[0] > counts[1]:
                node.decision = 0
            elif counts[0] < counts[1]:
                node.decision = 1
            else:
                node.decision = dc
            return node

        #If the positives are 95% or above
        if float(counts[1])/float(len(y_train_b)) >= 0.95:
            node = ID3_Tree()
            node.decision = 1
            return node
            
        #If the negatives are 95% or above
        if float(counts[0])/float(len(y_train_b)) >= 0.95:
            node = ID3_Tree()
            node.decision = 0
            return node

        #Root node->feature
        best_feat = self.most_IG(x_train_b, y_train_b, labels)
        tree = ID3_Tree()
        
        #Find the most frequent category among the given training examples
        if counts[0] > counts[1]:
            m = 0
        elif counts[0] < counts[1]:
            m = 1
        else:
            m = dc

        #Find every possible value of the given feature
        f_set = set()
        for eg in range(len(x_train_b)):
            f_set.add(x_train_b[eg][best_feat])

        #print('---->',labels.copy().remove(best_feat))
        for bf_value in f_set:
            #Examples with the best_feature == bf_value
            x_examples = np.array([x_train_b[eg] for eg in range(len(x_train_b)) if x_train_b[eg][best_feat] == bf_value])
            y_examples = np.array([y_train_b[eg] for eg in range(len(x_train_b)) if x_train_b[eg][best_feat] == bf_value])

            #Create a subtree with only the given examples with x_train_b[row][col = best_feat] == bf_value
            new_labels = labels.copy()
            new_labels.remove(best_feat)
            
            sub_tree = self.fitting_tree(x_examples, y_examples, new_labels, m, depth+1)
            sub_tree.tag = bf_value
            sub_tree.feature = best_feat
            tree._create_child(sub_tree)
        
        return tree

    def predict_sample(self, x_sample, tree):
        decided = False
        sub_tree = tree
        while not decided:
            feature = sub_tree.child_nodes[0].feature 
            for sub in sub_tree.child_nodes:
                if sub.tag == x_sample[feature]:
                    sub_tree = sub
            if sub_tree.decision != -1:
                decided = True
        return sub_tree.decision

    def predict(self, x_train_b):
        '''
        Tests all the training examples with the help of the decision tree and returns for every example the predicted category in an array

        Args:
            tree(ID3_Tree): A tree that was produced for the method fit
            x_train_b(numpy.array): the data that will be tested

        Returns:
            y_train_b(numpy.array): an array with the results/category for every given example
        '''
        tree = self.tree
        y_train_b = list()
        for i in range(len(x_train_b)):
            y_train_b.append(self.predict_sample(x_train_b[i], tree))
        
        y_train_b = np.array(y_train_b)
        return y_train_b


### Find the most optimized depth for ID3
Comparing different ID3 initializations in order to find the most optimized depth for ID3 tree.The best_acc variable stores the highest accuracy score and the best_depth stores the most optimized depth which we will train the final algorithm.

In [None]:
ls_depth = [x for x in range(1, 11)]

best_acc = 0.0
best_depth = 0

for depth in ls_depth:
    id3 = ID3(labels= set([x for x in range(1, len(x_train_binary[0]))]), max_depth= depth)
    id3.fit(x_train_binary, y_train)
    acc = accuracy_score(y_val, id3.predict(x_val))
    if acc > best_acc:
        best_acc = acc
        best_depth = depth

print("The best number of depth is: ", best_depth)

## **`5.2 Naive Bayes`**

In [8]:
class Naive_Bayes:

    def __init__(self, alpha= 1):
        '''
        Initialization of the arrays

        Contains:
            x1c0(list): here are the possibilities that given the category c = 0, if x(i) = 1 then we store the possibility p(x(i) = 1 | c = 0).
                        we can find easily the p(x(i) = 0 | c = 0) == 1.0 - p(x(i) = 1 | c = 0)
            x1c1(list): here are the possibilities that given the category c = 1, if x(i) = 1 then we store the possibility p(x(i) = 1 | c = 1).
                        we can find easily the p(x(i) = 0 | c = 1) == 1.0 - p(x(i) = 1 | c = 1)
            pc0(float): the possibility of the category to be zero (c = 0 -> negative report).
                        we can calculate the p(c = 1) == 1.0 - p(c = 0)
            alpha(int): The Laplace estimator
        '''
        self.x1c0_array = list()
        self.x1c1_array = list()
        self.pc0 = 0.0
        self.alpha = alpha
    
    def fit(self, x_train_binary, y_train):
        '''
        Calculate the possibilities and store them in the suitable array

        Args: 
            x_train_binary(np.array): the training data 
            y_train_binary(np.array): the category for every example in the training data
        '''
        self.x1c0_array.clear()
        self.x1c1_array.clear()
        number_of_features = x_train_binary[0].shape[0]
        number_of_examples = y_train.shape[0]

        num_of_c0 = 0.0
        for eg in y_train:
            if eg == 0:
                num_of_c0 += 1.0
        self.pc0 = float(num_of_c0 / number_of_examples)
        
        # print(f'c0:{self.pc0} and c1:{1.0 - self.pc0}')
        for feature in range(number_of_features):
            #Laplace estimator
            self.x1c0_array.append(1.0)
            self.x1c1_array.append(1.0)

            for eg in range(number_of_examples):
                if (y_train[eg] == 0 and x_train_binary[eg][feature] == 1):
                    self.x1c0_array[feature] += 1.0
                elif (y_train[eg] == 1 and x_train_binary[eg][feature] == 1):
                    self.x1c1_array[feature] += 1.0
        
        self.x1c0_array = [float((x + self.alpha)/ (num_of_c0 + 2*self.alpha)) for x in self.x1c0_array]
        self.x1c1_array = [float((x + self.alpha)/ (number_of_examples - num_of_c0 + 2*self.alpha)) for x in self.x1c1_array]

    def predict(self, x_train_binary):
        '''
        Tests all the training examples with the arrays x1c0, x1c0 and returns for every example the predicted category in an array

        Args:
            x_train_binary(numpy.array): the data that will be tested

        Returns:
            y_array(numpy.array): an array with the results/category for every given example
        '''

        x = np.vstack((self.x1c0_array, self.x1c1_array))
        x1 = np.log(x)
        x0 = np.log(1.0 - x)
        x_train_ = x_train_binary.T
        var = np.matmul(x1, x_train_) + np.matmul(x0, 1 - x_train_)
        y_array = np.argmax(var, axis=0)

        return y_array


## **```5.3 RNN (BigRu) ```**

In [44]:
class RNN:

  def __init__(self, vocabulary, VOCAB_SIZE = 100000, SEQ_MAX_LENGTH = 250, epochs=3, verbose=1, batch_size=32):
    self.vectorizer = self.create_vec(VOCAB_SIZE, SEQ_MAX_LENGTH, vocabulary)
    self.imdb_bigru = self.get_bigru()
    self.imdb_bigru.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=tf.keras.optimizers.Adam(), metrics=['binary_accuracy'])
    self.epochs = epochs
    self.verbose = verbose
    self.batch_size = batch_size

  def create_vec(self, VOCAB_SIZE, SEQ_MAX_LENGTH, vocabulary):
    train_doc_length = 0
    for doc in x_train:
      tokens = str(doc).split()
      train_doc_length += len(tokens)

    vectorizer = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE, output_mode='int', ngrams=1, name='vector_text', output_sequence_length=SEQ_MAX_LENGTH, vocabulary= vocabulary)

    return vectorizer

  def get_bigru(self, num_layers=1, emb_size=64, h_size=64):
    inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string, name='txt_input')
    x = self.vectorizer(inputs)
    x = tf.keras.layers.Embedding(input_dim=len(self.vectorizer.get_vocabulary()), output_dim=emb_size, name='word_embeddings', mask_zero=True)(x)
    for n in range(num_layers):
      if n != num_layers - 1:
        x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units=h_size, name=f'bigru_cell_{n}', return_sequences=True, dropout=0.2))(x)
      else:
        x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units=h_size, name=f'bigru_cell_{n}', dropout=0.2))(x)

    x = tf.keras.layers.Dropout(rate=0.5)(x)
    o = tf.keras.layers.Dense(units=1, activation='sigmoid', name='lr')(x)
    
    return tf.keras.models.Model(inputs=inputs, outputs=o, name='simple_rnn')

  
  def fit(self, x_train_b, y_train_b):
    self.imdb_bigru = self.get_bigru()
    self.imdb_bigru.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=tf.keras.optimizers.Adam(), metrics=['binary_accuracy'])
    self.imdb_bigru.fit(x= x_train_b, y=y_train_b, epochs=self.epochs, verbose=self.verbose, batch_size=self.batch_size)

  def predict(self, x_test_b):
    return np.round(self.imdb_bigru.predict(x_test_b))

  def get_history(self, x_train_b, y_train_b, validation_split= 0.2):
    return self.imdb_bigru.fit(x= x_train_b, y= y_train_b, verbose= self.verbose, batch_size=self.batch_size, epochs=self.epochs, validation_split= validation_split)

## **Curves** :
- > #### ``custom_curve``: 
     >> #### &ensp;&thinsp; `Args`:{**estimator**= classifier Object, **x_train**, **y_train**, **x_test**, **y_test**, **n_splits**, **title** = "*< classifier name >*", **zoom_out (default= False)**}
     >> #### &ensp;&thinsp; `Returns`:{**data(dictionary)**= all the information of the classifier to create the tables later}
- > #### ``compare_two_classification_algorithms``: 
     >> #### &ensp;&thinsp; `Args`:{**estimator1**= classifier Object, **estimator2**- classifier Object, **x_train**, **y_train**, **x_test**, **y_test**, **n_splits**, **title1** = "*< classifier 1 name >*", **title2** = "*< classifier 2 name >*", **zoom_out (default= False)**}
     >> #### &ensp;&thinsp; `Returns`:{**data1(dicionary), data2(dictionary)**= all the information about the 1st & 2nd classifier to create the tables later}
- > #### ``loss_plot``:
     >> #### &ensp;&thinsp; `Args`{**his**= history of the RNN as the result of the fit method, **kind**= *"loss"*}


In [9]:
def custom_curve(estimator, x_train, y_train, x_test, y_test, n_splits, title, zoom_out= False):

  split_size = int(len(x_train) / n_splits)
  x_splits = np.split(x_train, n_splits) # must be equal division
  y_splits = np.split(y_train, n_splits)

  train_acc, test_acc, train_prec, test_prec, train_rec, test_rec, train_f, test_f = list(), list(), list(), list(), list(), list(), list(), list()

  fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(8, 8), dpi=100, gridspec_kw={'width_ratios': [1, 1], 'height_ratios': [1, 1]})
  fig.suptitle("Plots for {title}".format(title=title), fontsize = 16)

  curr_x = x_splits[0]
  curr_y = y_splits[0]
  
  estimator.fit(curr_x, curr_y)

  train_predict = estimator.predict(curr_x)
  test_predict = estimator.predict(x_test)

  train_acc.append(accuracy_score(curr_y, train_predict))
  test_acc.append(accuracy_score(y_test, test_predict))

  train_prec.append(precision_score(curr_y, train_predict))
  test_prec.append(precision_score(y_test, test_predict))

  train_rec.append(recall_score(curr_y, train_predict))
  test_rec.append(recall_score(y_test, test_predict))

  train_f.append(f1_score(curr_y, train_predict))
  test_f.append(f1_score(y_test, test_predict))

  for i in range(1, len(x_splits)):
    print("Split training examples : " ,i)
    curr_x = np.concatenate((curr_x, x_splits[i]), axis=0)
    curr_y = np.concatenate((curr_y, y_splits[i]), axis=0)
    estimator.fit(curr_x, curr_y)

    train_predict = estimator.predict(curr_x)
    test_predict = estimator.predict(x_test)

    train_acc.append(accuracy_score(curr_y, train_predict))
    test_acc.append(accuracy_score(y_test, test_predict))

    train_prec.append(precision_score(curr_y, train_predict))
    test_prec.append(precision_score(y_test, test_predict))

    train_rec.append(recall_score(curr_y, train_predict))
    test_rec.append(recall_score(y_test, test_predict))

    train_f.append(f1_score(curr_y, train_predict))
    test_f.append(f1_score(y_test, test_predict))
    
  x = list(range(split_size, len(x_train) + split_size, split_size))

  ax1.plot(x, train_acc, 'o-', color="b",  label='Training Accuracy')
  ax1.plot(x, test_acc, 'o-', color="red",label='Test Accuracy')
  ax1.legend(loc="lower right")
  
  ax2.plot(x, train_prec, 'o-', color="b",  label='Training Precision')
  ax2.plot(x, test_prec, 'o-', color="red",label='Test Precision')
  ax2.legend(loc="lower right")

  ax3.plot(x, train_rec, 'o-', color="b",  label='Training Recall')
  ax3.plot(x, test_rec, 'o-', color="red",label='Test Recall')
  ax3.legend(loc="lower right")

  ax4.plot(x, train_f, 'o-', color="b",  label='Training f1')
  ax4.plot(x, test_f, 'o-', color="red",label='Test f1')
  ax4.legend(loc="lower right")

  if zoom_out:
    ax1.axis(ymin= 0.0, ymax= 1.0)
    ax2.axis(ymin= 0.0, ymax= 1.0)
    ax3.axis(ymin= 0.0, ymax= 1.0)
    ax4.axis(ymin= 0.0, ymax= 1.0)
  plt.show()

  data = {
    'trainA' : train_acc,
    'testA' : test_acc,
    'trainP' : train_prec,
    'testP' : test_prec,
    'trainR' : train_rec,
    'testR' : test_rec,
    'trainF' : train_f,
    'testF' : test_f
  }

  return data

def compare_two_classification_algorithms(estimator1, estimator2, x_train, y_train, x_test, y_test, n_splits, title1, title2, zoom_out= False):
  split_size = int(len(x_train) / n_splits)
  x_splits = np.split(x_train, n_splits) # must be equal division
  y_splits = np.split(y_train, n_splits)

  train_acc_1, test_acc_1, train_prec_1, test_prec_1, train_rec_1, test_rec_1, train_f_1, test_f_1 = list(), list(), list(), list(), list(), list(), list(), list()

  train_acc_2, test_acc_2, train_prec_2, test_prec_2, train_rec_2, test_rec_2, train_f_2, test_f_2 = list(), list(), list(), list(), list(), list(), list(), list()

  fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 16), dpi=100, gridspec_kw={'width_ratios': [1, 1], 'height_ratios': [1, 1]})
  fig.suptitle("Plots Comparing {title1} vs {title2}".format(title1=title1, title2= title2), fontsize = 16)

  curr_x = x_splits[0]
  curr_y = y_splits[0]
  
  estimator1.fit(curr_x, curr_y)
  estimator2.fit(curr_x, curr_y)

  train_predict_1 = estimator1.predict(curr_x)
  test_predict_1 = estimator1.predict(x_test)

  train_predict_2 = estimator2.predict(curr_x)
  test_predict_2 = estimator2.predict(x_test)

  #estimator 1
  train_acc_1.append(accuracy_score(curr_y, train_predict_1))
  test_acc_1.append(accuracy_score(y_test, test_predict_1))

  train_prec_1.append(precision_score(curr_y, train_predict_1))
  test_prec_1.append(precision_score(y_test, test_predict_1))

  train_rec_1.append(recall_score(curr_y, train_predict_1))
  test_rec_1.append(recall_score(y_test, test_predict_1))

  train_f_1.append(f1_score(curr_y, train_predict_1))
  test_f_1.append(f1_score(y_test, test_predict_1))
  #estimator 2
  train_acc_2.append(accuracy_score(curr_y, train_predict_2))
  test_acc_2.append(accuracy_score(y_test, test_predict_2))

  train_prec_2.append(precision_score(curr_y, train_predict_2))
  test_prec_2.append(precision_score(y_test, test_predict_2))

  train_rec_2.append(recall_score(curr_y, train_predict_2))
  test_rec_2.append(recall_score(y_test, test_predict_2))

  train_f_2.append(f1_score(curr_y, train_predict_2))
  test_f_2.append(f1_score(y_test, test_predict_2))

  for i in range(1, len(x_splits)):
    print("Split training examples : " ,i)
    curr_x = np.concatenate((curr_x, x_splits[i]), axis=0)
    curr_y = np.concatenate((curr_y, y_splits[i]), axis=0)
    estimator1.fit(curr_x, curr_y)
    estimator2.fit(curr_x, curr_y)

    train_predict_1 = estimator1.predict(curr_x)
    test_predict_1 = estimator1.predict(x_test)

    train_predict_2 = estimator2.predict(curr_x)
    test_predict_2 = estimator2.predict(x_test)

    #estimator 1
    train_acc_1.append(accuracy_score(curr_y, train_predict_1))
    test_acc_1.append(accuracy_score(y_test, test_predict_1))

    train_prec_1.append(precision_score(curr_y, train_predict_1))
    test_prec_1.append(precision_score(y_test, test_predict_1))

    train_rec_1.append(recall_score(curr_y, train_predict_1))
    test_rec_1.append(recall_score(y_test, test_predict_1))

    train_f_1.append(f1_score(curr_y, train_predict_1))
    test_f_1.append(f1_score(y_test, test_predict_1))
    #estimator 2
    train_acc_2.append(accuracy_score(curr_y, train_predict_2))
    test_acc_2.append(accuracy_score(y_test, test_predict_2))

    train_prec_2.append(precision_score(curr_y, train_predict_2))
    test_prec_2.append(precision_score(y_test, test_predict_2))

    train_rec_2.append(recall_score(curr_y, train_predict_2))
    test_rec_2.append(recall_score(y_test, test_predict_2))

    train_f_2.append(f1_score(curr_y, train_predict_2))
    test_f_2.append(f1_score(y_test, test_predict_2))


  x = list(range(split_size, len(x_train) + split_size, split_size))

  ax1.plot(x, train_acc_1, 'o-', color="cyan",  label='Training Accuracy for {title1}'.format(title1= title1))
  ax1.plot(x, test_acc_1, 'o-', color="maroon",label='Test Accuracy for {title1}'.format(title1= title1))
  ax1.plot(x, train_acc_2, 'o-', color="gold",  label='Training Accuracy for {title2}'.format(title2= title2))
  ax1.plot(x, test_acc_2, 'o-', color="olive",label='Test Accuracy for {title2}'.format(title2= title2))
  ax1.legend(loc="lower right")
  
  ax2.plot(x, train_prec_1, 'o-', color="cyan",  label='Training Precision for {title1}'.format(title1= title1))
  ax2.plot(x, test_prec_1, 'o-', color="maroon",label='Test Precision for {title1}'.format(title1= title1))
  ax2.plot(x, train_prec_2, 'o-', color="gold",  label='Training Precision {title2}'.format(title2= title2))
  ax2.plot(x, test_prec_2, 'o-', color="olive",label='Test Precision {title2}'.format(title2= title2))
  ax2.legend(loc="lower right")

  ax3.plot(x, train_rec_1, 'o-', color="cyan",  label='Training Recall for {title1}'.format(title1= title1))
  ax3.plot(x, test_rec_1, 'o-', color="maroon",label='Test Recall for {title1}'.format(title1= title1))
  ax3.plot(x, train_rec_2, 'o-', color="gold",  label='Training Recall {title2}'.format(title2= title2))
  ax3.plot(x, test_rec_2, 'o-', color="olive",label='Test Recall {title2}'.format(title2= title2))
  ax3.legend(loc="lower right")

  ax4.plot(x, train_f_1, 'o-', color="cyan",  label='Training f1 {title1}'.format(title1= title1))
  ax4.plot(x, test_f_1, 'o-', color="maroon",label='Test f1 {title1}'.format(title1= title1))
  ax4.plot(x, train_f_2, 'o-', color="gold",  label='Training f1 {title2}'.format(title2= title2))
  ax4.plot(x, test_f_2, 'o-', color="olive",label='Test f1 {title2}'.format(title2= title2))
  ax4.legend(loc="lower right")

  if zoom_out:
    ax1.axis(ymin= 0.0, ymax= 1.0)
    ax2.axis(ymin= 0.0, ymax= 1.0)
    ax3.axis(ymin= 0.0, ymax= 1.0)
    ax4.axis(ymin= 0.0, ymax= 1.0)
  plt.show()

  data1 = {
    'trainA' : train_acc_1,
    'testA' : test_acc_1,
    'trainP' : train_prec_1,
    'testP' : test_prec_1,
    'trainR' : train_rec_1,
    'testR' : test_rec_1,
    'trainF' : train_f_1,
    'testF' : test_f_1
  }

  data2 = {
    'trainA' : train_acc_2,
    'testA' : test_acc_2,
    'trainP' : train_prec_2,
    'testP' : test_prec_2,
    'trainR' : train_rec_2,
    'testR' : test_rec_2,
    'trainF' : train_f_2,
    'testF' : test_f_2
  }

  return data1, data2

def loss_plot(his, kind):
  train = his.history[kind]
  val = his.history['val_' + kind]
  epochs = range(1, len(train)+1)
  plt.figure(figsize=(12,9))
  plt.plot(epochs, train, 'b', label='Training ' + kind)
  plt.plot(epochs, val, 'orange', label='Validation ' + kind)
  plt.title('Training and validation ' + kind) 
  plt.xlabel('Epochs')
  plt.ylabel(kind)
  plt.legend()
  plt.show()



## **`6.0 Tables in training and test examples`**
> #### After collecting the data from the custom curve or the custom_sklearn_curve, <br> it gives us a table with the parameters inside, but in more suitable manner

In [11]:
def pandas_tables(x_train_b, data, splits):
    data = {
    #'Id' : [i + 1 for i in range(len(train_accuracies))], 
    'Number of Examples' : list(range(int(len(x_train_b)/splits), len(x_train_b) + int(len(x_train_b)/splits), int(len(x_train_b)/splits))), 
    "Train Accuracy" : data['trainA'], 
    "Test Accuracy" : data['testA'],
    "Train Precision" : data['trainP'],
    "Test Precision" : data['testP'],
    "Train Recall" : data['trainR'],
    "Test Recall" : data['testR'],
    "Train F1" : data['trainF'],
    "Test F1" : data['testF']}
    df = pd.DataFrame(data)
    
    return df

In [None]:
estimator = Naive_Bayes()
data = custom_curve(estimator, np.concatenate((x_train_binary, x_val_binary), axis= 0), np.concatenate((y_train, y_val), axis= 0), x_test=x_test_binary, y_test=y_test, n_splits=5, title= 'Naive Bayes')
pandas_tables(x_test_binary, data, 5)

In [None]:
estimator = ID3(labels=set([x for x in range(len(x_train_binary[0]))]), max_depth=10, dc=0)
data = custom_curve(estimator, x_train=np.concatenate((x_train_binary, x_val_binary), axis= 0), y_train=np.concatenate((y_train, y_val), axis= 0), x_test=x_test_binary, y_test=y_test, n_splits=5, title= 'ID3', zoom_out= False)
pandas_tables(x_test_binary,data, splits=5)

In [None]:
estimator = Random_Forest(num_trees=3, m=500, max_depth= 5)
data = custom_curve(estimator, x_train=np.concatenate((x_train_binary, x_val_binary), axis= 0), y_train=np.concatenate((y_train, y_val), axis= 0), x_test=x_test_binary, y_test=y_test, n_splits=5, title= 'Random Forest')
pandas_tables(x_test_binary, data, 5)

In [None]:
nb = BernoulliNB()
estimator = Naive_Bayes()
data1, data2 = compare_two_classification_algorithms(estimator1=estimator, estimator2= nb, x_train= np.concatenate((x_train_binary, x_val_binary), axis= 0), y_train= np.concatenate((y_train, y_val), axis= 0), x_test= x_test_binary, y_test= y_test, n_splits= 5, title1= "Naive Bayes", title2= "BarnouliNB", zoom_out= False)

In [None]:
pandas_tables(x_train_binary, data1, splits=5)

In [None]:
pandas_tables(x_train_binary, data2, splits=5)

In [None]:
dt = DecisionTreeClassifier(max_depth=10)
estimator = ID3(labels=set([x for x in range(len(x_train_binary[0]))]), max_depth=10, dc=0)
data1, data2 = compare_two_classification_algorithms(estimator1=estimator, estimator2= dt, x_train= np.concatenate((x_train_binary, x_val_binary), axis= 0), y_train= np.concatenate((y_train, y_val), axis= 0), x_test= x_test_binary, y_test= y_test, n_splits= 5, title1= "ID3", title2= "Decision Tree Classifier", zoom_out= False)

In [None]:
pandas_tables(x_train_binary, data1, splits=5)

In [None]:
pandas_tables(x_train_binary, data2, splits=5)

In [None]:
rf = RandomForestClassifier(n_estimators=3, max_features=500, max_depth=5)
estimator = Random_Forest(num_trees=3, m=500, max_depth= 5)
data1, data2 = compare_two_classification_algorithms(estimator1=estimator, estimator2= rf, x_train= np.concatenate((x_train_binary, x_val_binary), axis= 0), y_train= np.concatenate((y_train, y_val), axis= 0), x_test= x_test_binary, y_test= y_test, n_splits= 5, title1= "Random Forest", title2= "Random Forest Classifier", zoom_out= True)
pandas_tables(x_train_binary, data, 5)

In [None]:
pandas_tables(x_train_binary, data1, splits=5)

In [None]:
pandas_tables(x_train_binary, data2, splits=5)

In [None]:
imdb_ = RNN(epochs=5, vocabulary= vocabulary)
data = custom_curve(estimator=imdb_, x_train=np.concatenate((x_train, x_val), axis= 0), y_train=np.concatenate((y_train, y_val), axis= 0), x_test=x_test, y_test=y_test, title="Rnn BigRu Curves", zoom_out=False, n_splits=5)
pandas_tables(x_train_binary, data, 5)

In [None]:
imdb_ = RNN(epochs=5, vocabulary=vocabulary)
imdb_bigru_history = imdb_.get_history(np.concatenate((x_train, x_val), axis= 0), np.concatenate((y_train, y_val), axis= 0))
loss_plot(imdb_bigru_history, 'loss')