In [None]:
'''
Author    : Ramakrishnan Radhakrishnan
Goal      : Simple EDA and Model training for MLops and then increase complexity 
            gradually as the production deployment versions increase.
References: Much of the work are either inspired or referred from the following.
Along with official tensor flow docs,
1) https://builtin.com/data-science/guide-logistic-regression-tensorflow-20
2) https://towardsdatascience.com/natural-language-processing-with-tensorflow-e0a701ef5cef?gi=33ea4b1c117c
'''

In [2]:
'''
Cloning the project to get source data.
In case of confidential data, it is better to upload from local / read from 
disk to work on it.
'''
!git clone https://github.com/Ramakrishnanr/wake_word_detection.git

Cloning into 'wake_word_detection'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 18 (delta 4), reused 9 (delta 1), pack-reused 0[K
Unpacking objects: 100% (18/18), done.


In [3]:
cd wake_word_detection/

/content/wake_word_detection


In [None]:
## data_setup branch is for data related analysis.
!git checkout data_setup

Branch 'data_setup' set up to track remote branch 'data_setup' from 'origin'.
Switched to a new branch 'data_setup'


In [52]:
# Data handling related imports
import pandas as pd
import numpy as np
import pickle

# ML domain related imports
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [37]:
class DataHandle():
  def __init__(self, path = None, batch_size = None, random_seed = None):
    if path is None:
      self.data_path = '/content/wake_word_detection/data/train.csv'
    else:
      self.data_path = path
    
    ''' 
    In future, for huge dataset batch size will play a significant role 
    unlike now.
    '''
    if batch_size is None:
      self.batch_size = 10
    else:
      self.batch_size = batch_size
    
    if random_seed is None:
      self.random_seed = 5000
    else:
      self.random_seed = random_seed

    self.utterances = 'word'
    self.labels = 'label'
    self.test_size = 0.20
    self.random_state = 101

    self.prefetch = 1

  def read_data(self):
    return pd.read_csv(self.data_path)

  def check_prior(self, df):
    label_series = df.label.value_counts()
    wake_count_normalised      = (label_series[0] / (label_series[0] + 
                                                     label_series[1]))
    non_wake_count_normalised  = (label_series[1] / (label_series[0] + 
                                                     label_series[1]))
    return wake_count_normalised, non_wake_count_normalised

  def disp_prior(self):
    print("*****Prior Distribution*****")
    print("Wake Word:", wake_normalised)
    print("Non Wake Word:", non_wake_normalised)

  def get_train_val(self, X, Y):
    return train_test_split(X, Y, test_size= self.test_size, 
                            random_state= self.random_state)

  def parallelize_train_data(self, words_to_int, labels):
    train_data = tf.data.Dataset.from_tensor_slices((words_to_int, labels))
    train_data = train_data.repeat().shuffle(self.random_seed).batch(self.batch_size).prefetch(self.prefetch)
    return train_data

  def get_X(self, df):
    X = tf.data.Dataset.from_tensor_slices(df['self.utterances'])

In [26]:
class Preprocess():
   def __init__(self, max_features = None, seq_length = None):
     if max_features is None:
       self.max_features = 100
     else:
       self.max_features = max_features

     if seq_length is None:
      self.seq_length = 25 ## For wake word, we don't need bigger. Can extend. 
     else:
      self.seq_length = seq_length

   def tokenize_utterances(self, x):
    tokenizer = Tokenizer(num_words = self.max_features, oov_token = "<OOV>")
    tokenizer.fit_on_texts(x)
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(x)
    padded_seq = pad_sequences(sequences, padding = 'post', truncating='post')
    padded_seq = padded_seq.astype('float32')
    return word_index, padded_seq

In [54]:
class Models():
  def __init__(self, no_of_classes = None, no_of_features = None, 
               learn_rate = None, train_steps = None, disp_step = None,
               weights_path = None, bias_path = None):
    if no_of_classes is None:
      self.no_of_classes = 2
    else:
      self.no_of_classes = no_of_classes
    
    if no_of_features is None:
      self.no_of_features = 1
    else:
      self.no_of_features = no_of_features
    
    if learn_rate is None:
      self.learn_rate = 0.01
    else:
      self.learn_rate = learn_rate

    if train_steps is None:
      self.train_steps = 10
    else:
      self.train_steps = train_steps

    if disp_step is None:
      self.disp_step = 1
    else:
      self.disp_step = disp_step
    self.weights_file = 'weights.txt'
    self.bias_file = 'bias.txt'
    self.word_index = 'word_index.txt'
    
  def get_weights(self, words_to_int):
    print(np.shape(words_to_int)[1])
    return tf.Variable(tf.ones([(np.shape(words_to_int)[1]), self.no_of_classes])
    , name="weights")
    
  def get_bias(self):
    return tf.Variable(tf.zeros([self.no_of_classes]), name="bias")

  def log_regression(self, x, W, b):
    return tf.nn.softmax(tf.matmul(x, W) + b)

  def get_accuracy(self, y_pred, y_true):
    correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64))
    return tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

  def get_optimizer(self):
    # Scope of improvement: With Adam optimizers in future.
    return tf.optimizers.SGD(self.learn_rate)

  def run_optimization(self, x, y, W, b, optimizer):
    evaluation = Evaluation()
    with tf.GradientTape() as g:
        pred = self.log_regression(x, W, b)
        loss = evaluation.loss_fn(pred, y)

    gradients = g.gradient(loss, [W, b])
    optimizer.apply_gradients(zip(gradients, [W, b]))

  def run_training(self, train_data, W, b, optimizer):
    evaluation = Evaluation()
    for step, (batch_x, batch_y) in enumerate(train_data.take(self.train_steps)
    , 1):
      self.run_optimization(batch_x, batch_y, W, b, optimizer)
      if step % self.disp_step == 0:
        pred = self.log_regression(batch_x, W, b)
        loss = evaluation.loss_fn(pred, batch_y)
        acc = self.get_accuracy(pred, batch_y)
        print("Step: %i, Loss: %f, Accuracy: %f" % (step, loss, acc))
    return W, b

  def save_params(self, W, b, word_index):
    # For logistic regression, we use simple saving. 
    # For NN, we can use tf.train.checkpoint / tf.keras.Model.save_weights. 
    #np.savetxt(self.weights, W)
    #np.savetxt(self.bias, b)
    weights_file = open(self.weights_file, 'wb')
    pickle.dump(W, weights_file)
    
    bias_file = open(self.bias_file, 'wb')
    pickle.dump(b, bias_file)

    #np.savetxt(self.word_index, word_index) # To match the vocab in test data.
    print("Model params saved successfully")

In [8]:
class Evaluation():
  def __init__(self, loss_metric = None, no_of_classes = None):
    if loss_metric is None:
      self.loss_metric = 'cross entropy'
    else:
      self.loss_metric = loss_metric
    if no_of_classes is None:
      self.no_of_classes = 2
    else:
      self.no_of_classes = no_of_classes

  def loss_fn(self, y_pred, y_actual):
    if self.loss_metric == 'cross entropy':
      y_actual = tf.one_hot(y_actual, depth=self.no_of_classes)
      # To avoid log(0) error,
      y_pred = tf.clip_by_value(y_pred, 1e-9, 1.)
      return tf.reduce_mean(-tf.reduce_sum(y_actual * tf.math.log(y_pred)))
    else:
      print("Currently only cross entropy is being provided.")
    

In [55]:
## EDA 
data_handle = DataHandle()
source_df = data_handle.read_data()
wake_normalised, non_wake_normalised = data_handle.check_prior(source_df)
data_handle.disp_prior()

## Preprocessing
preprocess = Preprocess()
word_index, X = preprocess.tokenize_utterances(source_df['word'])
Y = source_df['label'].to_numpy()

## Data split
x_train, x_val, y_train, y_val = data_handle.get_train_val(X, Y)
''' 
Please note: x_train_prefetch's datastructure:
tensorflow.python.data.ops.dataset_ops.PrefetchDataset 
'''
x_train_prefetch = data_handle.parallelize_train_data(x_train, y_train)

## Training
models = Models()
initial_weights = models.get_weights(x_train)
initial_bias = models.get_bias()
iterator = iter(x_train_prefetch)
optimizer = models.get_optimizer()
trained_weights, trained_bias = models.run_training(x_train_prefetch, 
                                                    initial_weights, 
                                                    initial_bias, optimizer)

## Testing
y_pred = models.log_regression(x_val, trained_weights, trained_bias)
accuracy = models.get_accuracy(y_pred, y_val)
print("Accuracy: %f" % accuracy)

## Save model params
models.save_params(trained_weights, trained_bias, word_index)

*****Prior Distribution*****
Wake Word: 0.64
Non Wake Word: 0.36
6
Step: 1, Loss: 6.169082, Accuracy: 0.800000
Step: 2, Loss: 7.975520, Accuracy: 0.700000
Step: 3, Loss: 3.318807, Accuracy: 0.900000
Step: 4, Loss: 32.283810, Accuracy: 0.500000
Step: 5, Loss: 9.242602, Accuracy: 0.300000
Step: 6, Loss: 8.915598, Accuracy: 0.300000
Step: 7, Loss: 3.867907, Accuracy: 0.800000
Step: 8, Loss: 5.102798, Accuracy: 0.800000
Step: 9, Loss: 25.858589, Accuracy: 0.600000
Step: 10, Loss: 9.622684, Accuracy: 0.800000
Accuracy: 0.600000
Model params saved successfully
