# The Journey from Mathematics to Machine Learning

## Series 1: Linear algebra

### Episode 7: Word Embedding

- NLP (Jupyter Notebook)
    1. [Predicting IMDB Movie reviews using MLP](#1.-Predicting-IMDB-Movie-reviews-using-MLP)
    2. [Hidden layers arrange the inputs into n groups](#2.-Hidden-layers-arrange-the-inputs-into-n-groups)


    
    

## 1. Predicting IMDB Movie reviews using MLP

In [1]:
import numpy as np

### Data

In [2]:
with open('dataset/reviews.txt') as f:
    raw_reviews = f.readlines()
with open('dataset/labels.txt') as f:
    raw_labels = f.readlines()

In [3]:
raw_reviews[1]

'story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turned into an insane  violent mob by the crazy chantings of it  s singers . unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting . even those from the era should be turned off . the cryptic dialogue would make shakespeare seem easy to a third grader . on a technical level it  s better than you might think with some good cinematography by future great vilmos zsigmond . future stars sally kirkland and frederic forrest can be seen briefly .  \n'

#### Creating an input vector

In [3]:
onehots = {}
onehots['goy'] = np.array([1, 0, 0, 0])
onehots['muuhai'] = np.array([0, 1, 0, 0])
onehots['kino'] = np.array([0, 0, 1, 0])
onehots['baina'] = np.array([0, 0, 0, 1])

In [4]:
sentence = ['goy', 'kino', 'baina']

In [5]:
x = np.array([0, 0, 0, 0])
for i in range(len(sentence)):
    x += onehots[sentence[i]]
x

array([1, 0, 1, 1])

In [13]:
tokens = []
for review in raw_reviews:
    review = set(review.split(' '))
    review.remove('')
    tokens.append(list(review))
len(tokens[1])

92

In [14]:
words = set()
for review in tokens:
    for word in review:
        words.add(word)
words = list(words)

In [15]:
len(words)

74074

In [16]:
word_to_index = {}
for i, word in enumerate(words):
    word_to_index[word] = i

In [17]:
len(word_to_index)

74074

In [18]:
len(tokens)

25000

In [19]:
input_dataset = np.zeros((len(tokens), len(words)))

In [20]:
for i, review in enumerate(tokens):
    for word in review:
        input_dataset[i, word_to_index[word]] = 1

In [27]:
word_to_index['of']

66451

In [28]:
input_dataset[1][66451]

1.0

In [22]:
input_dataset[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [29]:
target_dataset = np.array([])
for label in raw_labels:
    if label == 'positive\n':
        target_dataset = np.append(target_dataset, 1)
    else:
        target_dataset = np.append(target_dataset, 0)

In [30]:
target_dataset.shape

(25000,)

In [31]:
target_dataset = target_dataset.reshape(25000, 1)

In [32]:
train_dataset = input_dataset[:24000]
train_labels = target_dataset[:24000]

test_dataset = input_dataset[24000:]
test_labels = target_dataset[24000:]

In [33]:
train_dataset.shape

(24000, 74074)

### Network

#### Linear Layer

In [34]:
class Layer_Linear:
    """Representing a neural network layer"""
    
    def __init__(self, n_inputs, n_outputs):
        """Initlize weights and bias"""
        self.weights = 0.01 * np.random.randn(n_inputs, n_outputs)
        self.biases = np.zeros((1, n_outputs))
    
    def forward(self, inputs):
        """
        It multiplies the inputs by the weights 
        and then sums them, and then sums bias.
        """
        #To calculate gradient, remembering input values
        self.inputs = inputs
        #Calculate outputs' values
        self.output = np.dot(inputs, self.weights) + self.biases
    
    def backward(self, dvalues):
        """Gradient with respect to parameters and input"""
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        self.dresults = np.dot(dvalues, self.weights.T)

#### Activation functions

In [35]:
class Activation_ReLU:
    """ReLU activation"""
    
    def forward(self, inputs):
        """Forward pass"""
        
        #To calculate gradient, remembering input values
        self.inputs = inputs
        
        #Calculate outputs' values
        self.output = np.maximum(0, inputs)
        
    def backward(self, dvalues):
        """Backward pass"""
        
        self.dresults = self.inputs > 0
        self.dresults = self.dresults * dvalues

In [36]:
class Activation_Sigmoid:
    """Sigmoid activation"""
    
    def forward(self, inputs):
        """Forward pass"""
        
        #Calculate outputs' values
        self.output = 1 / (1 + np.exp(-inputs))
    
    def backward(self, dvalues):
        """Backward pass"""
        
        self.dresults = dvalues * (1 - self.output) * self.output

#### Loss function

In [37]:
class Loss_MSE():
    """MSE Loss function"""
    
    def forward(self, y_pred, y_true):
        """Forward pass"""     
        error = np.mean((y_pred - y_true) ** 2)
        return error
    
    def backward(self, y_pred, y_true):
        """Derivative of MSE with respect to preds"""
        
        #Number of samples
        samples = len(y_pred)
        
        #Number of output nodes
        outputs = len(y_pred[0])
        
        #Derivative of MSE
        self.dresults = 2 * (y_pred - y_true) / (outputs * samples)

#### Optimizer

In [38]:
class Optimizer_GD:
    """Gradient descent optimizer"""
    
    def __init__(self, alpha=1.):
        """Initialize hyperparameters"""
        self.alpha = alpha

    def update_parameters(self, layer):
        """Update parameters"""
        
        weights_delta = layer.dweights * self.alpha
        biases_delta = layer.dbiases * self.alpha
        
        #Update parameters
        layer.weights -= weights_delta
        layer.biases -= biases_delta

### Hyperparameter

In [39]:
max_epoch = 5
alpha = 1
batch_size = 128

### Initialize the model

In [40]:
layer1 = Layer_Linear(len(words), 100)
activation1 = Activation_ReLU()

layer2 = Layer_Linear(100, 1)
activation2 = Activation_Sigmoid()

#### Initlize optimizer and loss function

In [41]:
loss = Loss_MSE()
optimizer = Optimizer_GD(alpha)

### Training the model

In [42]:
train_steps = len(train_dataset) // batch_size
if train_steps * batch_size < len(train_dataset):
    train_steps += 1

In [43]:
for epoch in range(max_epoch):
    train_error = 0
    train_accuracy = 0
    
    for i in range(train_steps):
        batch_start = i * batch_size
        batch_end = (i+1) * batch_size
        
        input = train_dataset[batch_start:batch_end]
        true = train_labels[batch_start:batch_end]
        
        #Forward pass
        layer1.forward(input)
        activation1.forward(layer1.output)
        layer2.forward(activation1.output)
        activation2.forward(layer2.output)
        train_error += loss.forward(activation2.output, true) / train_steps
        train_accuracy += np.mean((np.abs(activation2.output - true) < 0.5)) / train_steps
        
        #Backward pass
        loss.backward(activation2.output, true)
        activation2.backward(loss.dresults)
        layer2.backward(activation2.dresults)
        activation1.backward(layer2.dresults)
        layer1.backward(activation1.dresults)
        
        #Update parameters
        optimizer.update_parameters(layer2)
        optimizer.update_parameters(layer1)

    print(f'epoch: {epoch},',
          f'Train error: {train_error:.3f},',
          f'Train accuracy: {train_accuracy:.3f}')

epoch: 0, Train error: 0.169, Train accuracy: 0.750
epoch: 1, Train error: 0.102, Train accuracy: 0.857
epoch: 2, Train error: 0.088, Train accuracy: 0.880
epoch: 3, Train error: 0.073, Train accuracy: 0.901
epoch: 4, Train error: 0.067, Train accuracy: 0.912


#### Testing the model

In [44]:
test_steps = len(test_dataset) // batch_size
if test_steps * batch_size < len(test_dataset):
    test_steps += 1

In [45]:
test_error = 0
test_accuracy = 0

for i in range(test_steps):
    batch_start = i * batch_size
    batch_end = (i+1) * batch_size
    
    input = test_dataset[batch_start:batch_end]
    true = test_labels[batch_start:batch_end]
    
    layer1.forward(input)
    activation1.forward(layer1.output)
    layer2.forward(activation1.output)
    activation2.forward(layer2.output)
    test_error += loss.forward(activation2.output, true) / test_steps
    test_accuracy += np.mean((np.abs(activation2.output - true) < 0.5)) / test_steps

In [46]:
print(f'Test error: {test_error:.3f},',
      f'Test accuracy: {test_accuracy:.3f}')

Test error: 0.104, Test accuracy: 0.856


## 2. Hidden layers arrange the inputs into n groups

In [47]:
from collections import Counter

In [48]:
def similar(target):
    target_index = word_to_index[target]
    scores = Counter()
    for word, index in word_to_index.items():
        # Finding Euclidian distance
        scores[word] = -np.linalg.norm(layer1.weights[index] - layer1.weights[target_index])
    
    return scores.most_common(10)

In [59]:
similar('beautiful')

[('beautiful', -0.0),
 ('simple', -0.15756997816719837),
 ('rare', -0.15994443117106263),
 ('incredible', -0.16189580683292984),
 ('gem', -0.162012927740108),
 ('outstanding', -0.16350230494142096),
 ('fascinating', -0.1655773494339012),
 ('wonderfully', -0.16764800929627918),
 ('perfectly', -0.17070829170698965),
 ('atmosphere', -0.17081418194987413)]

In [51]:
word_to_index['awful']

49117

In [52]:
layer1.weights[49117]

array([ 5.25710747e-03, -1.14596470e-01,  2.24749474e-01, -1.68935070e-02,
       -2.42389878e-01, -1.91099164e-02,  2.29718264e-02, -1.98708680e-02,
        9.43995150e-02, -6.64210755e-03, -1.95678445e-02,  4.22047664e-02,
       -1.42102439e-03,  5.36124642e-03, -2.44279500e-02, -3.56612759e-02,
        2.53704457e-01, -1.55112412e-02, -1.50616782e-02,  6.65445547e-02,
        1.10166254e-01, -9.34819122e-02,  5.40347562e-04,  1.06614235e-02,
        2.01549685e-02, -1.65747549e-02,  7.70526567e-03,  4.94103107e-03,
       -4.13244380e-02,  2.06007513e-01,  1.78370370e-02, -1.63409992e-02,
        1.98568766e-02,  9.89469855e-05,  1.04508375e-01,  2.58538966e-01,
        3.59075505e-02, -1.38368283e-01, -1.01778427e-01, -1.94255591e-01,
        2.43674410e-04, -1.30854354e-01, -6.18705622e-02,  6.47655404e-02,
       -5.96987364e-03, -2.74385033e-01, -1.16894042e-01,  1.68015554e-03,
        9.57189529e-02, -2.01779319e-01, -6.05386545e-03, -6.56515929e-03,
       -1.27046693e-04,  