In [1]:
import math

import torch

## Building blocks of a neural network

In [262]:
class Module(object):
    """
    Common interface for the layers of the network.

    The base class implements nothing: ``forward`` and ``backward`` raise
    ``NotImplementedError`` and ``param`` reports no parameters.
    """

    def forward(self, *input):
        """Forward pass; not implemented on the base class."""
        raise NotImplementedError()

    def backward(self, *gradwrtoutput):
        """Backward pass; not implemented on the base class."""
        raise NotImplementedError()

    def param(self):
        """Return the parameters of the layer (an empty list here)."""
        return list()

# Tanh

### Our function

In [20]:
a = torch.FloatTensor([2.0]) 

def tanh(x: torch.FloatTensor):
    """
    Element-wise hyperbolic tangent.

    Evaluated as sign(x) * (1 - e^(-2|x|)) / (1 + e^(-2|x|)), which is
    algebraically the same as (e^x - e^-x) / (e^x + e^-x) but only ever
    exponentiates a non-positive number. The direct formula overflows to
    inf / inf = nan for large |x|; this form saturates cleanly at +/-1.

    Input: x tensor of any shape
    Returns: tensor of the same shape with values in [-1, 1]
    """
    # e^(-2|x|) lies in (0, 1], so it can never overflow
    decay = torch.exp(-2 * torch.abs(x))

    return torch.sign(x) * (1 - decay) / (1 + decay)

# Derivative
def tanh_p(x: torch.FloatTensor):
    """Derivative of tanh evaluated at ``x``: 1 - tanh(x)^2, element-wise."""
    activation = tanh(x)
    return 1 - torch.pow(activation, 2)

In [3]:
print(tanh(a))
print(tanh_p(a))

tensor([0.9640])
tensor([0.0707])


### Torch function

In [4]:
b = torch.FloatTensor([2.0]) 
b.requires_grad_(True)

y = torch.tanh(b)
print(y)

y.backward()
print(b.grad)

tensor([0.9640], grad_fn=<TanhBackward>)
tensor([0.0707])


# ReLU

### Our function

In [21]:
def relu(x: torch.FloatTensor):
    """Rectified linear unit: elements below zero are replaced by zero."""
    return x.clamp(min=0)

def relu_p(x: torch.FloatTensor):
    """
    Derivative of ReLU evaluated at ``x``.

    Input: x tensor of any shape
    Returns: a new tensor of the same shape and dtype holding 1 where
             x > 0 and 0 where x <= 0

    The result is built from a comparison mask rather than by assigning
    into ``x``, so the caller's tensor is left untouched.
    """
    return (x > 0).to(x.dtype)

In [28]:
a = torch.FloatTensor([2.0]) 
print(relu(a))
print(relu_p(a))

tensor([2.])
tensor([1.])


### Torch function

In [29]:
b = torch.FloatTensor([2.0]) 
b.requires_grad_(True)

y = torch.relu(b)
print(y)

y.backward()
print(b.grad)

tensor([2.], grad_fn=<ReluBackward0>)
tensor([1.])


# Sigmoid function

### Our function

In [34]:
def sigmoid(x: torch.FloatTensor):
    """Logistic function 1 / (1 + e^-x), applied element-wise."""
    denominator = 1 + torch.exp(-x)
    return 1 / denominator

def sigmoid_p(x: torch.FloatTensor):
    """Derivative of the sigmoid at ``x``: sigmoid(x) * (1 - sigmoid(x))."""
    activation = sigmoid(x)
    return activation * (1 - activation)

In [35]:
a = torch.FloatTensor([2.0]) 
print(sigmoid(a))
print(sigmoid_p(a))

tensor([0.8808])
tensor([0.1050])


In [36]:
b = torch.FloatTensor([2.0]) 
b.requires_grad_(True)

y = torch.sigmoid(b)
print(y)

y.backward()
print(b.grad)

tensor([0.8808], grad_fn=<SigmoidBackward>)
tensor([0.1050])


# Leaky ReLU function

In [37]:
torch.clamp(torch.FloatTensor([1.0,2.0,3.0]), max = 0)

tensor([0., 0., 0.])

In [38]:
def LRelu(x: torch.FloatTensor, slope: float):
    """
    Leaky ReLU: the positive part of ``x`` is kept as is, the negative
    part is scaled by ``slope``.
    """
    positive_part = x.clamp(min=0)
    negative_part = x.clamp(max=0)
    return positive_part + slope * negative_part

def LRelu_p(x: torch.FloatTensor, slope: float):
    """
    Derivative of the leaky ReLU evaluated at ``x``.

    Input: x tensor of any shape
           slope gradient used where x <= 0
    Returns: a new tensor of the same shape and dtype holding 1 where
             x > 0 and ``slope`` where x <= 0

    The result is selected with ``torch.where`` instead of assigning into
    ``x``, so the caller's tensor is left untouched.
    """
    return torch.where(x > 0, torch.ones_like(x), torch.full_like(x, slope))
    

In [50]:
a = torch.FloatTensor([2.0]) 
print(LRelu(a, slope = 0.01))
print(LRelu_p(a, slope = 0.01))

tensor([2.])
tensor([1.])


# Mean Squared Error

In [55]:
output = torch.FloatTensor([1.0,1.0,1.0,0.0,0.0,0.0])
target = torch.FloatTensor([1.0,1.0,0.0,1.0,0.0,0.0])

In [56]:
def mse(predicted_output: torch.FloatTensor, target_output: torch.FloatTensor):
    """
    Squared-error loss: the squared differences between prediction and
    target are summed over all elements (no division by the element count).
    """
    residual = predicted_output - target_output
    return torch.pow(residual, 2).sum()

def mse_p(predicted_output: torch.FloatTensor, target_output: torch.FloatTensor):
    """Gradient of the summed squared error w.r.t. the prediction: 2 * (predicted - target)."""
    residual = predicted_output - target_output
    return 2 * residual

In [12]:
import torch

In [None]:
torch.FloatTensor

In [None]:
torch.clamp()

In [22]:
torch.FloatTensor([1.0,2.0,3.0]).shape[0]

3

In [24]:
def cross_entropy(predictions, targets, epsilon=1e-12):
    """
    Cross entropy between one-hot encoded targets and predictions.

    Input: predictions (N, k) ndarray
           targets (N, k) ndarray
           epsilon predictions are clipped to [epsilon, 1 - epsilon]
    Returns: scalar, the summed loss divided by N
    """
    clipped = np.clip(predictions, epsilon, 1. - epsilon)
    n_samples = clipped.shape[0]
    # A further 1e-9 is added inside the logarithm, on top of the clipping
    log_probabilities = np.log(clipped + 1e-9)
    return -np.sum(targets * log_probabilities) / n_samples

In [140]:
np.clip([1,0,0,0], 0.0000001, 0.9999999)

array([9.999999e-01, 1.000000e-07, 1.000000e-07, 1.000000e-07])

In [198]:
target.unsqueeze(1)

tensor([[1],
        [0],
        [0]])

In [None]:
class SGD:
    """
    Stochastic Gradient Descent optimizer
    """

    def __init__(self, model_params, lr=0.01):
        """
        Stores what to optimize and the step size.

        model_params: iterable with one entry per layer; each entry is an
            iterable of pairs whose item 0 is a parameter and item 1 its
            gradient accumulator
        lr: learning rate
        """
        self.lr = lr
        self.model_params = model_params

    def step(self):
        """
        Applies ``parameter -= lr * gradient`` in place to every parameter
        and then resets its gradient accumulator to zero.
        """
        for layer in self.model_params:
            for pair in layer:
                weights, gradient = pair[0], pair[1]

                # in-place update, so the layer sees the new values
                weights -= self.lr * gradient

                # the accumulator starts from zero for the next batch
                gradient.zero_()

In [None]:
class StochasticGD:
    """
    Skeleton of a stochastic gradient descent optimizer; the update rule
    is not implemented.
    """

    def __init__(self, parameters_list, learning_rate = 0.01):
        """Stores the parameters to optimize and the learning rate."""
        self.learning_rate = learning_rate
        self.parameters_list = parameters_list

    def update_parameters(self):
        """Not implemented: always raises ``NotImplementedError``."""
        raise NotImplementedError()

In [235]:
def cross_entropy_torch(predictions, targets, epsilon=1e-20):
    """
    Computes the average cross entropy (negative log-likelihood of the
    target class) over a batch of predictions.

    Input: predictions (N, #ofclasses) FloatTensor of class probabilities
           targets one of
               - (N,) or (N, 1) tensor of class indices
               - (N, #ofclasses) tensor of one-hot rows
             of any numeric dtype (it is converted to long internally)
           epsilon predictions are clamped to [epsilon, 1 - epsilon] so
             that the logarithm stays finite
    Returns: 0-dim tensor with the mean loss over the N examples
    """
    # One-hot rows have the same shape as the predictions: reduce them to
    # the index of the hot class
    if targets.dim() == 2 and targets.shape == predictions.shape:
        targets = targets.argmax(dim=1)

    # gather needs an int64 index with one column per example
    target_index = targets.reshape(-1, 1).long()

    # Clamping the predictions between epsilon and 1 - epsilon
    predictions_clamped = predictions.clamp(epsilon, 1 - epsilon)

    # Obtaining the probabilities of the target class
    target_probabilities = predictions_clamped.gather(1, target_index)

    # Mean negative log-likelihood
    return torch.mean(-torch.log(target_probabilities))


def cross_entropy_troch_p(predictions, targets, epsilon=1e-20):
    """
    Returns (clamped predictions - one-hot(targets)) / N.

    Input: predictions (N, #ofclasses) FloatTensor
           targets (N,) LongTensor of class indices
           epsilon predictions are clamped to [epsilon, 1 - epsilon]
    Returns: (N, #ofclasses) tensor; ``predictions`` itself is not modified
             because clamp returns a new tensor
    """
    n_examples = targets.shape[0]

    gradient = predictions.clamp(epsilon, 1 - epsilon)

    # Subtract 1 at the target class of every row
    rows = range(gradient.shape[0])
    gradient[rows, targets] -= 1

    return gradient / n_examples

In [426]:
class DataGenerator:
    """
    Generates a toy binary classification set and serves it in batches.

    Examples are drawn uniformly from [0, 1]^number_of_features. An example
    is labelled 1 when its first two coordinates fall strictly inside the
    circle centred at (0.5, 0.5) with radius 1 / sqrt(2 * pi), 0 otherwise.
    """

    def __init__(self, number_of_examples, number_of_features, batch_size = 2, shuffle= True):
        """
        number_of_examples: how many examples ``generate_data`` creates
        number_of_features: coordinates per example (the labelling reads the
            first two)
        batch_size: examples per batch yielded by ``yield_data``
        shuffle: whether ``yield_data`` draws a fresh random order per call
        """
        self.number_of_examples = number_of_examples
        self.number_of_features = number_of_features
        self.batch_size = batch_size
        self.shuffle = shuffle

        self.radius_circle = 1/math.sqrt(2*math.pi)
        self.center_circle = [0.5,0.5]

        # Filled in by generate_data()
        self.data = None
        self.targets = None

    def check_target(self, input_example):
        """Returns 1 if the example lies strictly inside the circle, else 0."""
        squared_distance = math.pow(input_example[0] - self.center_circle[0], 2)\
            + math.pow(input_example[1] - self.center_circle[1], 2)

        return 1 if squared_distance < math.pow(self.radius_circle, 2) else 0

    def generate_data(self):
        """
        Draws a new data set and labels it.

        Returns: (data, targets) with data a (number_of_examples,
                 number_of_features) FloatTensor and targets a
                 (number_of_examples,) LongTensor of 0/1 labels
        """
        self.data = torch.FloatTensor(self.number_of_examples, self.number_of_features).uniform_(0,1)

        # Label each row through the method (the bare name check_target is
        # not defined at module level)
        labels = [self.check_target(example) for example in self.data]
        self.targets = torch.LongTensor(labels)

        return self.data, self.targets

    def yield_data(self):
        """
        Yields (data_batch, target_batch) pairs covering the whole set once.

        Every batch holds ``batch_size`` examples except possibly the last,
        which holds the remainder. With ``shuffle`` enabled a new permutation
        is drawn and kept in ``data_shuffled`` / ``targets_shuffled``.
        The data set is generated on first use if ``generate_data`` has not
        been called yet.
        """
        if self.data is None or self.targets is None:
            self.generate_data()

        if self.shuffle:
            shuffled_indexes = torch.randperm(self.number_of_examples)
            self.data_shuffled = self.data[shuffled_indexes]
            self.targets_shuffled = self.targets[shuffled_indexes]
            data, targets = self.data_shuffled, self.targets_shuffled
        else:
            data, targets = self.data, self.targets

        for batch_start in range(0, self.number_of_examples, self.batch_size):
            # The last batch may be shorter than batch_size
            length = min(self.batch_size, self.number_of_examples - batch_start)
            yield data.narrow(0, batch_start, length),\
                  targets.narrow(0, batch_start, length)
        

In [None]:
def compute_loss_and_accuracy(self, is_training=True):
    """
    Runs one pass over the training or test loader, one example at a time.

    Reads ``self.train_loader`` / ``self.test_loader`` (each providing
    ``get_loader()``), ``self.model``, ``self.loss`` and ``self.optimizer``.
    When ``is_training`` is true the loss derivative is back-propagated for
    every example and ``self.optimizer.step()`` is called after each batch.

    Returns: (mean of the per-example losses, fraction of examples whose
             arg-max prediction equals the arg-max of the target)

    NOTE(review): a function with this exact name is defined again later in
    this notebook; after a top-to-bottom run the later one is in effect.
    """
    loader = self.train_loader if is_training else self.test_loader

    losses = []
    n_seen = 0
    n_correct = 0

    for inputs, labels in loader.get_loader():

        # NOTE(review): accumulated below but never read
        batch_loss = 0
        for sample, label in zip(inputs, labels):
            prediction = self.model.forward(sample)
            n_seen += 1
            if prediction.max(0)[1] == label.max(0)[1]:
                n_correct += 1
            sample_loss = self.loss.compute(prediction, label)
            batch_loss += sample_loss
            losses.append(sample_loss)

            # backward pass only while training
            if is_training:
                self.model.backward(self.loss.derivative(prediction, label))

        # one optimizer step per batch while training
        if is_training:
            self.optimizer.step()

    return sum(losses) / len(losses), n_correct / n_seen

In [416]:
train_data = DataGenerator(number_of_examples=5, number_of_features=2)
training_data, training_target = train_data.generate_data()

In [34]:
def compute(predicted, target):
    """
    Negative log-likelihood of a single example.

    ``target`` is flattened and its arg-max is taken as the class index;
    ``predicted`` is reshaped to one row and the entry at that index is
    passed through -log.
    """
    target_index = target.view(-1, 1).max(0)[1]
    row = predicted.view(1, -1)
    n_rows = row.shape[0]

    # negative log of the value predicted for the target class
    picked = row[range(n_rows), target_index]
    return torch.sum(-torch.log(picked)) / n_rows

In [26]:
import numpy as np

In [27]:
predictions = np.array([[0.25,0.25,0.25,0.25],
                        [0.01,0.01,0.01,0.96]])
targets = np.array([[0,0,0,1],
                   [0,0,0,1]])
ans = 0.71355817782  #Correct answer
x = cross_entropy(predictions, targets)
print(np.isclose(x,ans))

True


In [30]:
predictions_t = torch.FloatTensor([[0.25,0.25,0.25,0.25],
                        [0.01,0.01,0.01,0.96]])

targets_t = torch.FloatTensor([[0,0,0,1],
                   [0,0,0,1]])

In [31]:
x_t = cross_entropy_torch(predictions_t,targets_t)

In [32]:
x_t

tensor(0.7136)

In [33]:
x

0.7135581752992395

In [15]:
torch.pow(torch.FloatTensor([1,2,4]) - torch.FloatTensor([0,1,2]), 2).mean() / 2

tensor(1.)

In [19]:
len(torch.FloatTensor([1,2,4]))

3

In [None]:
class Mse:
    """
    Squared-error loss and its gradient, grouped as static methods so they
    can be called on the class (``Mse.mse(...)``) or on an instance
    (``Mse().mse(...)``).
    """

    @staticmethod
    def mse(predicted_output: torch.FloatTensor, target_output: torch.FloatTensor):
        """Sum over all elements of (predicted - target)^2."""
        return torch.pow(predicted_output - target_output, 2).sum()

    @staticmethod
    def mse_p(predicted_output: torch.FloatTensor, target_output: torch.FloatTensor):
        """Gradient of ``mse`` w.r.t. the prediction: 2 * (predicted - target)."""
        return 2*(predicted_output - target_output)

# Linear fully connected layer

In [69]:
mat = torch.randn(2, 3)
vec = torch.randn(3)
torch.mv(mat, vec)

tensor([-0.0662,  4.4729])

In [None]:
 Suppose your predicted probabilities for the four outcomes of a die are (0.20, 0.40, 0.30, 0.10). Then you roll the die many thousands of times and determine that the true probabilities are (0.15, 0.35, 0.25, 0.25). The CE error for your prediction is:

In [130]:
T_data = [[[1., 2.], [3., 4.]],
          [[5., 6.], [7., 8.]],
          [[9.0, 10.0], [11.0, 12.0]]]
T = torch.tensor(T_data)

In [139]:
T.view(-1,1)

tensor([[ 1.],
        [ 2.],
        [ 3.],
        [ 4.],
        [ 5.],
        [ 6.],
        [ 7.],
        [ 8.],
        [ 9.],
        [10.],
        [11.],
        [12.]])

In [112]:
a = torch.FloatTensor([[[0.0,1.0]],[]])
b = torch.FloatTensor([[0.15,0.35,0.25,0.25],[0.4,0.2,0.2,0.2]])

In [116]:
import torch.nn.functional as F

In [121]:
input = torch.randn(3, 5, requires_grad=True)
target = torch.randint(5, (3,), dtype=torch.int64)
loss = F.cross_entropy(input, target)
#loss.backward()

In [None]:
-torch.log(predicted[range(predicted.shape[0]), target])

In [94]:
class Relu(Module):
    """
    ReLU activation layer.

    ``forward`` caches its input in ``self.output`` (the attribute holds the
    layer input, despite its name) so that ``backward`` can evaluate the
    derivative at the same point.
    """

    def forward(self, *input):
        """Applies ReLU to ``input[0]`` and remembers that input."""
        self.output = input[0]

        return self.relu(self.output)

    def backward(self, *gradwrtoutput):
        """Multiplies ``gradwrtoutput[0]`` element-wise by the ReLU derivative at the cached input."""
        derivatives = self.relu_p(self.output)

        return derivatives * gradwrtoutput[0]

    def relu(self, x):
        """Element-wise max(x, 0)."""
        return torch.clamp(x, min =0)

    def relu_p(self, x):
        """
        Derivative of ReLU: 1 where x > 0, 0 where x <= 0.

        Returns a new tensor. Assigning into ``x`` would overwrite the
        cached input, which is the very tensor the caller passed to
        ``forward``.
        """
        return (x > 0).to(x.dtype)


In [None]:
class Tanh(Module):
    """
    Tanh activation layer.

    ``forward`` caches its input in ``self.output`` (the attribute holds the
    layer input, despite its name) so that ``backward`` can evaluate the
    derivative at the same point.
    """

    def forward(self, *input):
        """Applies tanh to ``input[0]`` and remembers that input."""
        self.output = input[0]

        return self.tanh(self.output)

    def backward(self, *gradwrtoutput):
        """Multiplies ``gradwrtoutput[0]`` element-wise by the tanh derivative at the cached input."""
        derivatives = self.tanh_p(self.output)

        return derivatives * gradwrtoutput[0]

    def tanh(self, to_compute):
        """
        Element-wise hyperbolic tangent.

        Evaluated as sign(x) * (1 - e^(-2|x|)) / (1 + e^(-2|x|)): equal to
        (e^x - e^-x) / (e^x + e^-x) but the exponent is never positive, so
        large |x| saturates at +/-1 instead of producing inf / inf = nan.
        """
        decay = torch.exp(-2 * torch.abs(to_compute))

        return torch.sign(to_compute) * (1 - decay) / (1 + decay)

    def tanh_p(self, x):
        """Derivative of tanh: 1 - tanh(x)^2."""
        return (1 - torch.pow(self.tanh(x),2))


In [None]:
class Linear(Module):
    """
    Fully connected layer computing ``weight @ input + bias`` for a single
    1-D input vector.

    ``backward`` is not implemented yet and ``param`` returns an empty list,
    so the weight and bias are not exposed to an optimizer.
    """

    def __init__(self, in_features, out_features):
        """
        in_features: number of input neurons
        out_features: number of output neurons
        """
        # Number of input neurons
        self.in_features = in_features
        # Number of output neurons
        self.out_features = out_features

        # Initializing the weights with a Xavier-style scaling:
        # samples from a normal distribution with mean 0 and std 1 are
        # multiplied by sqrt(1 / (number_of_input_neurons + number_of_output_neurons))
        scale = (1.0 / (self.in_features + self.out_features)) ** 0.5
        self.weight = torch.Tensor(out_features, in_features).normal_(mean=0, std=1) * scale

        # Zero bias initialization
        self.bias = torch.Tensor(out_features).zero_()

    def forward(self, *input):
        """
        Input: input[0] 1-D tensor with in_features elements
        Returns: 1-D tensor with out_features elements
        """
        # Input from the layer
        self.input_from_layer = input[0]

        # Calculating the output, which is basically the multiplication
        # of the weights with the input layer and adding the bias
        self.output = torch.mv(self.weight, self.input_from_layer) + self.bias

        return self.output

    def backward(self, *gradwrtoutput):
        """Not implemented: always raises ``NotImplementedError``."""
        raise NotImplementedError

    def param(self):
        """Returns an empty list."""
        return []
    

In [5]:
a=[1,2,3]

In [None]:
class Sequential:
    """
    Chains layers: the output of each layer is the input of the next.
    """

    def __init__(self, layers):
        """
        layers: sequence of objects providing ``forward``, ``backward`` and
            ``param``, in the order they are applied
        """
        self.layers = layers

    def forward(self, initial_input):
        """Feeds ``initial_input`` through every layer in order and returns the last output."""
        output_single_layer = initial_input

        # Each layer receives the output of the layer before it
        for layer in self.layers:
            output_single_layer = layer.forward(output_single_layer)

        # The last output of the network
        return output_single_layer

    def backward(self, initial_backward_input):
        """
        Back-propagates ``initial_backward_input`` (the derivative of the
        loss) through the layers in reverse order and returns what the
        first layer's ``backward`` returns.
        """
        output_single_layer_backward = initial_backward_input

        for layer in reversed(self.layers):
            output_single_layer_backward = layer.backward(output_single_layer_backward)

        return output_single_layer_backward

    def param(self):
        """
        Returns a list with one entry per layer, each entry being the value
        returned by that layer's ``param()``.
        """
        # Call param(): appending the attribute itself would collect bound
        # methods instead of parameters
        return [layer.param() for layer in self.layers]
            

In [291]:
class Softmax(Module):
    """
    Softmax layer for a single example: the input is flattened to one
    vector and normalised into probabilities that sum to 1.
    """

    def forward(self, *input):
        """Caches ``input[0]`` and returns its softmax as a 1-D tensor."""
        self.input_from_layer = input[0]

        return self.softmax(self.input_from_layer)

    def backward(self, *gradwrtoutput):
        """
        Returns the gradient w.r.t. the layer input as a 1-D tensor.

        The softmax outputs all depend on every input, so the gradient is
        the Jacobian applied to ``gradwrtoutput[0]`` as a matrix-vector
        product (the Jacobian is symmetric, so no transpose is needed).
        An element-wise product would broadcast to an (n, n) matrix.
        """
        jacobian = self.softmax_p(self.input_from_layer)

        return torch.mv(jacobian, gradwrtoutput[0].reshape(-1))

    def softmax(self, input_to_compute):
        """Softmax of the flattened input."""
        flat = input_to_compute.reshape(-1)

        # Subtracting the maximum leaves the result unchanged and keeps
        # the exponentials from overflowing
        exponentials = torch.exp(flat - flat.max())

        return exponentials / torch.sum(exponentials)

    def softmax_p(self, input_to_compute_p):
        """
        Jacobian of the softmax, shape (n, n):
        entry (i, j) is s_i * (1 - s_i) if i == j and -s_i * s_j otherwise,
        i.e. diag(s) - s s^T.
        """
        softmax_res = self.softmax(input_to_compute_p)

        outer_product = softmax_res.unsqueeze(1) * softmax_res.unsqueeze(0)

        return torch.diag(softmax_res) - outer_product
    

In [None]:
def compute_loss_and_accuracy(self, is_training=True):
    """
    Runs one pass over the training or test loader, one example at a time.

    Reads ``self.train_loader`` / ``self.test_loader`` (each providing
    ``get_loader()``), ``self.model``, ``self.loss`` and ``self.optimizer``.
    When ``is_training`` is true the loss derivative is back-propagated for
    every example and ``self.optimizer.step()`` is called after each batch.

    Returns: (mean of the per-example losses, fraction of examples whose
             arg-max prediction equals the arg-max of the target)
    Raises: ZeroDivisionError if the loader yields no examples
    """
    epoch_losses = []
    total = 0
    correct = 0

    loader = self.train_loader if is_training else self.test_loader

    for x_batch, y_batch in loader.get_loader():

        for x, y in zip(x_batch, y_batch):
            predicted = self.model.forward(x)
            total += 1
            if predicted.max(0)[1] == y.max(0)[1]:
                correct += 1
            loss_value = self.loss.compute(predicted, y)
            epoch_losses.append(loss_value)

            # computing the backward pass for the training part
            if is_training:
                derivative_loss = self.loss.derivative(predicted, y)
                self.model.backward(derivative_loss)

        # updating the model weights during training at the end of the batch
        if is_training:
            self.optimizer.step()

    epoch_val_loss = sum(epoch_losses) / len(epoch_losses)
    return epoch_val_loss, correct/total

In [None]:
class Linear(Module):
    # NOTE(review): this class rebinds the name `Linear` that an earlier cell
    # of this notebook defines by hand. It uses `Parameter`, `init`, `math`
    # and `self.register_parameter`, none of which are imported or defined in
    # the visible part of this notebook (the `Module` base class above only
    # has forward/backward/param) — presumably pasted from the torch.nn
    # source for reference; confirm before running this cell.
    r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to False, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(N, *, \text{in\_features})` where :math:`*` means any number of
          additional dimensions
        - Output: :math:`(N, *, \text{out\_features})` where all but the last dimension
          are the same shape as the input.

    Attributes:
        weight: the learnable weights of the module of shape
            :math:`(\text{out\_features}, \text{in\_features})`. The values are
            initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
            :math:`k = \frac{1}{\text{in\_features}}`
        bias:   the learnable bias of the module of shape :math:`(\text{out\_features})`.
                If :attr:`bias` is ``True``, the values are initialized from
                :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
                :math:`k = \frac{1}{\text{in\_features}}`

    Examples::

        >>> m = nn.Linear(20, 30)
        >>> input = torch.randn(128, 20)
        >>> output = m(input)
        >>> print(output.size())
        torch.Size([128, 30])
    """
    __constants__ = ['bias']

    def __init__(self, in_features, out_features, bias=True):
        """Allocate the weight (and optionally the bias) and initialise them via reset_parameters()."""
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Allocated uninitialised here; values are set by reset_parameters()
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            # Registers 'bias' as None so that `self.bias is not None` checks below work
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the weight with kaiming_uniform_ and, if present, the bias uniformly in [-1/sqrt(fan_in), 1/sqrt(fan_in)]."""
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        """Return F.linear(input, weight, bias)."""
        return F.linear(input, self.weight, self.bias)

    def extra_repr(self):
        """Return a string listing in_features, out_features and whether a bias is present."""
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )