In [1]:
import torch
from torch.version import cuda

In [3]:
randint = torch.randint(-100, 100, (6,))
randint

tensor([ -31,   61,   35,  -91,  -23, -100])

In [5]:
tensor = torch.tensor(([0.1,0.2], [0.3,0.4], [0.5,0.6]))
tensor

tensor([[0.1000, 0.2000],
        [0.3000, 0.4000],
        [0.5000, 0.6000]])

In [7]:
 zeros = torch.zeros(2,3)
 zeros

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [8]:
ones = torch.ones(3,4)
ones

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [21]:
input = torch.empty(2,3)
input

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [23]:
arange = torch.arange(5)
arange

tensor([0, 1, 2, 3, 4])

In [26]:
linspace = torch.linspace(3,10,steps=5)
linspace

tensor([ 3.0000,  4.7500,  6.5000,  8.2500, 10.0000])

In [27]:
logspace =  torch.logspace(start=-10,end=10,steps=5)
logspace

tensor([1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10])

In [28]:
eye=torch.eye(5)
eye

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]])

In [29]:
a = torch.empty((2,3),dtype=torch.int64)
empty_like = torch.empty_like(a)
empty_like

tensor([[0, 0, 0],
        [0, 0, 0]])

In [34]:
'''Cuda's alternative in MAC is Metal Performance Shaders (MPS) from the library Metal, which can leverage in-built MAC GPUs instead of using an external GPU or Cuda'''

device = 'mps' if torch.backends.mps.is_available() else 'cpu'
device

'mps'

In [39]:
import time
import numpy as np

start_time = time.time()
zeros = torch.zeros(2,3)
end_time = time.time()

elapsed_time = end_time-start_time
print(f"elapsed time:{elapsed_time:.8f}")

elapsed time:0.00074697


In [42]:
torch_rand_1 = torch.rand(100,100, 100, 100).to(device)
torch_rand_2 = torch.rand(100,100, 100, 100).to(device)
numpy_rand_1 = torch.rand(100,100, 100, 100)
numpy_rand_2 = torch.rand(100,100,100,100)

start_time = time.time()
#multiply tensors in torch
rand = torch_rand_1 @ torch_rand_2
end_time = time.time()
elapsed_time = end_time - start_time
print(f"elapsed time:{elapsed_time:.8f}")


start_time = time.time()
rand = np.multiply(numpy_rand_1, numpy_rand_2)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"elapsed time:{elapsed_time:.8f}")

elapsed time:0.03003979
elapsed time:0.27669907


In [8]:
import torch

probabilities = torch.tensor([0.1,0.9])
#draw 5 samples from multinomial distribution

samples = torch.multinomial(probabilities, num_samples=10, replacement=True)
samples

tensor([0, 1, 1, 1, 1, 1, 1, 1, 0, 1])

In [10]:
tensor = torch.tensor([1,2,3,4])
new_tensor = torch.cat((tensor, torch.tensor([5])),dim=0)
new_tensor

tensor([1, 2, 3, 4, 5])

In [11]:
out = torch.tril(torch.ones(5,5))
out
'''
Notice how every row builds up on the number of 1s. We can assume these 1s to be the history of the tokens predicted.
This way, as we build up on predictions, we have more history to refer to.
'''

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [12]:
out = torch.triu(torch.ones(5,5))
out

tensor([[1., 1., 1., 1., 1.],
        [0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.]])

In [15]:
#when you exponentiate this...
out = torch.zeros(5,5).masked_fill(torch.tril(torch.ones(5,5))==0, float('-inf'))
print(out)
torch.exp(out)

tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])


tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [19]:
input = torch.zeros(2,3,4)
print(input.shape)
print(input)
#swapping zeroth with the second entry
output = input.transpose(0,2)
print(output.shape)
print(output)

torch.Size([2, 3, 4])
tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])
torch.Size([4, 3, 2])
tensor([[[0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.]]])


In [20]:
tensor1 = torch.tensor([1,2,3])
tensor2 = torch.tensor([4,5,6])
tensor3 = torch.tensor([7,8,9])
#stack tensors along a new dimension - useful for batchsize hyperparameter
stacked_tensor = torch.stack([tensor1, tensor2, tensor3])
stacked_tensor

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [27]:
'''
 torch.nn is a module which has all the utilities and parameters that can be used to build and train a neural network more efficiently. Ex. Linear, Activations, Layers
'''

import torch.nn as nn
sample = torch.tensor([10.,10.,10.])
linear = nn.Linear(3,3,bias=False)
print(linear(sample))

'''
Linear layer linearly transforms the inputs using randomly initialized weights.

Y = X*(W^T) + Bias
So it initializes the weights randomly, and performs matrix multiplication with the input features, resulting in the defined output size transformation.

We can set our own weights using
linear.weights.data = torch.tensor(
    [0.5, 0.1, -0.3],
    [0.2, 0.4,  0.7],
    [-0.6, 0.8, 0.2]
)

why do we need to use a linear transformation of input?

So we are basically looking to train our network to learn the best possible correlations between the
importance of each input feature contributing to the output prediction.

So Linear transformation is a way to scale, and mix up the input data in different ways, until we know the best way to utilize these inputs to predict the output.
So the weights can be considered as the importance we give for each input feature.
The dimensions of the weights matrix is generally [output_features, input_features] and the bias, if mentioned, is [output_features].

During the training, we come up with the best weights to accurately determine the outputs.
'''


tensor([10.3766,  2.4042, 11.1491], grad_fn=<SqueezeBackward4>)


'\nLinear layer linearly transforms the inputs using randomly initialized weights.\n\nY = X*(W^T) + Bias\nSo it initializes the weights randomly, and performs matrix multiplication with the input features, resulting in the defined output size transformation.\n\nWe can set our own weights using\nlinear.weights.data = torch.tensor(\n    [0.5, 0.1, -0.3],\n    [0.2, 0.4,  0.7],\n    [-0.6, 0.8, 0.2]\n)\n\nwhy do we need to use a linear transformation of input?\n\nSo we are basically looking to train our network to learn the best possible correlations between the\nimportance of each input feature contributing to the output prediction.\n\nSo Linear transformation is a way to scale, and mix up the input data in different ways, until we know the best way to utilize these inputs to predict the output.\nSo the weights can be considered as the importance we give for each input feature.\nThe dimensions of the weights matrix is generally [output_features, input_features] and the bias, if mentioned

In [28]:
 '''
 Softmax -

 if we apply softmax to [1,2,3] -
 exp_sum = exp(1)+exp(2)+exp(3)
 [exp(1)/exp_sum, exp(2)/exp_sum, exp(3)/exp_sum]

 Softmax basically converts a vector of raw scores (logits) into probabilities (as explained above).
 These are generally the probabilities associated with each class label.
 '''
 import torch.nn.functional as F
 tensor1 = torch.tensor([1.,2.,3.])
 softmax_out = F.softmax(tensor1, dim=0)
 softmax_out


tensor([0.0900, 0.2447, 0.6652])

In [34]:
'''
nn.Embedding layer - Embedding is a very important concept in NLP, where you associate a word, token or a character to a list of real-valued numbers as a vector representation of that word. This allows us to identify the semantic relations of these words/tokens/charactes in the vector space, based on the closeness.
Imagine another dimension, where you map all these words with their associated vector representations. That dimension is where we have these "vector embeddings" of words.

Since a computer can't technically understand natural language, embedding these words into the vector space using real-valued numbers allows them to identify the semantic connections of these words. This helps in performing various NLP tasks such as dialogue generation, sequence-to-sequence predictions.
'''
vocab_size = 10
embedding_dim = 2
embedding = nn.Embedding(vocab_size, embedding_dim)
input_indices = torch.LongTensor([1,5])
embedded_output = embedding(input_indices)
print(embedded_output)
print(embedded_output.shape)

tensor([[-0.9976,  0.6869],
        [ 0.4539, -0.0964]], grad_fn=<EmbeddingBackward0>)
torch.Size([2, 2])


In [39]:
import torch
"""
If we want to compute dot product of two matrices,
they should adhere to a rule - the dimensions of the matrices should be of the format - lxm, mxn
the resulting matrix will be of lxn

Let matrix A be
| 1,2 |
| 3,4 |
| 5,6 |

Let matrix B be
| 7  8   9  |
| 10 11  12 |

A@B = (1*7)+(2*10); (1*8)+(2*11); (1*9)+(2*12)
      (3*7)+(4*10); (3*8)+(4*11); (3*9)+(4*12)
      (5*7)+(6*10); (5*8)+(6*11); (5*9)+(6*12)

    = | 27  30  33 |
      | 61  68  75 |
      | 95 106 117 |
"""

a = torch.tensor(([1,2],[3,4],[5,6]))
b = torch.tensor(([7,8,9],[10,11,12]))
print(a @ b)
print(torch.matmul(a,b))

tensor([[ 27,  30,  33],
        [ 61,  68,  75],
        [ 95, 106, 117]])
tensor([[ 27,  30,  33],
        [ 61,  68,  75],
        [ 95, 106, 117]])
