In [2]:
#Intro to Tensors In Pytorch / Numpy

import torch #overall lib
print(torch.__name__) #prints out attributes
print(torch.__version__) #prints the torch version

#a tensor is just a multi-dimentional array. (a little more complex but whatevs)



torch
2.5.1


In [3]:
#lets create a scalar in pytorch
scalar = torch.tensor(data=1,dtype=torch.int32) #defaults to teh number provided. torch.tensor(the factory function to create a tensor from data) vs torch.Tensor (the class)
print(f"the scalar is {scalar}, with dimensions: {scalar.ndim}, shape {scalar.shape}, and type {scalar.dtype}, can return item with {scalar.item()}")
#note: this will return dim 0 but data = [1] is dim 1 (not a scalar anymore)




the scalar is 1, with dimensions: 0, shape torch.Size([]), and type torch.int32, can return item with 1


In [4]:
#tensors: Rank/Order = in linalg the number of vectors in the basis of column space or row space(for a matrix) but in the context or tensors its just
#another way of saying dim. ie a dim 3 tensors means its Rank is also 3

vector = torch.tensor(data=[1,2,3,4]) #NOTE: in Linalg, this vector is Dim4 since it is is R4, however we consider this a dim 1 object
vector.ndim,vector.shape #predict ndim is 1, and shape is [4], note that torch.Size is an Listlike object (indexable)



(1, torch.Size([4]))

In [5]:
#matrix:

MATRIX = torch.tensor(data=[[1,2],[3,4]],dtype=torch.int32) #will be 2dim object, also by convention make it all caps
MATRIX.ndim, MATRIX.shape # note the indexes in row major order

(2, torch.Size([2, 2]))

In [6]:
#TENSOR
#broadly, torch.tensor is just a function that makes tensors, but in the case that the data is a tensor itself, like torch.tensor(x)
#it is preferred to do x.clone().detach() clone means they do not share memory with the og tensor, and detach means they are detached from the computational
#graph
#torch.tensor is just torch.Tensor.clone().detach()
#torch.as_tensor() is better when you dont want to copy the data, and can instead wrap it as a tensor (usually a list ndim list or numpy array or something)

#If I use x in some computation that leads to a scalar loss, and I call loss.backward(), PyTorch should compute x.grad. (thats what requiresgrad means)
#it auto calls loss.backwards() on the tensors when computing the grad

#exe:
a = torch.randn(size=(2,2),requires_grad=True,dtype=torch.float32)
b = a * 2
c = b.sum()
c.backward()
print(a.grad) #cannot do b.grad since b is not a leaf node, and autograd only calculates grad relative to leaf nodes
print(a.grad_fn)


TENSOR = torch.tensor([[[1,2],[3,4]],[[5,6],[7,8]]]) #size 2,2,2
TENSOR.shape, TENSOR.ndim

tensor([[2., 2.],
        [2., 2.]])
None


(torch.Size([2, 2, 2]), 3)

In [7]:
#ML process starts with random tensor and slowly adjusts it based on data.
#first initialize a random tensor of values, then based on some data adjust the tensors so they better reflect patterns in the data

random_tensor = torch.rand(size=(3,2))
random_tensor.shape,random_tensor.ndim

(torch.Size([3, 2]), 2)

In [8]:
#zero tensor:
zeros = torch.zeros(size=(3,2))
zeros_like = torch.zeros_like(random_tensor)

#one tensors
ones = torch.ones(size=(3,2))
ones_like = torch.ones_like(ones)

In [9]:
#use the arange function to make a 1dim tensor of a list/range
import numpy as np
elements = torch.arange(start=1,end=10,step=2)
#as a numpy array:
nump = np.array(elements) #thats how to do it
elements.device

#all useful params
elements,elements.shape, elements.ndim, elements.dtype, elements.device


(tensor([1, 3, 5, 7, 9]), torch.Size([5]), 1, torch.int64, device(type='cpu'))

In [None]:
#tensor operations

a = torch.randint(low=1,high=10,size=(3,2)) # doesn't appear the autocomplete
b = torch.randint_like(a,low=1,high=10)
#addition
print(a,"\n\n",b)
a + b

#multiplication:
a * b #elementwise, can also use torch.mul and torch.add to do same

#matrix multiplication:
torch.matmul(a,b.T) #ensure dims match

tensor([[9, 4],
        [5, 7],
        [2, 7]]) 

 tensor([[7, 4],
        [6, 9],
        [4, 9]])


tensor([[79, 90, 72],
        [63, 93, 83],
        [42, 75, 71]])

In [None]:
#Broadcasting

#operations like addition and multiplication can be done for any matricies whose sizes are broadcastable

#broadcastable, if all dims are equal or one of them is equal to one:
#exe size (3,2) and (3,1) are broadcastable, since just copies the (3,1) to (3,2) easily
#exe size (3) and (3,2) also works since can just unsqueeze then broadcast

a = torch.randint(low=1,high=10,size=(3,2))
b = torch.randn(size=(2,)) #NOTE: pytorch PREPENDS but doesnt postpend, so (3,2) and (2,) is good, but not (3,2) , (3)!

#if you want that to work, you will have to manually unqsqueeze dim=1 and then it should work

#how to go from tensor to int tuple: 
torch.dot(torch.flatten(torch.tensor(a,dtype=torch.float)),torch.flatten(b.broadcast_to(size=a.shape)))

#Transpose vs Permute: permute reorders


c = torch.randn(size=(2,3,4)) 
print(c)
torch.transpose(c,dim0=1,dim1=2) #transpose .T works only when 2 dims, other transpose just swaps two dims, and permute swaps all dim
#Functionally, whats happening is a change in stride!

print(c)

torch.mul(a,b)

tensor([[[ 0.1216,  0.5397, -0.3450, -1.7290],
         [ 0.7293,  0.9217, -1.3062,  2.0650],
         [-0.6160,  0.7263, -0.9313,  0.5312]],

        [[-0.0391,  0.8063,  0.2621,  1.2398],
         [ 0.0600,  0.2732, -0.3274, -0.5025],
         [ 0.2304, -0.8531, -0.1790,  0.4482]]])
tensor([[[ 0.1216,  0.5397, -0.3450, -1.7290],
         [ 0.7293,  0.9217, -1.3062,  2.0650],
         [-0.6160,  0.7263, -0.9313,  0.5312]],

        [[-0.0391,  0.8063,  0.2621,  1.2398],
         [ 0.0600,  0.2732, -0.3274, -0.5025],
         [ 0.2304, -0.8531, -0.1790,  0.4482]]])


  torch.dot(torch.flatten(torch.tensor(a,dtype=torch.float)),torch.flatten(b.broadcast_to(size=a.shape)))


tensor([[ 2.2312, -5.7378],
        [11.1559, -1.6394],
        [15.6182, -6.5575]])

In [None]:
#Stride and Storage in memory explanation

# reshape changes the stride aswell, just keeps it in order

#exe: lets suppose you create some tensor
torch.manual_seed(42)
random_tensor = torch.tensor(data=[[[1,2],[3,4]],[[5,6],[7,8]]]) #this is a size 2,2,2 tensor, 
random_tensor

#However, in memory it is stored continuously in row-major order like this (since memory is 1D)
#Row major order just means that the highest dimension moves the fastest, so in order
# tensor[0][0][0], [0][0][1], [0][1][0], [0][1][1] ...
#the "highest dimension", dim=2 is the one on the end, which corresponds to which part in the yellow it is
# so, in memory it is stored as:
# 1,2,3,4,5,6,7,8 

#now, whenever you do any tensor manipulations to it, be that a transpose, a permute, a reshape, or whatever, we want to avoid
#making a new tensor since that is very costly, O(n), since we will anyways no longer need the og tensor.
#instead, we can get away with just changing the stride!

#functionally, when you write rand_tensor[0][1][0], what the computer does, like for arrays, it adds some offset to each index
# or &rand_tensor[0] + 0(2*2) + 1 * (2) + 0(1), where the parenthesis the "stride", corresponding to how many indices down the list
# you gotta move to find it

#in a reshape/view, you are just changing the strides, ensuring that the number of elements is conserved. 
#sidenote: reshape is just a view that will copy if necessary (ie if no longer in row major)
# so, for a (2,2,2), you can very easily create a new tensor using tensor.reshape, which can share the data with the og tensor, only
#instead it has a different stride.

#IN a transpose however, you are making the strides non-contiguous, like (1,2,4), which means it is no longer in row-major order

#MASSIVE NOTE: if from here, ie after a transpose/permute, you descide to reshape, it is forced to copy the data since 
#if you just change the stride, it will be the reshape of the untransposed tensor, not the new one:

#exe: initially stride is (4,2,1) for
# 1,2,3,4,5,6,7,8
#if you permute (2,1,0), new stride is (1,2,4) so 
#[0][0][1] will be the 4th element down the list, or 5, but the data is unchanged
#however, if i now ask for a resize, and you just change the stride as per usual, it will not be correct since (8,1,1) for exe will 
#only be correct for the og tensor, so you have to copy the data back in to row-major order, and THEN apply the reshape




#

tensor([[[1, 2],
         [3, 4]],

        [[5, 6],
         [7, 8]]])

In [None]:
#Matrix Multiplication

#for transpose, torch.transpose(x,dim0,dim1) to swap them, or .T

a = torch.randint(low=1,high=10,size=(3,2))
b = torch.randint_like(a,low=a.min(),high=a.max())
print(a,b)

c = a.matmul(b.T)

#Matmul in higher dims:

#for (10,3,2) matmul by (10,2,3), treated as 10 matrix multiplications, meaning resulting object is (10,2,2) 
# for for (2,3,4,5,6) times (2,3,4,6,5), it results in (2,3,4,5,5) or a complex tensor or matrix multiplications

tensor([[3, 4],
        [3, 8],
        [1, 6]]) tensor([[5, 3],
        [7, 1],
        [5, 5]])


In [None]:
#now for general torch functions

a = torch.arange(start=1,end=20,step=2)
a.min(),a.max(), a.sum(),torch.tensor(a,dtype=torch.float).mean() # these return tensors themselves, useful for backprop
# a.backward() will calculate teh a.grad() for each of the values

#argmax, what it does is it returns the index of the largest value in a tensor

a.argmax().item() # element 9 has the largest value

#for 2d tensor
b,b.shape,b.argmax(dim=1),b.argmax(dim=0) #argmax by itself, not sure what it does in higher dims, but when you specify a dimension, it flattens the
#tensor in that dimension, and instead replaces it with the subtensor with teh index of the larger value


#ie for b.argmax(dim=1), assuming teh object is of shape (3,2,2)
#for the dimensions lower than dim, it is just describing the shape of teh object, ie it will repeat the task for all of them
#for the dimension in question, it will vary it whilst keeping the higher dims constant, and then select the max value, and set it at
#the new position (upperdim,lower dim, lower dim) with the dim in question missing

#exe
a = torch.randint(size=(3,2,4),low=1,high=11)
a,a.argmax(dim=1)

#here dim=1 is the dim in question, so the 0th dimension will just be kept as is and this process will be done for all iterations of it
#on the subtensors, so output is (3,...)

# then at the dim in question REMEMBER IT IS NOT ASKING TO FIND THE BIGGEST ELEMENT IN EACH ROW, BUT THE BIGGEST ELEMENT ACROSS ROWS
#thus it will start with [0][i][0] and for all i's possible, and find the max. Then, this max will be put at (0,0)
#then it will try  [0][i][1], find teh max and put it at [0][1] of the result
# it will repeat this process untill it obtains 

# exe
#start with 0 for 0th dim, compare ACROSS rows
#10 > 4, so [3,0] is 1, 9 > 8 so [3,1] is 1 untill you get [1,1,0,0]
#then repeat with [1] and [2] and thats the result




  a.min(),a.max(), a.sum(),torch.tensor(a,dtype=torch.float).mean() # these return tensors themselves, useful for backprop


(tensor([[[ 4,  8,  4,  9],
          [10,  9,  3,  6]],
 
         [[ 6,  5,  9,  2],
          [ 8, 10,  3,  7]],
 
         [[ 9,  3,  9,  3],
          [ 2,  1,  1,  7]]]),
 tensor([[1, 1, 0, 0],
         [1, 1, 0, 1],
         [0, 0, 0, 1]]))

In [None]:
#same with stack

In [None]:
#reshape, view, transpose, stack, unsqueeze, squeeze, from_numpy, 
torch.manual_seed(42)
tensor = torch.randint(size=(3,2),low=1,high=14)
tensor2 = tensor.type(torch.float32)# NOTE: use type to return teh tensor just with a new type

tensor1 = torch.randn(size=(4,3,2))
print(tensor1)
tensor2 = tensor1.reshape(shape=(12,2)) #perfectly valid, doesn't need to keep dimensions
print(tensor2)
tensor3 = torch.Tensor.view(tensor1,size=(12,2))
print(tensor3)

#as for stacking, that works by inserting a dimension and moving the other ones aside. so when stacking two (3,2)s at dim=1
#you will be creating a (3)


tensor([[[-0.0431, -1.6047],
         [ 1.7878, -0.4780],
         [-0.2429, -0.9342]],

        [[-0.2483, -1.2082],
         [-2.3169, -0.2168],
         [-1.3847, -0.8712]],

        [[-0.2234,  1.7174],
         [ 0.3189, -0.4245],
         [-0.8286,  0.3309]],

        [[-1.5576,  0.9956],
         [-0.8798, -0.6011],
         [-1.2742,  2.1228]]])
tensor([[-0.0431, -1.6047],
        [ 1.7878, -0.4780],
        [-0.2429, -0.9342],
        [-0.2483, -1.2082],
        [-2.3169, -0.2168],
        [-1.3847, -0.8712],
        [-0.2234,  1.7174],
        [ 0.3189, -0.4245],
        [-0.8286,  0.3309],
        [-1.5576,  0.9956],
        [-0.8798, -0.6011],
        [-1.2742,  2.1228]])
tensor([[-0.0431, -1.6047],
        [ 1.7878, -0.4780],
        [-0.2429, -0.9342],
        [-0.2483, -1.2082],
        [-2.3169, -0.2168],
        [-1.3847, -0.8712],
        [-0.2234,  1.7174],
        [ 0.3189, -0.4245],
        [-0.8286,  0.3309],
        [-1.5576,  0.9956],
        [-0.8798, -0.6011],

In [None]:
#also, the == or .eq is useful for obtaining a tensor of booleans (useful for accuracy function and stuff like that)

In [None]:
#Now finally for the linear layer:

torch.manual_seed(42)

layer = torch.nn.Linear(in_features=2,out_features=6) #stores a 6 by 2 matrix internally, then multiplies by the transpose or a (2,6) which
#takes a matrix with 2 features to 6 featues in the last element
x = torch.randn(size=(3,2))

#functionally, then input, is a tensor, but can really be thought of as a bunch of vectors of size 2 (the input size)
# from here, we will be multiplying that vector by the weights
print(x)
print(layer.state_dict())
output = layer(x)
print(output)

output_1_same = 0.3930*0.5406 + 0.4327*0.5859 + 0.5224
print(f"{output_1_same} should be the same as {output[0][0]}, which it {"is" if abs(output[0][0] - output_1_same) < 0.01 else "isnt"}")


#what this does is x * wT + b

tensor([[ 0.3930,  0.4327],
        [-1.3627,  1.3564],
        [ 0.6688, -0.7077]])
OrderedDict({'weight': tensor([[ 0.5406,  0.5869],
        [-0.1657,  0.6496],
        [-0.1549,  0.1427],
        [-0.3443,  0.4153],
        [ 0.6233, -0.5188],
        [ 0.6146,  0.1323]]), 'bias': tensor([ 0.5224,  0.0958,  0.3410, -0.0998,  0.5451,  0.1045])})
tensor([[ 0.9888,  0.3117,  0.3418, -0.0555,  0.5656,  0.4033],
        [ 0.5818,  1.2026,  0.7456,  0.9326, -1.0080, -0.5535],
        [ 0.4686, -0.4747,  0.1364, -0.6240,  1.3291,  0.4219]],
       grad_fn=<AddmmBackward0>)
0.98837473 should be the same as 0.9888474941253662, which is is


In [147]:
#also, can use .numpy to turn tensor into numpy array
#lets just see the effects of a linear transform on linear data:

#in linalg, what does it mean to be transformed in this way? well it is a change in basis, functionally

#suppose you have some matrix:
torch.manual_seed(42)

MATRIX = torch.randint(low=1,high=11,size=(5,4)).type(torch.float32)
print(MATRIX)
#and
WEIGHTS = torch.randn_like(MATRIX).T
print(WEIGHTS)

tensor([[3., 8., 7., 5.],
        [7., 6., 1., 5.],
        [1., 4., 9., 5.],
        [1., 5., 2., 3.],
        [6., 6., 8., 7.]])
tensor([[-0.7581,  0.0349,  1.3123, -1.4181,  1.1790],
        [ 1.0783,  0.3211,  0.6872,  0.8963, -0.4345],
        [ 0.8008,  1.5736, -1.0892,  0.0499, -1.3864],
        [ 1.6806, -0.8455, -0.3553,  2.2667, -1.2862]])


In [153]:
#HW:
#1: random tensor shape 7,7
torch.manual_seed(0)
random_tensor = torch.rand(size=(7,7))
print(random_tensor)

#mat mult by another tensors size (1,7)
another_random = torch.rand(size=(1,7))

resulting_tensor = random_tensor.matmul(another_random.T)
print(resulting_tensor)

tensor([[0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901],
        [0.8964, 0.4556, 0.6323, 0.3489, 0.4017, 0.0223, 0.1689],
        [0.2939, 0.5185, 0.6977, 0.8000, 0.1610, 0.2823, 0.6816],
        [0.9152, 0.3971, 0.8742, 0.4194, 0.5529, 0.9527, 0.0362],
        [0.1852, 0.3734, 0.3051, 0.9320, 0.1759, 0.2698, 0.1507],
        [0.0317, 0.2081, 0.9298, 0.7231, 0.7423, 0.5263, 0.2437],
        [0.5846, 0.0332, 0.1387, 0.2422, 0.8155, 0.7932, 0.2783]])
tensor([[1.8542],
        [1.9611],
        [2.2884],
        [3.0481],
        [1.7067],
        [2.5290],
        [1.7989]])
