### 2.3. Linear Algebra

*This section offers a gentle introduction to the most essential concepts, starting from scalar arithmetic and ramping up to matrix multiplication*

#### 2.3.1. Scalars

In [180]:
import torch
x = torch.tensor([3.0, 2.0])
y = torch.tensor(2.0)
x+y, x*y, x/y, x**y

(tensor([5., 4.]),
 tensor([6., 4.]),
 tensor([1.5000, 1.0000]),
 tensor([9., 4.]))

#### 2.3.2. Vectors

In [181]:
# torch.arange(start, end, step)
x = torch.arange(0,3,1)
print(x, x.shape, x.T.shape)
print(f"dimVector = {len(x)} and x.shape = 3x1")

tensor([0, 1, 2]) torch.Size([3]) torch.Size([3])
dimVector = 3 and x.shape = 3x1


In [182]:
y = torch.arange(3).reshape(3, 1)
print(x * y)

tensor([[0, 0, 0],
        [0, 1, 2],
        [0, 2, 4]])


#### 2.3.3. Matrices

In [183]:
A = torch.arange(6).reshape(3, 2)
print(A)
print(A.T)

# Symmetric matrices A = A.T
A = torch.tensor([[1, 2, 3], [2, 0, 4], [3, 4, 5]])
print(A)
print(A==A.T)

tensor([[0, 1],
        [2, 3],
        [4, 5]])
tensor([[0, 2, 4],
        [1, 3, 5]])
tensor([[1, 2, 3],
        [2, 0, 4],
        [3, 4, 5]])
tensor([[True, True, True],
        [True, True, True],
        [True, True, True]])


#### 2.3.4. Tensors

In [184]:
# Tensors give us a generic way of describing extensions to n_th-order arrays
torch.arange(24).reshape([2, 3, 4])

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])

#### 2.3.5. Basic Properties of Tensor Arithmetic

In [185]:
A = torch.arange(6, dtype=torch.float32).reshape(2, 3)
B = A.clone()  # Assign a copy of A to B by allocating new memory
A, A + B

(tensor([[0., 1., 2.],
         [3., 4., 5.]]),
 tensor([[ 0.,  2.,  4.],
         [ 6.,  8., 10.]]))

In [186]:
print(A.shape, B.shape)
# elementwise product of two matrices, Hadamard product
A * B

torch.Size([2, 3]) torch.Size([2, 3])


tensor([[ 0.,  1.,  4.],
        [ 9., 16., 25.]])

In [187]:
a = 2
X = torch.arange(24).reshape(2, 3, 4)
a + X, (a * X).shape

(tensor([[[ 2,  3,  4,  5],
          [ 6,  7,  8,  9],
          [10, 11, 12, 13]],
 
         [[14, 15, 16, 17],
          [18, 19, 20, 21],
          [22, 23, 24, 25]]]),
 torch.Size([2, 3, 4]))

#### 2.3.6. Reduction

In [188]:
x = torch.arange(3, dtype=torch.float32)
x, x.sum()

(tensor([0., 1., 2.]), tensor(3.))

In [189]:
print(A.shape, A.sum(axis=1).shape)
A.shape, A.sum(axis=1).shape, A, A.sum(axis=1) # -> axis = 1, keep col

torch.Size([2, 3]) torch.Size([2])


(torch.Size([2, 3]),
 torch.Size([2]),
 tensor([[0., 1., 2.],
         [3., 4., 5.]]),
 tensor([ 3., 12.]))

In [190]:
print(A.mean(), A.sum() / A.numel(), A.shape[0])
A.mean(axis=0), A.sum(axis=0) / A.shape[0] # row

tensor(2.5000) tensor(2.5000) 2


(tensor([1.5000, 2.5000, 3.5000]), tensor([1.5000, 2.5000, 3.5000]))

#### 2.3.7. Non-Reduction Sum

In [191]:
sum_A = A.sum(axis=1, keepdims=True)
sum_A, sum_A.shape

(tensor([[ 3.],
         [12.]]),
 torch.Size([2, 1]))

In [192]:
A / sum_A

tensor([[0.0000, 0.3333, 0.6667],
        [0.2500, 0.3333, 0.4167]])

In [193]:
tA = torch.arange(27).reshape([3, 3, 3])
print(tA)

# row by row, first shape is instill
tA.cumsum(axis=0)

tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17]],

        [[18, 19, 20],
         [21, 22, 23],
         [24, 25, 26]]])


tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 11, 13],
         [15, 17, 19],
         [21, 23, 25]],

        [[27, 30, 33],
         [36, 39, 42],
         [45, 48, 51]]])

#### 2.3.8. Dot Products

**Given two vectors x, y**

In [194]:
# torch.dot(x, y) = sum (x.T * y)
y = torch.ones(3, dtype = torch.float32)

# vector: x = x.T
print(x.T, x==x.T)
print((x.T * y).sum())
x, y, torch.dot(x, y)


tensor([0., 1., 2.]) tensor([True, True, True])
tensor(3.)


(tensor([0., 1., 2.]), tensor([1., 1., 1.]), tensor(3.))

#### 2.3.9. Matrix–Vector Products

**Given mxn matrix and n-dim vector**

In [195]:
A.shape, x.shape, torch.mv(A, x), A@x

(torch.Size([2, 3]), torch.Size([3]), tensor([ 5., 14.]), tensor([ 5., 14.]))

In [196]:
# x = 1x3, tA = 3x1
tA = torch.arange(3, dtype=torch.float32).reshape(3, 1)
x@tA

tensor([5.])

#### 2.3.10. Matrix–Matrix Multiplication

In [197]:
B = torch.ones(3, 4)
torch.mm(A, B), A@B

(tensor([[ 3.,  3.,  3.,  3.],
         [12., 12., 12., 12.]]),
 tensor([[ 3.,  3.,  3.,  3.],
         [12., 12., 12., 12.]]))

#### 2.3.11. Norms

- Vector $ \bold{x}: l_p = \|x\|_p = (\sum_{i=1}^{n} |x_i|^p)^{1/p}$
- Frobenius norm - Matrix $ \bold{X}:  \|X\|_F = \sqrt(\sum_{i=1}^{m}\sum_{j=1}^{n}x_{ij}^2)$

In [198]:
print(torch.ones((4, 9)))

# sqrt(4*9*1^2)
torch.norm(torch.ones((4, 9)))

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1.]])


tensor(6.)

In deep learning, we are often trying to solve optimization problems
- maximize the probability assigned to observed data;

- maximize the revenue associated with a recommender model; 

- minimize the distance between predictions and the ground truth observations; 

- minimize the distance between representations of photos of the same person while maximizing the distance between representations of photos of different people.

- these distances, which constitute the objectives of deep learning algorithms, are often expressed as norms.

#### 2.3.12. Discussion

#### 2.3.13. Exercises

In [199]:
# Ex 1: Prove (A.T).T == A
A = torch.rand(2, 3)
(A.T).T == A

tensor([[True, True, True],
        [True, True, True]])

In [200]:
# Ex 2: A.T + B.T == (A+B).T
A = torch.rand(2, 3)
B = torch.rand(2, 1)
A.T + B.T == (A+B).T

tensor([[True, True],
        [True, True],
        [True, True]])

In [201]:
# Ex 3: Is A + A.T symmetric?
A = torch.randint(3, 5, (3, 3))
T = A + A.T
T == T.T

tensor([[True, True, True],
        [True, True, True],
        [True, True, True]])

In [202]:
# Ex 4: len(X) = 2 (WOW)
X = torch.randint(0, 2, (2, 3, 4))
print(X)
print(X.sum(axis = 0))
print(X.sum(axis = 0).sum(axis = 1))
len(X)

tensor([[[0, 0, 1, 1],
         [1, 0, 1, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [1, 1, 1, 1],
         [0, 0, 1, 0]]])
tensor([[0, 0, 1, 1],
        [2, 1, 2, 1],
        [0, 0, 1, 0]])
tensor([2, 6, 1])


2

In [203]:
# Ex 5 -> len(X) always correspond to the length of axis 0 -> the first dim of matrix

# Ex 6: Run A / A.sum(axis = 1)
# shape: row x col, sum -> asix, keep only that attributor -> sum by other attributor
print(A, A.sum(axis = 1)) # sum by row
A / A.sum(axis = 1)

tensor([[3, 3, 4],
        [4, 3, 4],
        [4, 4, 3]]) tensor([10, 11, 11])


tensor([[0.3000, 0.2727, 0.3636],
        [0.4000, 0.2727, 0.3636],
        [0.4000, 0.3636, 0.2727]])

In [204]:
# Ex 7: Travel in Manhattan -> use Norm L1 = |x1 - x2| + |y1 - y2|
# Ex 8: Sum (2, 3, 4): -> 2 + 3 + 4 = 9
X = torch.randint(0, 2, (2, 3, 4))
print(X.shape[0] + X.shape[1] + X.shape[2])

9


In [205]:
# Ex 9: Norm L2
X = torch.arange(0,9,dtype=torch.float).reshape(3, 3)
print(X)
res = 0
for i in X:
    for j in i:
        res += j * j

print(torch.linalg.norm(X) == res**0.5)

tensor([[0., 1., 2.],
        [3., 4., 5.],
        [6., 7., 8.]])
tensor(True)


In [206]:
# Ex 10: A in R^{2^10 x 2^16}, B in R^{2^16 x 2^5}, C in R^{2^5 x 2^14}
'''
m x n, n x p
torch.mm(A, B) or A@B
-> compute (m * p) * (n muls + n adds)

-> AB in R^{2^10 x 2^5}, C in R^{2^5 x 2^14}
-> A in R^{2^10 x 2^16}, BC in R^{2^16 x 2^14}

=> O((AB)C) = 2^(10 + 14 + 2*5) = 2^34 
=> O(A(BC)) = 2^(10 + 5 + 2*16) = 2^47
    
'''
A = torch.rand(2, 3)
B = torch.rand(3, 2)
Ans = torch.zeros((2, 2))
print(A, "\n", B)
for i in range(2):
    for j in range(2):
        Ans[i][j] = 0
        for k in range(3):
            Ans[i][j] += A[i][k] * B[k][j]
print(Ans == torch.mm(A, B))
torch.mm(A, B)

tensor([[0.1974, 0.0648, 0.2604],
        [0.6919, 0.7498, 0.0917]]) 
 tensor([[0.3804, 0.9647],
        [0.0453, 0.8638],
        [0.2342, 0.0497]])
tensor([[True, True],
        [True, True]])


tensor([[0.1390, 0.2593],
        [0.3186, 1.3197]])

In [207]:
# Ex 11 -> Slightly different
%time
A = torch.rand(20, 30)
B = torch.rand(30, 40)
A@B

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


tensor([[ 8.1512,  8.3738,  6.8723,  7.5550,  6.4670,  8.4816,  8.0702,  6.3920,
          7.2685,  8.2481,  8.4397,  6.4160,  7.5176,  8.1775,  8.5091,  6.7916,
          7.0622,  7.5079,  7.0922,  7.4095,  7.0315,  8.4757,  8.7261,  6.9682,
          9.5741,  8.0055,  6.7795,  8.6091,  8.8621,  8.3429,  6.6739,  8.3045,
          8.4317,  7.6374,  6.3798,  6.7931,  7.8707,  7.5754,  7.8255,  8.9366],
        [10.3036,  9.5763,  7.7581,  8.3642,  7.5685,  9.1082,  7.9427,  6.9862,
          9.3636,  8.7989,  8.7772,  9.0248,  7.8074, 10.1574, 10.7980,  8.3722,
          7.4362,  8.1263,  8.0210,  9.9596, 10.0800, 10.0313,  9.8257,  6.9680,
         10.6238, 10.1832,  8.3637,  9.3602,  9.9949,  9.2636,  8.9047,  9.2404,
          9.3598,  8.9625,  8.7506,  8.7501,  9.5070,  8.7402,  9.5386,  8.3707],
        [ 8.9740,  8.7335,  7.0516,  7.9406,  7.2479,  8.3149,  7.8493,  6.7739,
          9.0226,  8.4924,  9.0769,  8.5185,  7.9147,  9.0327,  9.9033,  7.4677,
          7.2303,  8.0544,

In [215]:
%time
import tracemalloc
tracemalloc.start()
A = torch.rand(20, 30)
B = torch.rand(40, 30)
C = B # without clone (1692, 12040) is better in memory efficiency,
      # with clone(2364, 12708)
print(tracemalloc.get_traced_memory())
tracemalloc.stop()
A@C.T

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs
(1460, 11808)


tensor([[ 9.0020,  7.1044,  8.0900, 10.2174,  7.8628,  7.9131,  8.1432,  7.8910,
          7.5302, 10.4786,  9.5091,  9.7601,  8.9581,  9.2501,  8.3976, 10.2384,
          8.3071,  8.1910,  8.7037,  8.5819,  9.1828,  8.4556, 10.0694,  9.4204,
          7.4288,  9.2304,  8.1929,  8.9815,  7.6031,  8.5688, 10.0139,  9.8744,
          7.4123,  6.3364,  9.1461,  9.2136,  9.1498,  6.9270, 10.1612,  7.7086],
        [ 7.3869,  6.3785,  6.5397,  7.7877,  6.6173,  7.0034,  6.2168,  5.9456,
          6.5489,  7.7066,  7.5706,  7.6930,  7.3179,  7.5374,  6.7183,  6.3627,
          6.1777,  6.6612,  6.6948,  7.3187,  8.1783,  6.1424,  7.7090,  6.9271,
          5.9810,  8.0049,  7.4158,  7.9416,  6.1705,  6.9888,  7.8281,  6.5267,
          5.7627,  5.6432,  7.1772,  6.7039,  8.4846,  5.3190,  7.1828,  6.5347],
        [ 8.9471,  6.5951,  7.6864,  9.6427,  7.3254,  7.0270,  6.5421,  7.3427,
          6.4209,  9.0225,  8.9264,  8.4371,  8.1694,  8.9754,  8.4848,  7.8483,
          7.1690,  8.1201,

In [216]:
# Ex 12
A = torch.rand(100, 200)
B = torch.rand(100, 200)
C = torch.rand(100, 200)
D = torch.stack([A, B, C])
D.shape

torch.Size([3, 100, 200])

In [217]:
D[0] == A, D[1] == B, D[2] == C

(tensor([[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]),
 tensor([[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]),
 tensor([[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]])