<a href="https://colab.research.google.com/github/SaieshNeeli/genai/blob/main/Day1_numpy_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NumPy Tutorial
[NumPy](https://numpy.org/) is a Python library that adds support for large, multi-dimensional arrays and matrices, along with a large collection of optimized, high-level mathematical functions to operate on these arrays.

In [None]:
import numpy as np
import pprint

## Efficient NumPy Code

When working with numpy arrays, avoid explicit for-loops over indices/axes at all costs. For-loops will dramatically slow down your code (~10-100x).

In [None]:
# 1000 x 1000 matrix of uniform random samples from [0, 1).
x = np.random.rand(1000, 1000)
# Large arrays are abbreviated with "..." when printed; the shape follows on its own line.
print(x, x.shape, sep="\n")

[[0.2456875  0.1999786  0.64747432 ... 0.29006748 0.3774657  0.85713239]
 [0.96787811 0.4082571  0.58121968 ... 0.02325002 0.24716104 0.46117591]
 [0.4775976  0.88743594 0.37803976 ... 0.51721694 0.58756425 0.61240829]
 ...
 [0.75755607 0.5165391  0.21173416 ... 0.64232976 0.47783265 0.0112866 ]
 [0.86705826 0.03770885 0.28028072 ... 0.72098193 0.14572308 0.84383069]
 [0.28313444 0.08331775 0.56625682 ... 0.98970031 0.39311062 0.00314721]]
(1000, 1000)


In [None]:
%%timeit
# Slow baseline: add 5 to every element of rows 100..999, one element at a time,
# using nested Python loops. Note that the array creation on the next line is
# part of the timed cell body as well.
x = np.random.rand(1000, 1000)
for i in range(100, 1000):
    for j in range(x.shape[1]):
        x[i, j] += 5

304 ms ± 7.56 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
x = np.random.rand(1000, 1000)
# Vectorized equivalent of the nested loop: a single in-place update of rows 100..999.
# A basic slice selects those rows as a view, so the addition happens directly in x.
# Indexing with an integer array (e.g. np.arange(100, 1000)) gives the same result but
# has to gather the rows into a temporary and scatter them back.
x[100:1000, :] += 5

13.5 ms ± 2.3 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


## NumPy Arrays


In [None]:
# Create a NumPy array from a Python list
a = [1, 2, 3, 4, 5]  # normal python list
# Build the 1-D array from the list above; dtype picks the element type explicitly
# (16-bit integers here) instead of letting NumPy infer it.
numpy_a = np.array(a, dtype='int16')

In [None]:
# .shape is the quickest sanity check when debugging array code:
# print the array first, then its shape tuple on the next line.
print(numpy_a, numpy_a.shape, sep="\n")

[1 2 3 4 5]
(5,)


In [None]:
# 2-D NumPy array: each inner list becomes one row (2 rows x 3 columns of floats)
b = np.array([[9.0, 8.0, 7.0], [6.0, 5.0, 4.0]])

In [None]:
# Shape of the 2-D array as a (rows, columns) tuple
print(b.shape)

(2, 3)


In [None]:
# Get the number of dimensions (axes)
print(b.ndim)

# Get the element data type
print(b.dtype)

# Get the total memory used by the elements, in bytes
# (the number of elements is b.size; bytes per element is b.itemsize)
print(b.nbytes)

2
float64
48


In [None]:
# ndmin=2 guarantees at least two dimensions: the 3-element list gets a leading axis of length 1
arr = np.array([1, 2, 3], ndmin=2)
print(arr.shape, arr, sep="\n")

(1, 3)
[[1 2 3]]


## More types of NumPy initializations

In [None]:
# All-0s array. The shape (5, 6, 3) is 3-D: 5 blocks, each with 6 rows and 3 columns
print(np.zeros((5, 6, 3)))

[[[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]]


In [None]:
# 3 x 3 matrix filled with 1s
np.ones((3, 3))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [None]:
# 5 x 2 matrix filled with any chosen number (here 4)
np.full((5,2), 4)

array([[4, 4],
       [4, 4],
       [4, 4],
       [4, 4],
       [4, 4]])

In [None]:
# 5 x 5 identity matrix I: ones on the main diagonal, zeros everywhere else
# For any 5 x 5 matrix A: A @ I = A and I @ A = A
print(np.eye(5))

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


In [None]:
# Random decimal numbers between 0 and 10:
# rand draws uniformly from [0, 1), and multiplying by 10 stretches that to [0, 10)
np.random.rand(4,2) * 10

array([[6.74557245, 8.6344172 ],
       [3.98169006, 0.12358447],
       [6.04129613, 8.27250044],
       [0.93756428, 6.51998731]])

## NumPy Gotchas!

In [None]:
# Python gotcha: plain assignment does not copy.
# After `b = a`, both names refer to the very same array object.
a = np.array([1, 2, 3])
b = a

print(a, b, sep="\n")

[1 2 3]
[1 2 3]


In [None]:
# Write to index 1 through the name b
b[1] = 100
print(b)

[  1 100   3]


In [None]:
# a shows the same change, because a and b are two names for one array
print(a)

[  1 100   3]


In [None]:
# Copying in numpy
a = np.array([1,2,3])
b = a.copy() # REMEMBER TO DO THIS. Otherwise, when b is mutated, a also changes
# Equivalent: b = np.copy(a)
# NOT equivalent: b = a[:]
#   Unlike a Python list, slicing a NumPy array returns a view that shares
#   memory with a, so writing through b would still change a.

# Only the copy is modified; a keeps its original values
b[1] = 100
print(a)
print(b)

[1 2 3]
[  1 100   3]


## NumPy Indexing, Slicing, and Accessing Elements

In [None]:
# A nested Python list may mix element types and is indexed one level at a time:
# [1] picks the second inner list, [2] picks its third item.
python_list = [['apple', 'orange', 'banana'], ['panda', 2, 100]]
python_list[1][2]

100

In [None]:
a = np.array([
    [1,2,3,4,5,6,7],
    [8,9,10,11,12,13,14]
])
# 2 rows x 7 columns:
# [[ 1  2  3  4  5  6  7]
#  [ 8  9 10 11 12 13 14]]
a.shape

(2, 7)

In [None]:
# Get a specific element [row, col]: row index 1, column index 3 (both 0-based)
a[1, 3]

11

In [None]:
# Rows before index 1 (only row 0) and every column except the last two.
# Because the row is selected with a slice, the result is still 2-D.
a[:1, :-2]

array([[1, 2, 3, 4, 5]])

In [None]:
# Get a sub-array with slicing - [row_start:row_end, col_start:col_end]
# (a single element is a[row, col]; end indices are exclusive)
# Can leave out row_start / col_start if 0
a[0:2, :-3]

# note: 0:2 is the first two rows (in this case all rows); :-3 drops the last three columns

array([[ 1,  2,  3,  4],
       [ 8,  9, 10, 11]])

In [None]:
# Get a specific row (row index 0, every column)
row = a[0, :]
print(row)

# Get a specific column (every row, column index 4)
col = a[:, 4]
print(col)

[1 2 3 4 5 6 7]
[ 5 12]


In [None]:
# Elements at indices 3 and 4 of the row (the end index 5 is excluded)
row[3:5]

array([4, 5])

In [None]:
# Modify array in place: overwrite column 2 with one new value per row
a[:,2] = [1,2]
print(a)

[[ 1  2  1  4  5  6  7]
 [ 8  9  2 11 12 13 14]]


In [None]:
# Example of indexing into a 3-D array (shape (2, 2, 2))
c = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
print(c)

# One index per axis, all 0-based:
# index 1 on axis 0, index 1 on axis 1, index 0 on axis 2
print(c[1, 1, 0])

[[[1 2]
  [3 4]]

 [[5 6]
  [7 8]]]
7


In [None]:
# Boolean indexing: `x > 0.5` is a boolean array with the same shape as x;
# indexing with it returns the selected values as a 1-D array
x = np.random.random((3, 4))
print(x[x > 0.5])

[0.6449003  0.74496063 0.82832733 0.63558125 0.86252396 0.63698565
 0.90303286 0.5501042  0.76253409]


## NumPy Shape-Shifting

In [None]:
# Two 1-D arrays of four elements each, to be joined end to end
arr1 = np.array([1, 2, 3, 4])
arr2 = np.array([5, 6, 7, 8])
arr1

array([1, 2, 3, 4])

In [None]:
# The second array to concatenate
arr2

array([5, 6, 7, 8])

In [None]:
# Join the arrays along their existing axis: two length-4 arrays give one length-8 array
np.concatenate((arr1, arr2))

array([1, 2, 3, 4, 5, 6, 7, 8])

In [None]:
# Vertical stacking: each 1-D vector becomes one row of the result
v1 = np.array([1, 2, 3, 4])
v2 = np.array([5, 6, 7, 8])

np.vstack((v1, v2))

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [None]:
# 4 x 5 matrix of ones
h1 = np.ones((4,5))
h1

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [None]:
# 4 x 2 matrix filled with 2s
h2 = np.full((4,2), 2)
h2

array([[2, 2],
       [2, 2],
       [2, 2],
       [2, 2]])

In [None]:
# Horizontal stack: place the arrays side by side, h2's columns first.
# Both inputs need the same number of rows.
np.hstack((h2,h1))

array([[2., 2., 1., 1., 1., 1., 1.],
       [2., 2., 1., 1., 1., 1., 1.],
       [2., 2., 1., 1., 1., 1., 1.],
       [2., 2., 1., 1., 1., 1., 1.]])

In [None]:
# Array to reshape: 4 x 8, filled with 3s (32 elements in total)
a = np.full((4, 8), 3)
print(a)

[[3 3 3 3 3 3 3 3]
 [3 3 3 3 3 3 3 3]
 [3 3 3 3 3 3 3 3]
 [3 3 3 3 3 3 3 3]]


In [None]:
# flatten() returns a 1-D copy containing every element of a
a_flat = a.flatten()
print(a_flat)

[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]


In [None]:
# Reshape a_flat into a rectangular 2 x 16 matrix.
# The total number of elements must stay the same (32 = 2 * 16).
b = np.reshape(a_flat, (2, 16))
print(b)

[[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]
 [3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]]


## NumPy Array Operations

In [None]:
# Small 1-D integer array for the element-wise examples
a = np.array([1,2,3,4])
print(a)

[1 2 3 4]


In [None]:
# Add value to all elements at once (a new array is printed; a itself is unchanged)
print(a + 7)

[ 8  9 10 11]


In [None]:
# Multiply all elements in a by value (a itself is unchanged)
print(a * 3)

[ 3  6  9 12]


In [None]:
# Divide all elements in a by value
# `/` is true division, so the result holds floats even though a holds integers
print(a / 2)

[0.5 1.  1.5 2. ]


In [None]:
# Add 2 arrays together, element by element
b = np.array([4, 5, 6, 7]) # Same shape as a here; in general the shapes must be equal or broadcast-compatible

print(a + b)

[ 5  7  9 11]


In [None]:
# Multiply 2 arrays together, element by element (this is not a matrix product)
print(a * b) # Shapes must be equal or broadcast-compatible

[ 4 10 18 28]


In [None]:
# Take the sin / cos of arrays (applied to each element; inputs are in radians)
# For a lot more (https://numpy.org/doc/stable/reference/routines.math.html)
np.sin(a * b)

array([-0.7568025 , -0.54402111, -0.75098725,  0.27090579])

In [None]:
# Perform linear algebra operations

# shape (2, 2)
a = np.array([[1, 2], [2, 3]])

# shape (3, 2)
b = np.array([[1, 2], [2, 3], [3, 3]])

print(a, b, sep="\n")

[[1 2]
 [2 3]]
[[1 2]
 [2 3]
 [3 3]]


In [None]:
# Error! a is (2, 2) and b is (3, 2): the inner dimensions (2 and 3) differ,
# so the matrix product is undefined and matmul raises ValueError.
# Catch and display the exception so the rest of the notebook still runs on "Run All".
try:
    c = np.matmul(a, b)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 3 is different from 2)

In [None]:
# Transposing swaps the two axes: shape (3, 2) becomes (2, 3)
b.transpose()
# alternatively
# b.T

array([[1, 2, 3],
       [2, 3, 3]])

In [None]:
# With b transposed the inner dimensions agree: (2, 2) x (2, 3) => (2, 3)
c = a @ b.T
# equivalent spelling:
# c = np.matmul(a, b.transpose())
print(c)

[[ 5  8  9]
 [ 8 13 15]]


There are many NumPy operations that can be used to reduce a numpy array along an axis.

Let's look at the np.max operation (documentation: https://numpy.org/doc/stable/reference/generated/numpy.ndarray.max.html).

In [None]:
x = np.array([[1,2],[3,4], [5, 6]])  # 3 rows x 2 columns
# Reduce along axis 1: one maximum per row; the reduced axis disappears from the shape
print(np.max(x, axis = 1))
print(np.max(x, axis = 1).shape)
# keepdims=True keeps the reduced axis with length 1
print(np.max(x, axis = 1, keepdims = True))
print(np.max(x, axis = 1, keepdims = True).shape)
# Reduce along axis 0: one maximum per column
print(np.max(x, axis=0))

[2 4 6]
(3,)
[[2]
 [4]
 [6]]
(3, 1)
[5 6]


Vectors can be represented as 1-D arrays of shape (N,) or 2-D arrays of shape (N, 1) or (1, N). But it's important to note that the shapes (N,), (N, 1), and (1,N) are not the same and may result in different behavior (we'll see some examples below involving broadcasting).

Matrices are generally represented as 2-D arrays of shape (M, N).

The best way to ensure your code gives you the behavior you expect is to keep track of your array shapes and try out small test cases or refer back to documentation when you are unsure.


## NumPy Broadcasting

The term broadcasting describes how NumPy treats arrays with different shapes during arithmetic operations.

General Broadcasting Rules

When operating on two arrays, NumPy compares their shapes element-wise. It starts with the trailing (i.e. rightmost) dimensions and works its way left. Two dimensions are compatible when:

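1. they are equal, or
2. one of them is 1 (in which case, elements on the axis are repeated along the dimension)

If the arrays have a different number of dimensions, the shape of the lower-dimensional array is first padded with 1s on the left. For example, a shape of (3,) is treated as (1, 3), so it can be combined with (3, 1) but not with (3, 4).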

More details: https://numpy.org/doc/stable/user/basics.broadcasting.html

In [None]:
x = np.random.random((3, 4))

y = np.random.random((3, 1))
z = np.random.random((1, 4))

# Both y and z are broadcast to the shape of x before the arithmetic happens:
# y has length 1 on dim 1, so its single column is repeated across the 4 columns;
# z has length 1 on dim 0, so its single row is repeated down the 3 rows.
s = x + y
p = x * z

# First the sum (x, y, x + y), then the product (x, z, x * z)
print(x, "------", y, "------", s, sep="\n")
print("====")
print(x, "------", z, "------", p, sep="\n")

[[0.92104404 0.30751516 0.2228117  0.28354916]
 [0.46584326 0.40006038 0.69054266 0.21889874]
 [0.49480469 0.42787448 0.24225626 0.27949181]]
------
[[0.91792552]
 [0.80324193]
 [0.78081136]]
------
[[1.83896956 1.22544068 1.14073722 1.20147468]
 [1.26908518 1.2033023  1.49378458 1.02214067]
 [1.27561605 1.20868585 1.02306763 1.06030317]]
====
[[0.92104404 0.30751516 0.2228117  0.28354916]
 [0.46584326 0.40006038 0.69054266 0.21889874]
 [0.49480469 0.42787448 0.24225626 0.27949181]]
------
[[0.20495739 0.33225974 0.87709075 0.18982169]]
------
[[0.18877478 0.10217491 0.19542608 0.05382378]
 [0.09547802 0.13292396 0.60566858 0.04155173]
 [0.10141388 0.14216546 0.21248073 0.05305361]]


In [None]:
# (3, 1) * (1, 4): both operands are broadcast, giving a (3, 4) result
# whose entry [i, j] is y[i, 0] * z[0, j]
t = y * z
print(t)

[[0.11690308 0.1698786  0.21848914 0.17464897]
 [0.37917502 0.55100107 0.70866929 0.56647375]
 [0.38982009 0.56647003 0.72856467 0.5823771 ]]


In [None]:
# Shapes of the operands x and y, and of their broadcast sum s
print(x.shape)
print()
print(y.shape)
print(s.shape)

(3, 4)

(3, 1)
(3, 4)


In [None]:
# Both results (sum s and product p) have the same shape as x
print(x.shape)
print()
print(s.shape)
print(p.shape)

(3, 4)

(3, 4)
(3, 4)


In [None]:
a = np.zeros((3, 3))
b = np.array([[1, 2, 3]])  # shape (1, 3): a single row

# The single row of b is repeated for each of the 3 rows of a.
# The empty string reproduces the blank separator line between the two arrays.
print(a, "", a + b, sep="\n")

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

[[1. 2. 3.]
 [1. 2. 3.]
 [1. 2. 3.]]


Let's look at a more complex example.

In [None]:
# Three random arrays with deliberately different shapes
a = np.random.random((3, 4))  # 2-D matrix
b = np.random.random((3, 1))  # 2-D column
c = np.random.random(3)       # 1-D vector, shape (3,)

What is the expected broadcasting behavior for these operations? What do the following operations give us? What are the resulting shapes?

In [None]:
# (3, 1) + (1, 3): each operand is stretched along its length-1 axis, giving (3, 3)
result1 = b + b.T

print(b.shape)
print(b.T.shape)
print(result1.shape)
print(result1)

(3, 1)
(1, 3)
(3, 3)
[[1.07879141 0.71891198 1.02804904]
 [0.71891198 0.35903254 0.66816961]
 [1.02804904 0.66816961 0.97730667]]


In [None]:
# (3, 4) + (3,): shapes are compared starting from the rightmost dimension, so 4 is
# matched against 3. They are neither equal nor 1, and broadcasting fails.
# Print the shapes first so they are visible even though the addition raises, and
# catch the exception so the rest of the notebook still runs on "Run All".
print(a.shape)
print(c.shape)
try:
    result2 = a + c
    print(result2.shape)
    print(result2)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: operands could not be broadcast together with shapes (3,4) (3,) 

In [None]:
# (3, 1) + (3,): c is treated as shape (1, 3), so the result is (3, 3)
# rather than an element-by-element sum of two length-3 vectors
result3 = b + c

print(b.shape)
print(c.shape)
print(result3.shape)
print(result3)

(3, 1)
(3,)
(3, 3)
[[1.38438999 1.07108856 0.58142853]
 [1.02451055 0.71120913 0.2215491 ]
 [1.33364762 1.02034619 0.53068616]]


## Load data from file into NumPy

Here is an example of how we might import data using NumPy, but note that in the subsequent week we will use a standard tool, pandas, to load that data in a much more robust manner!

In [None]:
# Parse the comma-separated file into a 2-D array; skip_header=1 skips the first line.
# NOTE(review): the relative path looks like the sample dataset bundled with Colab —
# adjust it when running elsewhere.
filedata = np.genfromtxt('./sample_data/california_housing_train.csv', delimiter=',', skip_header=1)

print(filedata.shape)

(17000, 9)


In [None]:
# Peek at the first three rows rather than displaying the whole array
filedata[:3]

array([[-1.1431e+02,  3.4190e+01,  1.5000e+01,  5.6120e+03,  1.2830e+03,
         1.0150e+03,  4.7200e+02,  1.4936e+00,  6.6900e+04],
       [-1.1447e+02,  3.4400e+01,  1.9000e+01,  7.6500e+03,  1.9010e+03,
         1.1290e+03,  4.6300e+02,  1.8200e+00,  8.0100e+04],
       [-1.1456e+02,  3.3690e+01,  1.7000e+01,  7.2000e+02,  1.7400e+02,
         3.3300e+02,  1.1700e+02,  1.6509e+00,  8.5700e+04]])

In [None]:
# Take column index 3 from every row
# (presumably the total-rooms column — confirm against the CSV header)
total_rooms = filedata[:, 3]
total_rooms.shape

(17000,)

In [None]:
# First three values of the column
total_rooms[:3]

array([5612., 7650.,  720.])

In [None]:
# Smallest and largest value in the column.
# Use NumPy's vectorized reductions rather than the Python built-ins min()/max(),
# which walk the array one element at a time in the interpreter.
print(np.min(total_rooms))
print(np.max(total_rooms))

2.0
37937.0


In [None]:
# One column holding the constant 4, with one entry per row of filedata.
# The row count is taken from the data itself instead of being hard-coded.
num_persons_living = np.full((filedata.shape[0], 1), 4)
num_persons_living

array([[4],
       [4],
       [4],
       ...,
       [4],
       [4],
       [4]])

In [None]:
# concatenate a new column to existing data
# The new column needs exactly one entry per existing row, so size it from filedata
# rather than hard-coding the row count.
num_persons_living = np.full((filedata.shape[0], 1), 4)

# NOTE: this rebinds filedata, so re-running the cell appends one more column each time.
filedata = np.hstack((filedata, num_persons_living))
print(filedata.shape)

(17000, 10)
