NumPy - Specialized library for doing numerical computations and mathematical operations. (eg - Dot Product, Matrix Multiplication etc).

Official Documentation - https://numpy.org/doc/stable/reference/index.html

In [167]:
import numpy as np

Numpy array - a list like data structure

In [3]:
kanto = np.array([73, 67, 43])
kanto

array([73, 67, 43])

In [4]:
kanto[0]           #Supports 0 based indexing

73

In [5]:
weights = np.array([0.3, 0.2, 0.5])
weights

array([0.3, 0.2, 0.5])

In [6]:
type(kanto)

numpy.ndarray

Operations on numpy arrays

In [7]:
#Dot product

np.dot(kanto, weights)

56.8

In [8]:
np.dot([True, False], [2,3])       #True = 1, False = 0

2

In [9]:
# * - Element wise multiplication between 2 vectors
res = kanto * weights
res

array([21.9, 13.4, 21.5])

In [10]:
# Sum of elements in an array
res.sum()

56.8

Benefits of Numpy over python lists

1. Ease of use - numpy is built for performing mathematical computations
2. Performance boost - Numpy operations are implemented internally using C++ which is compiled to assembly. This makes it much faster than python statements and loops which are interpreted at runtime

In [11]:
l1 = list(range(0, 1000000))
l2 = list(range(1000000, 2000000))

nparr1 = np.array(l1)
nparr2 = np.array(l2)

In [13]:
%%time
res = 0
for x,y in zip(l1, l2):
    res += x*y

res

CPU times: user 164 ms, sys: 36.9 ms, total: 201 ms
Wall time: 212 ms


833332333333500000

In [17]:
%%time
np.dot(nparr1, nparr2)               #np.dot() is ~100 times faster than the equivalent python logic

CPU times: user 2.03 ms, sys: 5.72 ms, total: 7.74 ms
Wall time: 6.24 ms


833332333333500000

In [16]:
%%time
(nparr1 * nparr2).sum()

CPU times: user 2.75 ms, sys: 3.99 ms, total: 6.74 ms
Wall time: 8.51 ms


833332333333500000

Multidimensional Arrays

In [None]:
# 2-d arrays (list of lists) - Matrix

climate_data = np.array([[73, 67, 43],
                         [91, 88, 64],
                         [87, 134, 58],
                         [102, 43, 37],
                         [69, 96, 70]
                        ])

In [19]:
climate_data

array([[ 73,  67,  43],
       [ 91,  88,  64],
       [ 87, 134,  58],
       [102,  43,  37],
       [ 69,  96,  70]])

In [22]:
# 3-d arrays (list of matrices)

arr3 = np.array([
               [[1,1],[1,1]],
               [[10, 20],[30, 40]]
])

arr3

array([[[ 1,  1],
        [ 1,  1]],

       [[10, 20],
        [30, 40]]])

In [26]:
#Getting dimension(also called axes in numpy terminology) info of an np array - use shape attribute
climate_data.shape

(5, 3)

In [24]:
arr3.shape

(2, 2, 2)

In [27]:
mat = np.array([[1,2],
                [4]])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

In [29]:
#All the elements in an np array have the same data type. 
# You can check the datatype of an array using .dtype attribute

print(kanto)
print(kanto.dtype)

print(weights)
print(weights.dtype)

[73 67 43]
int64
[0.3 0.2 0.5]
float64


In [34]:
a1 = np.array([1,'a',2])       #Numpy tries to make a homogenous array by upcasting elements to a common compatible type
print(a1)
print(a1.dtype)

['1' 'a' '2']
<U21


In [None]:
a2 = np.array([1, 5.3])              
print(a2)
print(a2.dtype)

[1.  5.3]
float64


Matrix Multiplication - In numpy this can be done using the np.matmul() method or by using the @ operator

In [35]:
climate_data

array([[ 73,  67,  43],
       [ 91,  88,  64],
       [ 87, 134,  58],
       [102,  43,  37],
       [ 69,  96,  70]])

In [36]:
weights

array([0.3, 0.2, 0.5])

In [37]:
np.matmul(climate_data, weights)

array([56.8, 76.9, 81.9, 57.7, 74.9])

In [38]:
climate_data @ weights

array([56.8, 76.9, 81.9, 57.7, 74.9])

In [39]:
[[1,1],[1,1]] @ [[1,1],[1,1]]

TypeError: unsupported operand type(s) for @: 'list' and 'list'

In [41]:
from urllib import request

In [42]:
request.urlretrieve('https://gist.github.com/BirajCoder/a4ffcb76fd6fb221d76ac2ee2b8584e9/raw/4054f90adfd361b7aa4255e99c2e874664094cea/climate.csv', 
    'climate.csv')

('climate.csv', <http.client.HTTPMessage at 0x10df83b10>)

In [45]:
#np.genfromtxt() method can be used to store structured data from a txt, csv file in a numpy array
climate_data = np.genfromtxt('climate.csv', delimiter=',', skip_header=True)

In [46]:
climate_data

array([[25., 76., 99.],
       [39., 65., 70.],
       [59., 45., 77.],
       ...,
       [99., 62., 58.],
       [70., 71., 91.],
       [92., 39., 76.]])

In [48]:
climate_data.shape

(10000, 3)

In [49]:
yields = climate_data @ weights
yields

array([72.2, 59.7, 65.2, ..., 71.1, 80.7, 73.4])

In [50]:
yields.shape

(10000,)

np.concatenate - Concatenates a tuple of arrays into along an existing axis

In [51]:
help(np.concatenate)

Help on function concatenate in module numpy:

concatenate(...)
    concatenate((a1, a2, ...), axis=0, out=None, dtype=None, casting="same_kind")
    
    Join a sequence of arrays along an existing axis.
    
    Parameters
    ----------
    a1, a2, ... : sequence of array_like
        The arrays must have the same shape, except in the dimension
        corresponding to `axis` (the first, by default).
    axis : int, optional
        The axis along which the arrays will be joined.  If axis is None,
        arrays are flattened before use.  Default is 0.
    out : ndarray, optional
        If provided, the destination to place the result. The shape must be
        correct, matching that of what concatenate would have returned if no
        out argument were specified.
    dtype : str or dtype
        If provided, the destination array will have this dtype. Cannot be
        provided together with `out`.
    
        .. versionadded:: 1.20.0
    
    casting : {'no', 'equiv', 'safe', '

In [61]:
climate_results = np.concatenate((climate_data, yields.reshape(10000, 1)), axis=1)

In [62]:
climate_results

array([[25. , 76. , 99. , 72.2],
       [39. , 65. , 70. , 59.7],
       [59. , 45. , 77. , 65.2],
       ...,
       [99. , 62. , 58. , 71.1],
       [70. , 71. , 91. , 80.7],
       [92. , 39. , 76. , 73.4]])

np.reshape() - Changes the shape (dimensions) of the given array without changing the data. The new shape must be compatible with the original shape

In [63]:
help(np.reshape)

Help on function reshape in module numpy:

reshape(a, newshape, order='C')
    Gives a new shape to an array without changing its data.
    
    Parameters
    ----------
    a : array_like
        Array to be reshaped.
    newshape : int or tuple of ints
        The new shape should be compatible with the original shape. If
        an integer, then the result will be a 1-D array of that length.
        One shape dimension can be -1. In this case, the value is
        inferred from the length of the array and remaining dimensions.
    order : {'C', 'F', 'A'}, optional
        Read the elements of `a` using this index order, and place the
        elements into the reshaped array using this index order.  'C'
        means to read / write the elements using C-like index order,
        with the last axis index changing fastest, back to the first
        axis index changing slowest. 'F' means to read / write the
        elements using Fortran-like index order, with the first index
        c

In [65]:
yields

array([[72.2, 59.7, 65.2, ..., 71.1, 80.7, 73.4]])

In [64]:
yields.shape

(1, 10000)

In [67]:
np.reshape(yields, (5000, 2))

array([[72.2, 59.7],
       [65.2, 56.8],
       [55.8, 69.6],
       ...,
       [56.2, 87.4],
       [49.7, 71.1],
       [80.7, 73.4]])

In [69]:
yields.reshape(10000, 1)

array([[72.2],
       [59.7],
       [65.2],
       ...,
       [71.1],
       [80.7],
       [73.4]])

In [71]:
yields

array([[72.2, 59.7, 65.2, ..., 71.1, 80.7, 73.4]])

In [70]:
yields.shape

(1, 10000)

In [75]:
np.savetxt('climate_results.csv', climate_results, fmt='%.2f', header='Temp, Rainfall, Humidity, Yield_apples', comments='')

Arithmetic Operators: Numpy arrays support arithmetic opertors such as +, -, *, /, %, ^. These operations are done element wise (so even you do array op scalar, this operation is done on each element)

In [79]:
arr1 = np.array([[1, 1],
                 [1, 1]])

arr2 = np.array([[1, 1],
                 [1, 1]])

In [80]:
arr1 + arr2

array([[2, 2],
       [2, 2]])

In [None]:
arr1 * arr2              # * does not do matmul

array([[1, 1],
       [1, 1]])

In [82]:
arr1 / arr2

array([[1., 1.],
       [1., 1.]])

In [83]:
arr1 - arr2

array([[0, 0],
       [0, 0]])

In [84]:
arr1 ** arr2

array([[1, 1],
       [1, 1]])

In [86]:
arr1 + 2

array([[3, 3],
       [3, 3]])

In [87]:
arr1 - 1

array([[0, 0],
       [0, 0]])

In [88]:
arr1 * 3

array([[3, 3],
       [3, 3]])

In [89]:
arr1 / 2

array([[0.5, 0.5],
       [0.5, 0.5]])

In [92]:
arr1 % 1

array([[0, 0],
       [0, 0]])

In [93]:
arr1 ** 3

array([[1, 1],
       [1, 1]])

In [94]:
arr3 = np.array([[1, 1, 1],
                 [1, 1, 1]])

arr3.shape

(2, 3)

In [95]:
arr1.shape

(2, 2)

In [96]:
arr1+ arr3

ValueError: operands could not be broadcast together with shapes (2,2) (2,3) 

Broadcasting

When the expression `arr4 + arr5` is evaluated, `arr5` (which has the shape `(4,)`) is replicated three times to match the shape `(3, 4)` of `arr4`. Numpy performs the replication without actually creating three copies of the smaller dimension array, thus improving performance and using lower memory.

<img src="https://jakevdp.github.io/PythonDataScienceHandbook/figures/02.05-broadcasting.png" width="360">

Broadcasting only works if one of the arrays can be replicated to match the other array's shape.

In [102]:
arr4 = np.array([[1, 2, 3, 4], 
                 [5, 6, 7, 8], 
                 [9, 1, 2, 3]])

arr4.shape

(3, 4)

In [103]:
arr5 = np.array([10, 10, 10, 10])
arr5.shape

(4,)

In [101]:
arr4 + arr5

array([[11, 12, 13, 14],
       [15, 16, 17, 18],
       [19, 11, 12, 13]])

In [105]:
arr6 = np.array([100, 200, 300])
arr6.shape

(3,)

In [106]:
arr4 + arr6

ValueError: operands could not be broadcast together with shapes (3,4) (3,) 

In [107]:
arr4 + arr6.reshape(3,1)

array([[101, 102, 103, 104],
       [205, 206, 207, 208],
       [309, 301, 302, 303]])

Comparision operators with numoy arrays - Numpy arrays also support comparision operators. These perform element wise comparision and return an array of booleans

In [108]:
arr1

array([[1, 1],
       [1, 1]])

In [110]:
arr2 = np.array([
                [1,0],
                [0,1]
                ])

arr2

array([[1, 0],
       [0, 1]])

In [111]:
arr1 == arr2

array([[ True, False],
       [False,  True]])

In [112]:
arr1 > arr2

array([[False,  True],
       [ True, False]])

In [113]:
arr1 < arr2

array([[False, False],
       [False, False]])

In [114]:
arr1 != arr2

array([[False,  True],
       [ True, False]])

In [115]:
(arr1 == arr2).sum()

2

Indexing and Slicing.

numpy extends Pythons list indexing notation []. you can provide a list of , separated indcies or ranges across each dimension of the array to select a specific element from the array or a sub array (i.e slice)

In [116]:
arr6 = np.array([
    [[11, 12, 13, 14], 
     [13, 14, 15, 19]], 
    
    [[15, 16, 17, 21], 
     [63, 92, 36, 18]], 
    
    [[98, 32, 81, 23],      
     [17, 18, 19.5, 43]]])

In [117]:
arr6.shape

(3, 2, 4)

In [123]:
# Single element
arr6[1, 1, 2]

36.0

In [125]:
# Subarray using ranges
arr6[1:, 0:1, :2]                #arr6[1:] - selects all matrices from index 1
                                  # arr6[, 0:1] - from the selected matrices, selects the 0th rows
                                  # arr6[,, :2] - from the selected rows, selects the 0th and the 1st column elements

array([[[15., 16.]],

       [[98., 32.]]])

In [126]:
#Slicing preserves the number of dimensions of the original array
arr6[1:, 0:1, :2].shape              #arr6 also had 3 dimensions

(2, 1, 2)

In [129]:
# Mixing indices and ranges
arr6[1:, 1, 3]

array([18., 43.])

In [130]:
# Mixing indices and ranges
arr6[1:, 1, :3]

array([[63. , 92. , 36. ],
       [17. , 18. , 19.5]])

In [131]:
# Using fewer indices
arr6[1]

array([[15., 16., 17., 21.],
       [63., 92., 36., 18.]])

In [132]:
# Using fewer indices
arr6[:2, 1]

array([[13., 14., 15., 19.],
       [63., 92., 36., 18.]])

In [133]:
# Using too many indices
arr6[1,3,2,1]

IndexError: too many indices for array: array is 3-dimensional, but 4 were indexed

The notation and its results can seem confusing at first, so take your time to experiment and become comfortable with it. Use the cells below to try out some examples of array indexing and slicing, with different combinations of indices and ranges. Here are some more examples demonstrated visually:

<img src="https://scipy-lectures.org/_images/numpy_indexing.png" width="360">

In [134]:
arr7 = np.array([
                 [0,1,2,3,4,5]
                ,[10,11,12,13,14,15]
                ,[20,21,22,23,24,25]
                ,[30,31,32,33,34,35]
                ,[40,41,42,43,44,45]
                ,[50,51,52,53,54,55]
                ])

arr7

array([[ 0,  1,  2,  3,  4,  5],
       [10, 11, 12, 13, 14, 15],
       [20, 21, 22, 23, 24, 25],
       [30, 31, 32, 33, 34, 35],
       [40, 41, 42, 43, 44, 45],
       [50, 51, 52, 53, 54, 55]])

In [135]:
arr7.shape

(6, 6)

In [136]:
arr7[0, 3:5]

array([3, 4])

In [137]:
arr7[:, 2]

array([ 2, 12, 22, 32, 42, 52])

In [138]:
arr7[4:, 4:]

array([[44, 45],
       [54, 55]])

In [139]:
arr7[2:5:2, 0:5:2]

array([[20, 22, 24],
       [40, 42, 44]])

Other useful array creation methods - Numpy provides some useful methods to create arrays of desired shape with fixed or random values.

Official Doc - https://numpy.org/doc/stable/reference/routines.array-creation.html

In [145]:
# All zeros
np.zeros((4,2), dtype=np.int64)

array([[0, 0],
       [0, 0],
       [0, 0],
       [0, 0]])

In [146]:
# All ones
np.ones((3,2))

array([[1., 1.],
       [1., 1.],
       [1., 1.]])

In [149]:
# Identity Matrix
np.eye(2)

array([[1., 0.],
       [0., 1.]])

In [None]:
np.eye(2,3,)

array([[1., 0., 0.],
       [0., 1., 0.]])

In [154]:
#Random Vector
np.random.rand(5)

array([0.8620874 , 0.74697118, 0.85294772, 0.72295869, 0.62946448])

In [155]:
#Random Matrix
np.random.rand(2,3)

array([[0.86790558, 0.52865617, 0.81488127],
       [0.07317612, 0.65723649, 0.99492098]])

In [None]:
#Random Vector - uses Standard Normal Distribuion
np.random.randn(5)

array([ 0.71299907, -1.57390205,  1.85575252, -1.32909232,  2.17480131])

In [157]:
#Random Matrix - uses Standard Normal Distribuion
np.random.rand(2,3)

array([[0.06953603, 0.43700339, 0.87006221],
       [0.90142402, 0.90535496, 0.85639365]])

In [159]:
#Random Matrix of integers
np.random.randint(low=3, high=6, size=(3,2))

array([[4, 4],
       [3, 3],
       [3, 5]])

In [160]:
# Fixed value
np.full((2, 3), 9)

array([[9, 9, 9],
       [9, 9, 9]])

In [161]:
# Range with start, end and step
np.arange(1, 10, 3)

array([1, 4, 7])

In [164]:
# Equally spaced numbers in a range
a = np.linspace(2,32,10)         # The num argument specifies the number of values to be generated within the range

In [165]:
a.shape

(10,)

In [166]:
a.reshape(2,5)

array([[ 2.        ,  5.33333333,  8.66666667, 12.        , 15.33333333],
       [18.66666667, 22.        , 25.33333333, 28.66666667, 32.        ]])