For additional information, see Python Data Science Handbook chapter 2


In [1]:
import numpy as np

# Basic Math

In [65]:
# Integers 0 through 3; the arithmetic cells that follow all operate on this array.
x = np.arange(0, 4)
print(x)

[0 1 2 3]


In [3]:
print(x + 5)  # the scalar 5 is added to every element of x

[5 6 7 8]


In [4]:
print(x - 5)  # 5 is subtracted from every element of x

[-5 -4 -3 -2]


In [5]:
print(x * 2)  # every element of x is doubled

[0 2 4 6]


In [6]:
print(x / 2)  # true division: the integer input yields a float result

[0.  0.5 1.  1.5]


In [7]:
print(-x)  # unary minus flips the sign of every element

[ 0 -1 -2 -3]


In [8]:
print(x ** 2)  # every element is raised to the power 2

[0 1 4 9]


In [9]:
print(x % 2)  # remainder of each element after dividing by 2

[0 1 0 1]


In [10]:
print(abs(x))  # Python's built-in abs() works element-wise on the array

[0 1 2 3]


# Trig functions
Note that the functions are preceded by `np.`

In [12]:
# Three evenly spaced angles from 0 to pi inclusive: 0, pi/2 and pi.
theta = np.linspace(0.0, np.pi, num=3)
print(theta)

[0.         1.57079633 3.14159265]


In [14]:
print(np.sin(theta))  # sin(pi) prints as ~1.2e-16 instead of exactly 0: floating-point rounding

[0.0000000e+00 1.0000000e+00 1.2246468e-16]


In [15]:
print(np.cos(theta))  # cos(pi/2) prints as ~6.1e-17 instead of exactly 0: floating-point rounding

[ 1.000000e+00  6.123234e-17 -1.000000e+00]


In [16]:
print(np.tan(theta))  # tan(pi/2) is mathematically undefined; the huge ~1.6e16 is a floating-point artifact

[ 0.00000000e+00  1.63312394e+16 -1.22464680e-16]


# Log and Exp

In [18]:
# Powers of ten, so the base-10 logarithms come out as whole numbers.
x = np.array((1, 10, 100))
# First line printed: natural log (base e); second line: common log (base 10).
print(np.log(x), np.log10(x), sep="\n")

[0.         2.30258509 4.60517019]
[0. 1. 2.]


In [20]:
# Exponents 0, 1, 2; the exponential examples below all reuse y.
y = np.arange(0, 3)
print(np.exp(y))  # e raised to each element of y

[1.         2.71828183 7.3890561 ]


In [21]:
print(np.exp2(y))  # 2 raised to each element of y

[1. 2. 4.]


In [23]:
print(np.power(3, y))  # 3^y: the base 3 raised to each element of y

[1 3 9]


# Aggregates

you can use `sum()`

or `np.sum()`

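`np.sum()` is much faster than Python's built-in `sum()`, but the two don't always behave the same way: on a multidimensional array, for example, the built-in `sum()` only adds along the first axis, while `np.sum()` adds every element unless an `axis` is given.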

In [26]:
# Integers 0..99, totalled here with Python's built-in sum().
x = np.arange(0, 100)
print(sum(x))

4950


In [27]:
print(np.sum(x))  # NumPy's aggregate gives the same total as the built-in sum for this 1-D array

4950


In [30]:
# 10,000 uniform random floats.
# NOTE(review): no seed is set in the visible cells, so these values (and any min/max
# printed from them) change on every fresh run.
big_array = np.random.rand(10000)
# Time Python's built-in sum against NumPy's sum on the same data.
%timeit sum(big_array)
%timeit np.sum(big_array)  # the np version is much faster

1.4 ms ± 21.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
6.45 µs ± 127 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## min and max

In [31]:
# Built-in min/max walk the array one element at a time; minimum on the first line, maximum on the second.
print(min(big_array), max(big_array), sep="\n")

4.461560759394523e-05
0.9999866903193898


In [32]:
# NumPy's aggregates report the same extremes; minimum on the first line, maximum on the second.
print(np.min(big_array), np.max(big_array), sep="\n")

4.461560759394523e-05
0.9999866903193898


In [33]:
# Time the built-in min against NumPy's min on the same array.
%timeit min(big_array)
%timeit np.min(big_array)  # the np version is much faster

572 µs ± 21.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
5.88 µs ± 41.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## summaries for matrices

In [48]:
# 3x4 matrix of uniform random floats.
# NOTE(review): unseeded in the visible cells, so re-running produces different numbers.
M = np.random.random((3, 4))
print(M)

[[0.00128174 0.82425739 0.21038845 0.16056064]
 [0.51324753 0.9140029  0.63573812 0.59154381]
 [0.63820627 0.44657141 0.54855768 0.43981146]]


In [49]:
sum(M)  # built-in sum adds the rows to each other, giving one total per column

array([1.15273554, 2.18483169, 1.39468424, 1.19191591])

In [50]:
np.sum(M)  # with no axis given, np.sum adds every element into a single scalar

5.924167390501056

In [52]:
np.sum(M, axis = 0)  # axis=0 collapses the rows: one sum per column (same result as the built-in sum)

array([1.15273554, 2.18483169, 1.39468424, 1.19191591])

In [53]:
np.min(M, axis = 0)  # smallest value in each column

array([0.00128174, 0.44657141, 0.21038845, 0.16056064])

## dealing with nan or none
`nan` ("not a number") is a special floating-point value that stands for a missing or undefined numeric result.

In [55]:
# Build a NaN from its string spelling; the value is an ordinary Python float.
x = float("nan")
print(x, type(x), sep="\n")

nan
<class 'float'>


In [62]:
np.sum([x, 2])  # a single NaN makes the whole sum NaN

nan

In [63]:
np.nansum([x, 2])  # the NaN entry is ignored, so only the 2 is summed

2.0

The following table provides a list of useful aggregation functions available in NumPy:

|Function Name      |   NaN-safe Version  | Description                                   |
|-------------------|---------------------|-----------------------------------------------|
| ``np.sum``        | ``np.nansum``       | Compute sum of elements                       |
| ``np.prod``       | ``np.nanprod``      | Compute product of elements                   |
| ``np.mean``       | ``np.nanmean``      | Compute mean of elements                      |
| ``np.std``        | ``np.nanstd``       | Compute standard deviation                    |
| ``np.var``        | ``np.nanvar``       | Compute variance                              |
| ``np.min``        | ``np.nanmin``       | Find minimum value                            |
| ``np.max``        | ``np.nanmax``       | Find maximum value                            |
| ``np.argmin``     | ``np.nanargmin``    | Find index of minimum value                   |
| ``np.argmax``     | ``np.nanargmax``    | Find index of maximum value                   |
| ``np.median``     | ``np.nanmedian``    | Compute median of elements                    |
| ``np.percentile`` | ``np.nanpercentile``| Compute rank-based statistics of elements     |
| ``np.any``        | N/A                 | Evaluate whether any elements are true        |
| ``np.all``        | N/A                 | Evaluate whether all elements are true        |

## Broadcasting

This is a similar concept to recycling values in R, but it only works when the dimensions are compatible.

In [67]:
# Two vectors of equal length are added element by element.
a, b = np.array([1, 2, 3]), np.array([4, 5, 6])
a + b

array([5, 7, 9])

In [68]:
c = np.array([7,8])
# Shapes (3,) and (2,) cannot be broadcast together, so this addition fails.
# The error is caught and its message printed, so the failure is still demonstrated
# but the notebook keeps running under "Restart Kernel -> Run All".
try:
    a + c
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: operands could not be broadcast together with shapes (3,) (2,) 

In [73]:
print(a)  # reminder: a is the length-3 vector [1 2 3]

[1 2 3]


In [69]:
# 3x3 matrix filled with the float 1.0.
e = np.ones((3, 3))
print(e)

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]


In [71]:
print(e + a)  # the length-3 vector a is added to every row of the 3x3 matrix e

[[2. 3. 4.]
 [2. 3. 4.]
 [2. 3. 4.]]


In [72]:
print(e + a.reshape([3,1]))  # reshaped to a 3x1 column, a is added to every column of e instead

[[2. 2. 2.]
 [3. 3. 3.]
 [4. 4. 4.]]


In [76]:
# Stack a and b on top of each other as the two rows of a 2x3 matrix.
d = np.vstack([a,b])
print(d)

[[1 2 3]
 [4 5 6]]


In [78]:
d + a  # a has as many elements as d has columns, so it is added to each row of d

array([[2, 4, 6],
       [5, 7, 9]])

In [80]:
print(c)  # reminder: c has only two elements

[7 8]


In [81]:
# d has shape (2, 3) and c has shape (2,): the trailing dimensions (3 vs 2) differ,
# so the arrays cannot be broadcast together. The error is caught and its message
# printed, so the notebook keeps running under "Restart Kernel -> Run All".
try:
    d + c
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: operands could not be broadcast together with shapes (2,3) (2,) 

In [85]:
d + c.reshape([2,1])  # as a 2x1 column, c lines up with d's rows: 7 is added to row 0, 8 to row 1

array([[ 8,  9, 10],
       [12, 13, 14]])

In [88]:
# e is a 3x1 column and f a length-3 vector, both holding 0, 1, 2.
# Note: this rebinds e, which earlier in the notebook held the 3x3 matrix of ones.
e = np.arange(3)[:, np.newaxis]
f = np.arange(0, 3)
print(e, f, sep="\n")

[[0]
 [1]
 [2]]
[0 1 2]


In [92]:
print(e + f)  # e (3x1) and f (length 3) are each stretched to a common 3x3 shape, then added element-wise

[[0 1 2]
 [1 2 3]
 [2 3 4]]


# Boolean Operators in NumPy

In [93]:
# Integers 0 through 5, used by the comparison examples that follow.
x = np.arange(0, 6)
print(x)

[0 1 2 3 4 5]


In [94]:
x < 3  # element-wise comparison: one True/False per element of x

array([ True,  True,  True, False, False, False])

In [95]:
x >= 3  # True where the element is at least 3

array([False, False, False,  True,  True,  True])

In [96]:
x == 3  # True only at the position holding the value 3

array([False, False, False,  True, False, False])

In [99]:
# A boolean array can be used as a mask: indexing with it keeps only the
# elements where the mask is True.
print(x[x >= 3])

[3 4 5]


In [102]:
np.sum(x >= 3)  # True counts as 1 and False as 0, so the sum is the number of True values

3

In [103]:
np.mean(x >= 3)  # mean of a boolean array = fraction of elements that are True

0.5

In [104]:
# Integers 0..11 laid out row by row in a 3x4 matrix.
y = np.arange(12).reshape(3, 4)
print(y)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [109]:
print(y >= 6)  # the comparison is applied to every cell, giving a boolean matrix of the same shape

[[False False False False]
 [False False  True  True]
 [ True  True  True  True]]


In [106]:
np.sum(y >= 6)  # number of cells in the whole matrix that are at least 6

6

In [107]:
np.sum(y >= 6, axis = 0)  # axis=0: number of matching cells in each column

array([1, 1, 2, 2])

In [108]:
np.sum(y >= 6, axis = 1)  # axis=1: number of matching cells in each row

array([0, 2, 4])