In [1]:
import numpy as np

### 1 Aggregation
---
- in this section we discuss aggregation and a number of aggregation functions in numpy

#### 1.1 Aggregation Functions

In [4]:
x = np.arange(12).reshape(3,4)
x

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [5]:
# np.sum will simply sum all values in the array
np.sum(x)

np.int64(66)

In [6]:
# we can also sum along a specified axis
np.sum(x, axis=0)
    # axis = 0 means we sum every element in each column leaving us with a single row at the end
    # axis = 0 means we are aggregating the rows into 1

array([12, 15, 18, 21])

In [7]:
# we can also aggregate the columns into a single column
# if we use axis = 1, we sum each element in every row leaving a single column at the end
np.sum(x, axis=1)

array([ 6, 22, 38])

- aggregations will always drop the resulting dimension of size=1 (the dimension along which the aggregation was performed) 

#### 1.2 Other Aggregations

In [8]:
#### a different way to use numpy sum function
x.sum(axis=1).reshape((x.shape[0],1))

array([[ 6],
       [22],
       [38]])

In [9]:
# returns the max of the array
x.max()

np.int64(11)

In [13]:
# returns the maxes along the aggregated dimension
print(f"{x}\n->\n{x.max(axis=1).reshape((x.shape[0],1))}") # here we aggregate the columns into a single column containing the max for each row

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[ 3]
 [ 7]
 [11]]


In [16]:
print(f"{x}\n->\n{x.min(axis=0).reshape((1,x.shape[1]))}") # notice how the rows have been collapsed (agregated)
                                                           # this is because we aggregated along axis=0 (dimension 0)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[0 1 2 3]]


In [17]:
print(f"{x}\n->\n{x.prod(axis=1).reshape((x.shape[0],1))}") # reshaping is to make the dimension collapse more obvious
                                                            # all dimenions retain their original size
                                                            # aggregation dimension reshaped to size 1

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[   0]
 [ 840]
 [7920]]


In [22]:
x.any() # collapses the entire ndarray down into a single value
        # returns true if there is even a single nonzero(non boolean false) value

np.True_

In [23]:
print(f"{x}\n->\n{x.any(axis=1).reshape((x.shape[0],1))}") # collapses all columns into a single column
                                                           # there is a true value if any element in the previous row was nonzero

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[ True]
 [ True]
 [ True]]


In [24]:
x.all() # returns true iff every element in the array is nonzero

np.False_

In [25]:
print(f"{x}\n->\n{x.all(axis=0).reshape((1,x.shape[1]))}") # collapses all rows into a single row
                                                           # true value only if all entries in the column were nonzero

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[False  True  True  True]]


#### 1.3 Statistical metrics as aggregations

In [41]:
# same idea as above operations, these aggregations are just core statistical metrics
print(f"{x}\n->\n{x.mean(axis=0).reshape((1,x.shape[1]))}\n") # average of each column
print(f"{x}\n->\n{np.median(x,axis=1).reshape((x.shape[0], 1))}\n") # median of each row (middle value along the columns)
print(f"{x}\n->\n{np.percentile(x,[25,50,75],axis=1)}\n") # finds the 25th, 50th and 75th percentile values along each row
                                                          # percentiles appear along columns of output array by default
print(f"{x}\n->\n{np.std(x,axis=1).reshape((x.shape[0], 1))}\n") # find the standard deviation for each row
print(f"{x}\n->\n{np.var(x,axis=1).reshape((x.shape[0], 1))}\n") # find the variance for each row


[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[4. 5. 6. 7.]]

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[1.5]
 [5.5]
 [9.5]]

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[ 0.75  4.75  8.75]
 [ 1.5   5.5   9.5 ]
 [ 2.25  6.25 10.25]]

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[1.11803399]
 [1.11803399]
 [1.11803399]]

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[1.25]
 [1.25]
 [1.25]]



#### 1.4 Argmin and Argmax

In [42]:
x = np.random.uniform(0, 1, (3,4)).round(2)
x

array([[0.53, 0.78, 0.85, 0.27],
       [0.71, 0.1 , 0.7 , 0.29],
       [0.97, 0.84, 0.8 , 0.77]])

In [45]:
# we can return in the index of the minimum or maximum value
print(x.argmin())
print(x.argmax())

# when calling argmin or argmax on the entire array, python will internally flatten the array and return the index of the
# max/min value of the flattened array
# the original index can be returned using np.unravel(max/minindex, originalshape)

print(f"index of max element in flattened array = {x.argmax()}")
print(f"max element coordinates in unflattened array = {np.unravel_index(x.argmax(), x.shape)}")
 

5
8
index of max element in flattened array = 8
max element coordinates in unflattened array = (np.int64(2), np.int64(0))


In [46]:
# we can call argmin or argmax along a specific dimension
print(f"{x}\n->\n{x.argmin(axis=1).reshape((x.shape[0], 1))}\n") # returns the index of the min element for each row


[[0.53 0.78 0.85 0.27]
 [0.71 0.1  0.7  0.29]
 [0.97 0.84 0.8  0.77]]
->
[[3]
 [1]
 [3]]

