In [31]:
import numpy as np

### 1 Aggregation
---
- in this section we discuss aggregation and a number of aggregation functions in numpy

#### 1.1 Aggregation Functions

In [32]:
# demo matrix: the integers 0..11 laid out row by row as 3 rows x 4 columns
x = np.arange(12).reshape((3, 4))
x

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [33]:
# with no axis argument, np.sum adds up every element of the array and returns a single scalar
np.sum(x)

np.int64(66)

In [34]:
# we can also sum along a specified axis
np.sum(x, axis=0)
    # axis=0 sums down each column, so the 3 rows are aggregated into one
    # result: one total per column (4 values for the 3x4 array above)

array([12, 15, 18, 21])

In [35]:
# we can also aggregate the columns instead
# axis=1 sums across each row, so the 4 columns are aggregated into one
# note the result is a 1-D array holding one total per row, not a 2-D column
np.sum(x, axis=1)

array([ 6, 22, 38])

- by default, an aggregation drops the dimension it was performed along (the one that would otherwise be left with size 1); pass `keepdims=True` to keep it as a size-1 dimension instead

#### 1.2 Other Aggregations

In [36]:
# a different way to use numpy's sum: as a method on the array itself
# keepdims=True keeps the aggregated axis as a size-1 dimension, so the result
# is a (3, 1) column without having to reshape it by hand
x.sum(axis=1, keepdims=True)

array([[ 6],
       [22],
       [38]])

In [37]:
# with no axis argument, max returns the single largest value in the whole array
x.max()

np.int64(11)

In [38]:
# max along axis=1: the columns are aggregated, leaving the max of each row
# keepdims=True keeps the collapsed axis as size 1, so the result prints as a single column
print(f"{x}\n->\n{x.max(axis=1, keepdims=True)}")

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[ 3]
 [ 7]
 [11]]


In [39]:
# min along axis=0 (dimension 0): the rows are collapsed (aggregated), leaving the min of each column
# keepdims=True keeps the collapsed axis as size 1, so the result prints as a single row
print(f"{x}\n->\n{x.min(axis=0, keepdims=True)}")

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[0 1 2 3]]


In [40]:
# product of each row (axis=1 aggregates the columns)
# keepdims=True makes the dimension collapse more obvious:
#   - every dimension of the input is retained
#   - the aggregated dimension simply shrinks to size 1
print(f"{x}\n->\n{x.prod(axis=1, keepdims=True)}")

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[   0]
 [ 840]
 [7920]]


In [41]:
x.any() # collapses the entire ndarray down into a single value
        # returns True if at least one element is truthy (nonzero / not False)

np.True_

In [42]:
# any along axis=1 collapses all columns into a single column:
# each entry is True if at least one element of the corresponding row is nonzero
# keepdims=True keeps the collapsed axis as size 1, so the result prints as a column
print(f"{x}\n->\n{x.any(axis=1, keepdims=True)}")

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[ True]
 [ True]
 [ True]]


In [43]:
x.all() # returns True iff every element in the array is truthy (nonzero); x contains a 0, so this is False

np.False_

In [44]:
# all along axis=1 collapses all columns into a single column:
# each entry is True only if every element of the corresponding row is nonzero
# keepdims=True keeps the collapsed axis as size 1, so the result prints as a column
print(f"{x}\n->\n{x.all(axis=1, keepdims=True)}")


[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[False]
 [ True]
 [ True]]


In [45]:
# all along axis=0 collapses all rows into a single row:
# each entry is True only if every element of the corresponding column is nonzero
# keepdims=True keeps the collapsed axis as size 1, so the result prints as a row
print(f"{x}\n->\n{x.all(axis=0, keepdims=True)}")

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[False  True  True  True]]


#### 1.3 Statistical metrics as aggregations

In [46]:
# same idea as above operations, these aggregations are just core statistical metrics
# keepdims=True keeps the aggregated axis as size 1 so each result lines up with the input
print(f"{x}\n->\n{x.mean(axis=0, keepdims=True)}\n") # average of each column
print(f"{x}\n->\n{np.median(x, axis=1, keepdims=True)}\n") # median of each row
# 25th, 50th and 75th percentile of each row
# the requested percentiles run along the FIRST axis of the output:
# output[i, j] is the i-th requested percentile of input row j
print(f"{x}\n->\n{np.percentile(x, [25, 50, 75], axis=1)}\n")
print(f"{x}\n->\n{np.std(x, axis=1, keepdims=True)}\n") # standard deviation of each row (population form, ddof=0 by default)
print(f"{x}\n->\n{np.var(x, axis=1, keepdims=True)}\n") # variance of each row (population form, ddof=0 by default)


[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[4. 5. 6. 7.]]

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[1.5]
 [5.5]
 [9.5]]

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[ 0.75  4.75  8.75]
 [ 1.5   5.5   9.5 ]
 [ 2.25  6.25 10.25]]

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[1.11803399]
 [1.11803399]
 [1.11803399]]

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
->
[[1.25]
 [1.25]
 [1.25]]



#### 1.4 Argmin and Argmax

In [47]:
# 3x4 array of uniform draws from [0, 1), rounded to 2 decimals for readability
# NOTE: the generator is not seeded here, so the values change on every run
x = np.random.uniform(low=0, high=1, size=(3, 4)).round(decimals=2)
x

array([[0.9 , 0.22, 0.02, 0.12],
       [0.2 , 0.33, 0.41, 0.94],
       [0.14, 0.8 , 0.65, 0.88]])

In [48]:
# we can return the index of the minimum or maximum value
print(x.argmin())
print(x.argmax())

# when argmin or argmax is called without an axis, numpy treats the array as flattened and returns
# the index of the min/max value within that flattened array
# the original coordinates can be recovered using np.unravel_index(flat_index, original_shape)

print(f"index of max element in flattened array = {x.argmax()}")
print(f"max element coordinates in unflattened array = {np.unravel_index(x.argmax(), x.shape)}")
 

2
7
index of max element in flattened array = 7
max element coordinates in unflattened array = (np.int64(1), np.int64(3))


In [49]:
# we can call argmin or argmax along a specific dimension
# axis=1 returns, for each row, the column index of that row's min element
# keepdims=True keeps the collapsed axis as size 1, so the result prints as a column
print(f"{x}\n->\n{x.argmin(axis=1, keepdims=True)}\n")


[[0.9  0.22 0.02 0.12]
 [0.2  0.33 0.41 0.94]
 [0.14 0.8  0.65 0.88]]
->
[[2]
 [0]
 [0]]



### 2 Selection using boolean arrays

To illustrate the concepts of boolean arrays and how to use them for selection, let’s consider an example.

Suppose we record the performance of 5 students across three different subjects:

| Index | Name     | Math | CS | Biology |
|-------|----------|------|----|---------|
| 0     | Jack     | 90   | 80 | 75      |
| 1     | Jill     | 93   | 95 | 87      |
| 2     | Joe      | 67   | 98 | 88      |
| 3     | Jason    | 77   | 89 | 80      |
| 4     | Jennifer | 93   | 97 | 95      |

In [50]:
# one row per student (same order as `names`), one column per subject: math, CS, biology
grades = np.array([
    [90, 80, 75],  # Jack
    [93, 95, 87],  # Jill
    [67, 98, 88],  # Joe
    [77, 89, 80],  # Jason
    [93, 97, 95],  # Jennifer
])

# student names; names[i] belongs to row i of `grades`
names = np.array(['Jack', 'Jill', 'Joe', 'Jason', 'Jennifer'])

- by applying logical operators and formulating a predicate involving numpy arrays we can derive a boolean mask (boolean array)

    - == equality

    - <, >, <=, >=

    - np.logical_not

    - & and |

- such a mask can be used to index an array and extract entries corresponding to the true values

In [51]:
# here are all the math grades: column 0 of the grades array, one entry per student
grades[:, 0]

array([90, 93, 67, 77, 93])

In [52]:
# this will generate a boolean mask for all the people who received a grade higher than 90 in math
# reshape((-1, 1)) displays it as a column and lets numpy infer the row count,
# so nothing breaks if the number of students changes
mask = (grades[:,0] > 90).reshape((-1, 1))
mask

array([[False],
       [ True],
       [False],
       [False],
       [ True]])

In [53]:
# the boolean mask can actually be used as an index to extract student names
    # this assumes the name and grade entries are indexed consistently
# mask[:,0] takes the single column of the 2-D mask as a 1-D array so it can index the 1-D names array
# reshape((-1,1)) shows the matches as a column without hard-coding how many there are

names[mask[:,0]].reshape((-1,1))

array([['Jill'],
       ['Jennifer']], dtype='<U8')

In [54]:
# mask for A+ (here: a grade of 90 or above) in both math and cs
# & combines the two per-subject masks element-wise; each comparison needs its own
# parentheses because & binds more tightly than >=
mask = ((grades[:,0] >= 90) & (grades[:,1] >= 90))

In [55]:
# extracting the names of said students
# reshape((-1,1)) shows them as a column without hard-coding how many students matched
names[mask].reshape((-1,1))

array([['Jill'],
       ['Jennifer']], dtype='<U8')

In [56]:
# getting the names of all students that got >= 90 in math and CS but not biology
# np.logical_not inverts the biology mask, so the last term selects biology grades below 90
mask = ((grades[:,0] >= 90) & (grades[:,1] >= 90) & np.logical_not(grades[:,2] >= 90))
names[mask]

array(['Jill'], dtype='<U8')

In [57]:
# here is a more readable approach: one condition per line
# getting all the kids that got less than 70 in any course
# the three per-subject masks are OR-ed together element-wise with |
# (for this 3-column array an equivalent one-liner is names[(grades < 70).any(axis=1)])
names[
    (grades[:,0]<70) |
    (grades[:,1]<70) |
    (grades[:,2]<70)
]

array(['Joe'], dtype='<U8')

In [58]:
# now we can get all of this student's grades using a mask on the grades array
    # the mask has to be applied on the ROWS of the grades array
    # the result has the same number of dimensions as the original (2-D), here with a single row
    # to understand why, consider what would happen if several students matched:
    # there would need to be multiple rows, i.e. a 2-D array

# here we squeeze to remove the unnecessary row dimension
# naming axis=0 drops only that axis, and makes numpy raise an error (instead of silently
# keeping a 2-D result) if the name does not match exactly one student
joes_grades = grades[names == 'Joe',:].squeeze(axis=0)

print(f"Joe recieved a {joes_grades[0]} in math, {joes_grades[1]} in CS, and {joes_grades[2]} in biology")

Joe recieved a 67 in math, 98 in CS, and 88 in biology


In [62]:
# without squeeze, the boolean row mask yields a 2-D result: 1 matching row x 3 subjects
grades[names == 'Joe',:].shape

(1, 3)