In [1]:
# When faced with a large amount of data, the first step is to compute summary statistics
# e.g. mean, standard deviation, sum, product, median, min, max, etc.
# NumPy has fast, built-in aggregation functions for arrays
# Let's sum up all the values in an array using Python's built-in sum function
import numpy as np
L = np.random.random(100)  # 100 random values; no seed is set, so they differ on every run
sum(L)

50.84512138127019

In [2]:
# NumPy's sum function is called the same way here, and its result agrees with the
# built-in sum up to floating-point rounding in the last digits
np.sum(L)

50.8451213812702

In [3]:
# However, because NumPy executes the operation in compiled code, its version of the operation is computed much more quickly
# The two timings below compare the built-in sum with np.sum on the same array
big_array = np.random.rand(1000000)  # one million random values
%timeit sum(big_array)
%timeit np.sum(big_array)

10 loops, best of 3: 65.6 ms per loop
1000 loops, best of 3: 638 µs per loop


In [None]:
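# The built-in 'sum' function and the 'np.sum' function are NOT identical
# Their optional arguments have different meanings:
# e.g. the second positional argument of sum is a start value, whereas for np.sum it is the axis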

In [4]:
# Similarly, Python has built-in min and max functions
min(big_array), max(big_array)

(2.271358412464508e-07, 0.9999979537672631)

In [5]:
# NumPy also has these functions, and again they operate much more quickly
np.min(big_array), np.max(big_array)

(2.271358412464508e-07, 0.9999979537672631)

In [6]:
# Time the built-in min against NumPy's min on the same array
%timeit min(big_array)
%timeit np.min(big_array)

10 loops, best of 3: 41.5 ms per loop
1000 loops, best of 3: 298 µs per loop


In [8]:
# For many NumPy aggregates, a shorter syntax is to use the methods of the array object itself
# The three values are returned as a tuple (the cell's last expression) instead of being
# passed to print, so the cell displays the same tuple under both Python 2 and Python 3
big_array.min(), big_array.max(), big_array.sum()

(2.271358412464508e-07, 0.9999979537672631, 499975.5353576838)


In [9]:
# A common type of aggregation operation is an aggregate along a row or column
# Say we have some data stored in a 2D array
M = np.random.random((3, 4))  # 3 rows x 4 columns of random values
print(M)

[[0.29596866 0.89386086 0.9352726  0.44807853]
 [0.1573482  0.92092396 0.04952993 0.40528304]
 [0.53333441 0.38989423 0.52967545 0.6599589 ]]


In [10]:
# By default, each NumPy aggregation function returns the aggregate over the entire array
M.sum()

6.219128783765727

In [12]:
# Aggregation functions take an additional argument specifying the axis along which the aggregate is computed
# E.g. we can find the minimum value within each column by specifying axis=0
M.min(axis=0)  # one result per column (4 values for the 3x4 array)

array([0.1573482 , 0.38989423, 0.04952993, 0.40528304])

In [13]:
# To find the minimum value within each row, we use axis=1
M.min(axis=1)  # one result per row (3 values for the 3x4 array)

array([0.29596866, 0.04952993, 0.38989423])

In [16]:
# Aggregates in NumPy are extremely useful for summarizing a set of values
# As an example, consider the heights of US presidents
# The data is available in the file president_heights.csv
# We will use the Pandas package to read the file and extract this information
# NOTE(review): the recorded run of this cell failed with an IOError because the
# relative path below did not exist -- presumably it is resolved against the
# kernel's working directory; confirm where the CSV lives before re-running
import pandas as pd
data = pd.read_csv('PythonDataScienceHandbook/notebooks/data/president_heights.csv')
heights = np.array(data['height(cm)'])  # extract the 'height(cm)' column as a NumPy array
print(heights)

IOError: File PythonDataScienceHandbook/notebooks/data/president_heights.csv does not exist