In [10]:
# This section demonstrates NumPy's structured arrays and record arrays,
# which provide efficient storage for compound, heterogeneous data.
import numpy as np

# Let's say we have several categories of data on a number of people
# (name, age, weight) and we want to store these values for use in a Python program.
# It is possible to store them in three separate lists:
name = ['Alice', 'Bob', 'Cathy', 'Doug']
age = [25, 45, 37, 19]
weight = [55.0, 85.5, 68.0, 61.5]
# This is a bit clumsy, since nothing tells us that the three lists are related.
# It is more natural to use a single structure to store all of this data, which
# NumPy supports through structured arrays (arrays with a compound data type).
# Recall that in the previous section we created a simple array like this:
x = np.zeros(4, dtype=int)
# We can similarly create a structured array using a compound data type specification.
# The length is taken from the input lists so the container always matches the data.
data = np.zeros(len(name), dtype={'names':('name', 'age', 'weight'),
                                  'formats':('U10', 'i4', 'f8')})
print(data.dtype)

[('name', '<U10'), ('age', '<i4'), ('weight', '<f8')]


In [11]:
# Format codes used in the compound dtype:
#   U10 - Unicode string of maximum length 10
#   i4  - 4-byte (i.e., 32-bit) integer
#   f8  - 8-byte (i.e., 64-bit) float
# The container array starts out zero-filled; populate each field from the
# matching list of values.
data['name'], data['age'], data['weight'] = name, age, weight
print(data)

[(u'Alice', 25, 55. ) (u'Bob', 45, 85.5) (u'Cathy', 37, 68. )
 (u'Doug', 19, 61.5)]


In [12]:
# As we wanted, the data is now arranged together in one convenient block of memory.
# We can now refer to values either by index or by field name.
# Get all names:
data['name']

array([u'Alice', u'Bob', u'Cathy', u'Doug'], dtype='<U10')

In [13]:
# Get the first row of data (indexing with an integer selects one record):
data[0]

(u'Alice', 25, 55.)

In [14]:
# Get the name from the last row (index -1 selects the last record,
# then the field name selects one value from it):
data[-1]['name']

u'Doug'

In [15]:
# Boolean masking allows more sophisticated operations, such as filtering on age.
# Build a mask marking the rows where age is under 30, then pull the names
# out of just those rows:
is_under_30 = data['age'] < 30
data[is_under_30]['name']

array([u'Alice', u'Doug'], dtype='<U10')

In [16]:
# Structured data types can be specified in a number of ways.
# Earlier, we saw the dictionary method, which uses a 'names' key and a 'formats' key:
np.dtype({'names':('name', 'age', 'weight'),
          'formats':('U10', 'i4', 'f8')})

dtype([('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])

In [17]:
# For clarity, numerical types can be specified using Python types or NumPy dtypes instead.
# Note: np.float32 is a 4-byte float, so the 'weight' field here is not the same
# as the 8-byte 'f8' used in the dictionary specification above.
# NOTE(review): what (np.str_, 10) and int map to appears to depend on the
# Python version / platform - confirm before relying on the exact result.
np.dtype({'names':('name', 'age', 'weight'),
          'formats':((np.str_, 10), int, np.float32)})

dtype([('name', 'S10'), ('age', '<i4'), ('weight', '<f4')])

In [18]:
# A compound type can also be specified as a list of (field name, format) tuples:
np.dtype([('name', 'S10'), ('age', 'i4'), ('weight', 'f8')])

dtype([('name', 'S10'), ('age', '<i4'), ('weight', '<f8')])

In [19]:
# If the names of the fields do not matter to you, you can specify the types alone
# in a comma-separated string; the fields then get default names ('f0', 'f1', ...):
np.dtype('S10, i4, f8')

dtype([('f0', 'S10'), ('f1', '<i4'), ('f2', '<f8')])

In [None]:
# The shortened string format codes such as '<i4' or '<f8' may seem confusing, but they are built on simple principles.
# The first (optional) character is < or >, which means 'little endian' or 'big endian' respectively and specifies the byte-ordering convention.
# The next character specifies the type of data (characters, bytes, ints, floating points, etc.).
# The last character or characters give the size of the object in bytes (e.g. the 4 in 'i4' or the 8 in 'f8').

In [20]:
# NumPy also provides the np.recarray class, which is almost identical to the
# structured arrays described above, with one additional feature:
# fields can be accessed as attributes rather than as dictionary keys.
# Recall that we previously accessed the ages by writing:
data['age']

array([25, 45, 37, 19])

In [21]:
# If we view our data as a record array instead, we can access the same field
# with slightly fewer keystrokes (attribute access instead of a string key):
data_rec = data.view(np.recarray)
data_rec.age

array([25, 45, 37, 19])

In [23]:
# The downside of record arrays is that there is some extra overhead involved in
# accessing the fields, even when using the same dictionary-style syntax.
# Compare the three access paths: structured array by key, record array by key,
# and record array by attribute.
%timeit data['age']
%timeit data_rec['age']
%timeit data_rec.age

The slowest run took 66.07 times longer than the fastest. This could mean that an intermediate result is being cached.
10000000 loops, best of 3: 77.3 ns per loop
The slowest run took 5.97 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 2.48 µs per loop
100000 loops, best of 3: 4.05 µs per loop
