In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np

In [3]:
"""
this will include more internal details about the ndarray type and more advanced array manipulations and algorithms
"""
# --- ndarray object internals ---
# NOTE: the bare string literal below is an expression statement, so with
# ast_node_interactivity = "all" it is echoed as one of this cell's outputs.
"""
The NumPy ndarray provides a means to interpret a block of homogeneous data
(either contiguous or strided) as a multidimensional array object. The data type, or dtype, 
determines how the data is interpreted as being floating point, integer, boolean,
or any of the other types we’ve been looking at
Part of what makes ndarray flexible is that every array object is a strided view on a
block of data. You might wonder, for example, how the array view arr[::2, ::-1]
does not copy any data. The reason is that the ndarray is more than just a chunk of
memory and a dtype; it also has “striding” information that enables the array to move
through memory with varying step sizes. More precisely, the ndarray internally consists
of the following:
"""
# shape: a tuple holding the length of every axis
np.ones((10, 5)).shape
# build the 3-d float64 array once, then inspect it
obj = np.ones((3, 4, 5), dtype=np.float64)
# strides: number of bytes to step in memory to advance one element along each axis
obj.strides
# reverse along the first axis (basic slicing)
obj[::-1]

'\nthis will include more internal details about the ndarray type and more advanced array manipulations and algorithms\n'

'\nThe NumPy ndarray provides a means to interpret a block of homogeneous data\n(either contiguous or strided) as a multidimensional array object. The data type, or dtype, \ndetermines how the data is interpreted as being floating point, integer, boolean,\nor any of the other types we’ve been looking at\nPart of what makes ndarray flexible is that every array object is a strided view on a\nblock of data. You might wonder, for example, how the array view arr[::2, ::-1]\ndoes not copy any data. The reason is that the ndarray is more than just a chunk of\nmemory and a dtype; it also has “striding” information that enables the array to move\nthrough memory with varying step sizes. More precisely, the ndarray internally consists\nof the following:\n'

(10, 5)

(160, 40, 8)

array([[[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]],

       [[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]],

       [[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]]])

In [4]:
# --- NumPy dtype hierarchy ---
"""
dtypes have superclasses such as np.integer, np.floating
used in conjunction with np.issubdtype
"""
# one sample array per dtype family
ints = np.ones(10, np.uint16)
floats = np.ones(10, np.float32)
# does each array's dtype belong to the abstract family?
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)
# a scalar type's mro() lists all of its parent classes, most specific first
np.float64.mro()
np.float32.mro()

'\ndtypes have superclasses such as np.integer, np.floating\nused in conjunction with np.issubdtype\n'

True

True

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

[numpy.float32,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 object]

In [5]:
# --- advanced array manipulation: reshaping ---
arr = np.arange(8)
arr.reshape(4, 2)
arr.reshape(4, 2).reshape(2, 4)
arr.reshape(4, 2, order='C')  # C order = row major (the default)
arr.reshape(4, 2, order='F')  # Fortran order = column major
# a single dimension may be passed as -1; its length is then inferred from the data
arr = np.arange(15)
arr.reshape(5, -1)
# flattening / raveling is the reverse operation: back to one dimension
arr = np.arange(15).reshape(5, 3)
arr
arr.ravel()  # no copy is made when the result's values were already contiguous in the original array
arr.flatten()   # same result as ravel, but flatten always hands back a copy


array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

array([[0, 4],
       [1, 5],
       [2, 6],
       [3, 7]])

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [6]:
# --- C versus Fortran order when raveling ---
arr = np.arange(12).reshape(3, 4)
arr
arr.ravel(order='C')  # C (row major, the default): traverse higher dimensions first
arr.ravel(order='F')  # Fortran (column major): traverse lower dimensions first

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

In [7]:
# concatenating and splitting arrays
# numpy.concatenate takes a sequence (tuple, list) of arrays and joins them together in order along the input axis
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate([arr1, arr2], axis=0)
np.concatenate([arr1, arr2], axis=1)
# there are some convenience functions like vstack, hstack
np.vstack((arr1, arr2))
np.hstack((arr1, arr2))

# split
# Fixed seed so the random demo values are identical on every Restart & Run All
# (without it each run of this cell shows different numbers).
np.random.seed(12346)
arr = np.random.randn(5, 2)
arr
# [1, 3] are the row indices at which to cut: rows [0:1], [1:3] and [3:]
first, second, third = np.split(arr, [1, 3])
first
second
third

# stacking helpers: r_ and c_
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)
np.r_[arr1, arr2]  # stack row-wise
np.c_[np.r_[arr1, arr2], arr]  # append arr as an extra column


array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

array([[ 0.64030984,  0.12620815],
       [ 0.33217546, -0.73193776],
       [-0.41044585, -0.40043271],
       [-0.09112562,  0.50914895],
       [-0.20081111,  0.08192198]])

array([[0.64030984, 0.12620815]])

array([[ 0.33217546, -0.73193776],
       [-0.41044585, -0.40043271]])

array([[-0.09112562,  0.50914895],
       [-0.20081111,  0.08192198]])

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [-0.62193507,  1.47836399],
       [ 0.28735421, -0.28968613],
       [-0.98297258,  0.53028796]])

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [-0.62193507,  1.47836399,  3.        ],
       [ 0.28735421, -0.28968613,  4.        ],
       [-0.98297258,  0.53028796,  5.        ]])

In [8]:
# Repeating Elements: tile and repeat
# repeat replicates each element in an array some number of times, producing a larger array
arr = np.arange(3)
arr
arr.repeat(3)
# Fixed seed so the random demo values are identical on every Restart & Run All
# (without it each run of this cell shows different numbers).
np.random.seed(12346)
arr_2d = np.random.randn(2, 2)
arr_2d.repeat(2, axis=0)
arr_2d.repeat([2, 3], axis=0)  # pass an array of ints to repeat each row a different number of times

# tile is a shortcut for stacking copies of an array along an axis
arr_2d
np.tile(arr_2d, 2)
np.tile(arr_2d, (1, 2))  # same layout as above: 1 copy down, 2 copies across
np.tile(arr_2d, (3, 2))  # 3 copies down, 2 copies across

array([0, 1, 2])

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

array([[ 0.20982655, -0.22047142],
       [ 0.20982655, -0.22047142],
       [ 0.45692333,  0.18895794],
       [ 0.45692333,  0.18895794]])

array([[ 0.20982655, -0.22047142],
       [ 0.20982655, -0.22047142],
       [ 0.45692333,  0.18895794],
       [ 0.45692333,  0.18895794],
       [ 0.45692333,  0.18895794]])

array([[ 0.20982655, -0.22047142],
       [ 0.45692333,  0.18895794]])

array([[ 0.20982655, -0.22047142,  0.20982655, -0.22047142],
       [ 0.45692333,  0.18895794,  0.45692333,  0.18895794]])

array([[ 0.20982655, -0.22047142,  0.20982655, -0.22047142],
       [ 0.45692333,  0.18895794,  0.45692333,  0.18895794]])

array([[ 0.20982655, -0.22047142,  0.20982655, -0.22047142],
       [ 0.45692333,  0.18895794,  0.45692333,  0.18895794],
       [ 0.20982655, -0.22047142,  0.20982655, -0.22047142],
       [ 0.45692333,  0.18895794,  0.45692333,  0.18895794],
       [ 0.20982655, -0.22047142,  0.20982655, -0.22047142],
       [ 0.45692333,  0.18895794,  0.45692333,  0.18895794]])

In [9]:
# fancy indexing equivalents: take and put
arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr[inds]
arr.take(inds)  # same selection as arr[inds]
arr.put(inds, 23)  # in place: write 23 at every listed position
arr
arr.put(inds, [40, 41, 42, 43])  # in place: one value per listed position
arr

inds = [2, 0, 2, 1]
# Fixed seed so the random demo values are identical on every Restart & Run All
# (without it each run of this cell shows different numbers).
np.random.seed(12346)
arr = np.random.randn(2, 4)
arr
arr.take(inds, axis=1)  # pick columns 2, 0, 2, 1

array([700, 100, 200, 600])

array([700, 100, 200, 600])

array([  0,  23,  23, 300, 400, 500,  23,  23, 800, 900])

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

array([[-0.12035594, -0.41940418, -1.64780288, -0.32510427],
       [-0.50470904,  0.04540897, -0.22737202, -0.19368248]])

array([[-1.64780288, -0.12035594, -1.64780288, -0.41940418],
       [-0.22737202, -0.50470904, -0.22737202,  0.04540897]])

In [14]:
# broadcasting
# broadcasting describes how arithmetic works between arrays of different shapes
arr = np.arange(5)
arr
arr * 4  # the scalar 4 is applied to every element

# Fixed seed so the random demo values are identical on every Restart & Run All
# (without it each run of this cell shows different numbers).
np.random.seed(12346)
arr = np.random.randn(4, 3)
arr
arr.mean(0)
demeaned = arr - arr.mean(0)  # column means, shape (3,), line up with the trailing axis
demeaned
demeaned.mean(0)  # ~0 for every column, up to floating-point error
"""
Two arrays are compatible for broadcasting if for each trailing dimension (i.e., starting
from the end) the axis lengths match or if either of the lengths is 1. Broadcasting is
then performed over the missing or length 1 dimensions.
"""

# broadcasting over other axes
# reshape the row means to (4, 1) so the length-1 axis can be broadcast across columns
arr - arr.mean(1).reshape((4, 1))
# Without the reshape the row means have shape (4,), which does not match the
# trailing axis (length 3) of arr, so the subtraction fails. The failure is the
# point of the demo: catch it and show the message instead of leaving the cell
# in an error state that would stop a full notebook run.
try:
    arr - arr.mean(1)
except ValueError as exc:
    print("ValueError:", exc)


array([0, 1, 2, 3, 4])

array([ 0,  4,  8, 12, 16])

array([[-1.48795043,  0.37073387,  0.0020737 ],
       [ 0.85362738, -0.46068449, -0.29131907],
       [-1.41630578,  0.44582262,  0.10585292],
       [ 0.14578223, -1.02013819, -0.63176927]])

array([-0.47621165, -0.16606655, -0.20379043])

array([[-1.01173878,  0.53680042,  0.20586414],
       [ 1.32983903, -0.29461794, -0.08752864],
       [-0.94009413,  0.61188916,  0.30964335],
       [ 0.62199388, -0.85407164, -0.42797884]])

array([ 0.00000000e+00, -2.77555756e-17, -1.38777878e-17])

'\nTwo arrays are compatible for broadcasting if for each trailing dimension (i.e., starting\nfrom the end) the axis lengths match or if either of the lengths is 1. Broadcasting is\nthen performed over the missing or length 1 dimensions.\n'

array([[-1.11623614,  0.74244816,  0.37378799],
       [ 0.81975277, -0.49455909, -0.32519368],
       [-1.1280957 ,  0.7340327 ,  0.394063  ],
       [ 0.64782398, -0.51809645, -0.12972753]])

ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

In [16]:
# --- advanced ufunc usage ---
# ufunc.reduce collapses a single array into an aggregate value
arr = np.arange(10)
np.add.reduce(arr)
arr.sum()  # same total as np.add.reduce

np.random.seed(12346)
arr = np.random.randn(5, 5)
arr[::2].sort(axis=1)  # sort rows 0, 2 and 4 in place
arr[:, 1:] > arr[:, :-1]  # element-wise: is each value smaller than its right-hand neighbour?
np.logical_and.reduce(arr[:, 1:] > arr[:, :-1], axis=1)  # True where a whole row is increasing

# reduceat: local reduce over the slices that start at the given offsets
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])


45

45

array([[ True,  True,  True,  True],
       [False,  True, False, False],
       [ True,  True,  True,  True],
       [ True, False,  True,  True],
       [ True,  True,  True,  True]])

array([ True, False,  True, False,  True])

array([10, 18, 17], dtype=int32)

In [17]:
# --- writing new ufuncs in Python ---
# numpy.frompyfunc wraps a plain Python function, given how many inputs and outputs it has
def add_elements(x, y):
    """Return the sum of ``x`` and ``y``; used as the scalar kernel wrapped below."""
    total = x + y
    return total


# two inputs, one output
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))
# Wrappers built this way behave like ufuncs but are very slow: every element
# costs one Python function call, which is far slower than NumPy's C-based
# ufunc loops.

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)