Start: NumPy Basics, from the book Python for Data Analysis by Wes McKinney

In [1]:
import numpy as np

Topics for Data Analysis:
- Fast vectorized array operations for data munging and cleaning, subsetting and
filtering, transformation, and any other kinds of computations
- Common array algorithms like sorting, unique, and set operations
- Efficient descriptive statistics and aggregating/summarizing data
- Data alignment and relational data manipulations for merging and joining
together heterogeneous datasets
- Expressing conditional logic as array expressions instead of loops with if-elif-else
branches
- Group-wise data manipulations (aggregation, transformation, function application)

4.1 The NumPy ndarray: A Multidimensional Array Object

In [3]:
# Draw six uniform samples from [0, 1) and lay them out as 2 rows x 3 columns
data = np.random.rand(6).reshape(2, 3)
print(data)

[[0.53695294 0.62550069 0.2824244 ]
 [0.3290826  0.45521543 0.51167884]]


In [6]:
# Arithmetic on an ndarray is applied element by element, no explicit loop:
# `mul` scales every entry by 10, `add` sums the array with itself.
mul, add = 10 * data, data + data
print(mul)
print(add)

[[5.36952936 6.25500694 2.82424403]
 [3.29082604 4.55215429 5.11678844]]
[[1.07390587 1.25100139 0.56484881]
 [0.65816521 0.91043086 1.02335769]]


In [11]:
# Inspect the array's dimensions (size along each axis) ...
shape = data.shape
print(shape)

# ... and its element type. The result is bound to `dtype` rather than `type`
# so that the built-in type() is not shadowed for every later cell in the kernel.
dtype = data.dtype
print(dtype)

(2, 3)
float64


In [14]:
# A flat Python list becomes a one-dimensional ndarray; because the list mixes
# ints and floats, every element is stored as a float.
data1 = np.array([1, 1.2, 3, 5.5, -12, -7.8])
print(data1)

[  1.    1.2   3.    5.5 -12.   -7.8]


In [17]:
# A list of equal-length lists becomes a two-dimensional ndarray
# (one inner list per row).
data2 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
print(data2)

# ndim is the number of axes; shape is the size along each axis
print(f"Dimension of the array {data2}: {data2.ndim} ")
print(f"Shape of the array {data2}: {data2.shape} ")

[[1 2 3 4]
 [5 6 7 8]]
Dimension of the array [[1 2 3 4]
 [5 6 7 8]]: 2 
Shape of the array [[1 2 3 4]
 [5 6 7 8]]: (2, 4) 


In [18]:
# np.array infers a dtype from the values it is given; report it for both arrays
for typed_array in (data1, data2):
    print(f" Type of {typed_array} is {typed_array.dtype}")

 Type of [  1.    1.2   3.    5.5 -12.   -7.8] is float64
 Type of [[1 2 3 4]
 [5 6 7 8]] is int64


In [25]:
# An integer shape yields a 1-D array: ten elements, all 0.0
print(np.zeros(shape=10))
# A tuple shape yields an N-D array: ten rows by two columns, all 1.0
print(np.ones(shape=(10, 2)))



[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[[1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]]


In [32]:
# Allocate a 2 x 3 x 2 array without initializing it: the displayed values are
# whatever happened to be in memory and must not be relied upon.
np.empty(shape=(2, 3, 2))

array([[[6.95335390e-310, 1.07686503e-311],
        [0.00000000e+000, 1.07686503e-311],
        [1.89146896e-307, 1.07686503e-311]],

       [[9.82205330e+252, 0.00000000e+000],
        [1.07686503e-311, 1.07686503e-311],
        [1.07686503e-311, 0.00000000e+000]]])

In [31]:
# ndarray counterpart of Python's range(): start at 5, stop before 100,
# step by 5, and store the values as floats.
np.arange(5, 100, 5, dtype=float)

array([ 5., 10., 15., 20., 25., 30., 35., 40., 45., 50., 55., 60., 65.,
       70., 75., 80., 85., 90., 95.])

In [34]:
# Build a 5 x 2 array in which every element is the given fill value (12)
np.full(shape=(5, 2), fill_value=12)

array([[12, 12],
       [12, 12],
       [12, 12],
       [12, 12],
       [12, 12]])

In [41]:
# Template array: 2 x 3 of float zeros
arrray_zeros = np.zeros(shape=(2, 3))
# full_like reuses the template's shape and dtype and fills every element with
# the given value, so the integer 2 is stored as 2.0 here.
filled_array_twos = np.full_like(arrray_zeros, fill_value=2)
print(arrray_zeros.shape, filled_array_twos.shape)
print(arrray_zeros)
print(filled_array_twos)

(2, 3) (2, 3)
[[0. 0. 0.]
 [0. 0. 0.]]
[[2. 2. 2.]
 [2. 2. 2.]]


In [46]:
# Two ways to build a 10 x 10 identity matrix (ones on the main diagonal):
# np.eye takes the row and column counts, np.identity takes a single size.
identity_matrix1 = np.eye(N=10, M=10)
identity_matrix2 = np.identity(n=10)
print(identity_matrix1)
print(identity_matrix2)


[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


Data Types for ndarrays

In [54]:
# Request an explicit element type at construction time: the integer inputs are
# stored as complex128, which is why each printed value carries a "+0.j" part.
# (A bare `arr1.dtype` in the middle of a cell displays nothing, so it is omitted.)
arr1 = np.array([1, 2, 3], dtype=np.complex128)
print(arr1)

[1.+0.j 2.+0.j 3.+0.j]


In [50]:
# Store the values as 32-bit signed integers; the dtype may be given by name.
arr2 = np.array([1, 2, 3], dtype="int32")
# Bare last expression: the notebook echoes the dtype as the cell's output.
arr2.dtype

dtype('int32')

List of types supported by NumPy:
- int8 (i1), uint8 (u1): Signed and unsigned 8-bit (1 byte) integer types
- int16 (i2), uint16 (u2): Signed and unsigned 16-bit integer types
- int32 (i4), uint32 (u4): Signed and unsigned 32-bit integer types
- int64 (i8), uint64 (u8): Signed and unsigned 64-bit integer types
- float16 (f2): Half-precision floating point
- float32 (f4 or f): Standard single-precision floating point; compatible with C float
- float64 (f8 or d): Standard double-precision floating point; compatible with C double and Python float object
- float128 (f16 or g): Extended-precision floating point
- complex64 (c8), complex128 (c16), complex256 (c32): Complex numbers represented by two 32-, 64-, or 128-bit floats, respectively
- bool (?): Boolean type storing True and False values
- object (O): Python object type; a value can be any Python object
- string_ (S): Fixed-length ASCII string type (1 byte per character), named bytes_ since NumPy 2.0; for example, to create a string dtype with length 10, use 'S10'
- unicode_ (U): Fixed-length Unicode type (number of bytes platform specific), named str_ since NumPy 2.0; same
specification semantics as string_ (e.g., 'U10')

In [58]:
# astype() converts to the requested dtype and returns a NEW array;
# the source array keeps its original integer dtype.
arr = np.array([1, 2, 3, 4, 5])
float_arr = arr.astype(np.float64)
print(f"First type of the array {arr.dtype}")
print(f"Casted type of the array {float_arr.dtype}")

First type of the array int64
Casted type of the array float64


In [61]:
# Casting floats to an integer dtype is lossy: the fractional part is dropped
# (truncation toward zero, e.g. 3.7 -> 3 and -2.6 -> -2), not rounded.
arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])
int_arr = arr.astype(np.int32)
print(f"Information of the array with floating points {arr}")
print(f"Information of the array with integers {int_arr}")

Information of the array with floating points [ 3.7 -1.2 -2.6  0.5 12.9 10.1]
Information of the array with integers [ 3 -1 -2  0 12 10]


In [66]:
# Strings that spell numbers can be cast straight to a numeric dtype.
# The byte-string dtype is spelled np.bytes_ (np.string_ was removed in NumPy 2.0).
# Note that the name is rebound: it ends up holding the float64 result.
numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.bytes_).astype(float)
print(f"Casted string to numerical floating point values {numeric_strings.dtype}")

Casted string to numerical floating point values float64


In [68]:
# dtypes can be given by their short type code: 'u4' is an unsigned 4-byte
# integer (uint32). np.empty leaves the eight values uninitialized, so the
# printed numbers are arbitrary.
empty_uint32 = np.empty(shape=8, dtype="u4")
print(empty_uint32, empty_uint32.dtype)



[4128860 6029375 3801155 5570652 6619251 7536754 5374044 7209071] uint32


Arithmetic with NumPy arrays

In [71]:
# Vectorization: an arithmetic operator acts on every element at once,
# so no Python loop is needed to square the whole 2 x 3 array.
arr = np.arange(1., 7.).reshape(2, 3)

arr ** 2

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

In [75]:
# Comparing two same-shaped arrays creates a new boolean array holding the
# result of the comparison for each pair of elements.
arr1 = np.array([[1., 2., 3.],
                 [4., 5., 6.]])
arr2 = np.array([[0., 4., 1.],
                 [7., 2., 12.]])

print(arr2 < arr1)

[[ True False  True]
 [False  True False]]


Boolean indexing

In [78]:
# Seven labels, and a 7 x 4 block of standard-normal samples
# (one row of `data` per entry of `names`).
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
data = np.random.randn(7, 4)
# Show each array together with its dtype and shape
for array in (names, data):
    print(array, array.dtype, array.shape)


['Bob' 'Joe' 'Will' 'Bob' 'Will' 'Joe' 'Joe'] <U4 (7,)
[[-0.40764714  0.64944915  0.80999119  1.35200816]
 [ 0.1371835   2.44639993 -1.33397374  2.49283723]
 [-0.63690989  0.0305682  -0.57360816  0.44265198]
 [ 0.97810634  0.13622451  1.45709524 -1.45656612]
 [ 0.12199499  1.09693251 -0.7735349  -0.46272407]
 [-1.76299447  0.4496139  -0.43320698 -0.34583841]
 [ 0.27405261 -1.35493529 -0.12472201 -0.37040436]] float64 (7, 4)


In [86]:
# Comparing the label array with a string gives one boolean per label
is_bob = names == "Bob"
print(is_bob)
# Indexing with that boolean array keeps only the rows where it is True
print(data[is_bob])
# A boolean row selector can be combined with a column slice (columns 2 onward)
print(data[is_bob, 2:])

[ True False False  True False False False]
[[-0.40764714  0.64944915  0.80999119  1.35200816]
 [ 0.97810634  0.13622451  1.45709524 -1.45656612]]
[[ 0.80999119  1.35200816]
 [ 1.45709524 -1.45656612]]


In [87]:
# ~ flips a boolean array, so this keeps every row whose label is NOT 'Bob'
cond = (names == 'Bob')
print(data[~cond, :])

[[ 0.1371835   2.44639993 -1.33397374  2.49283723]
 [-0.63690989  0.0305682  -0.57360816  0.44265198]
 [ 0.12199499  1.09693251 -0.7735349  -0.46272407]
 [-1.76299447  0.4496139  -0.43320698 -0.34583841]
 [ 0.27405261 -1.35493529 -0.12472201 -0.37040436]]


In [89]:
# Combine two conditions into one mask, then invert it to keep the rows that
# match neither name.
# Python's `and` / `or` keywords do not work element-wise on boolean arrays;
# use & (and) and | (or) instead.
is_bob, is_will = names == 'Bob', names == 'Will'
mask = is_bob | is_will
print(data[~mask])

[[ 0.1371835   2.44639993 -1.33397374  2.49283723]
 [-1.76299447  0.4496139  -0.43320698 -0.34583841]
 [ 0.27405261 -1.35493529 -0.12472201 -0.37040436]]


In [99]:
# Boolean arrays also work on the left-hand side of an assignment:
# every element flagged as negative is overwritten with 0, in place.
data = np.random.randn(7, 4)
is_negative = data < 0
data[is_negative] = 0
print(data)


[[0.29604812 0.         0.10107219 0.        ]
 [0.16545129 1.04071659 0.99937298 0.16557564]
 [0.         1.84016877 0.         0.        ]
 [0.55462928 0.         0.         0.        ]
 [0.         1.84531366 2.0720518  0.        ]
 [0.81644163 0.         0.92927285 0.        ]
 [0.58252577 1.44615643 0.68798249 0.        ]]


Fancy indexing\
- Definition: a term adopted by NumPy to describe indexing using integer arrays

In [103]:
# Build an 8 x 4 integer array in which every row holds its own row number.
# Iterating a 2-D array yields views of its rows, so writing through `row`
# fills `arr` itself.
arr = np.empty((8, 4), dtype=int)
for i, row in enumerate(arr):
    row[:] = i

print(arr)

[[0 0 0 0]
 [1 1 1 1]
 [2 2 2 2]
 [3 3 3 3]
 [4 4 4 4]
 [5 5 5 5]
 [6 6 6 6]
 [7 7 7 7]]


In [106]:
# Indexing with a list (or ndarray) of integers picks whole rows,
# returned in exactly the order the indices are listed.
rows_in_order = [4, 3, 0, 6]
print(arr[rows_in_order])

# Negative indices count back from the last row. With 8 rows:
# -3 -> row 8-3=5, -5 -> row 8-5=3, -7 -> row 8-7=1
rows_from_end = [-3, -5, -7]
arr[rows_from_end]

[[4 4 4 4]
 [3 3 3 3]
 [0 0 0 0]
 [6 6 6 6]]


array([[5, 5, 5, 5],
       [3, 3, 3, 3],
       [1, 1, 1, 1]])

In [109]:
# Passing one index array per axis does something different from row selection:
# the arrays are paired up element by element, and the result is a
# one-dimensional array holding the value at each (row, column) pair.
arr = np.arange(32).reshape(8, 4)
print(arr)

rows, cols = [1, 5, 7, 2], [0, 3, 1, 2]
select_arr = arr[rows, cols]
# (row, col) pairs (1,0), (5,3), (7,1), (2,2) -> values [4, 23, 29, 10]
print(select_arr)



[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]
 [24 25 26 27]
 [28 29 30 31]]
[ 4 23 29 10]


Transposing Arrays and Swapping Axes

In [115]:
# 3 x 3 array holding 0..8 in row-major order
arr = np.arange(9).reshape(3, 3)
# Two equivalent spellings of the transpose (rows become columns):
# the np.transpose function and the .T attribute.
arr_t1 = np.transpose(arr)
arr_t2 = arr.T

print(arr)
print(arr_t1)
print(arr_t2)

[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[0 3 6]
 [1 4 7]
 [2 5 8]]
[[0 3 6]
 [1 4 7]
 [2 5 8]]


In [122]:
# swapaxes exchanges two axes of the array: swapping axes 1 and 2 turns
# the (2, 2, 4) array into a (2, 4, 2) one.
arr = np.arange(16).reshape(2, 2, 4)
print(arr)
swap_ax = arr.swapaxes(2, 1)
print(swap_ax)

[[[ 0  1  2  3]
  [ 4  5  6  7]]

 [[ 8  9 10 11]
  [12 13 14 15]]]
[[[ 0  4]
  [ 1  5]
  [ 2  6]
  [ 3  7]]

 [[ 8 12]
  [ 9 13]
  [10 14]
  [11 15]]]
