# NumPy Basics: Arrays and Vectorized Computation

In [53]:
import numpy as np
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

In [54]:
import numpy as np
my_arr = np.arange(1000000)
my_list = list(range(1000000))

In [55]:
%time for _ in range(10): my_arr2 = my_arr * 2
%time for _ in range(10): my_list2 = [x * 2 for x in my_list]

CPU times: user 16.1 ms, sys: 16.9 ms, total: 33 ms
Wall time: 45 ms
CPU times: user 293 ms, sys: 77.8 ms, total: 371 ms
Wall time: 381 ms


## The NumPy ndarray: A Multidimensional Array Object
The key feature of NumPy is its N-dimensional array object, or `ndarray` 
- a fast, flexible container for large datasets in Python
- similar syntax to the equivalent operations between scalar elements

In [56]:
import numpy as np
# Generate some random data
data = np.random.randn(2, 3)
data

array([[-0.2047,  0.4789, -0.5194],
       [-0.5557,  1.9658,  1.3934]])

In [57]:
data * 10
data + data

array([[-0.4094,  0.9579, -1.0389],
       [-1.1115,  3.9316,  2.7868]])

An ndarray is a generic multidimensional container for homogeneous data
- all of the elements must be the same type
- every array has a shape
    - a tuple indicating the size of each dimension and a dtype, an object describing the data type of the array

In [58]:
data.shape, data.dtype

((2, 3), dtype('float64'))

With the `list` would be much different

In [59]:
data_list = data.tolist()
data_list

[[-0.20470765948471295, 0.47894333805754824, -0.5194387150567381],
 [-0.55573030434749, 1.9657805725027142, 1.3934058329729904]]

In [60]:
# replicate
data_list * 10

[[-0.20470765948471295, 0.47894333805754824, -0.5194387150567381],
 [-0.55573030434749, 1.9657805725027142, 1.3934058329729904],
 [-0.20470765948471295, 0.47894333805754824, -0.5194387150567381],
 [-0.55573030434749, 1.9657805725027142, 1.3934058329729904],
 [-0.20470765948471295, 0.47894333805754824, -0.5194387150567381],
 [-0.55573030434749, 1.9657805725027142, 1.3934058329729904],
 [-0.20470765948471295, 0.47894333805754824, -0.5194387150567381],
 [-0.55573030434749, 1.9657805725027142, 1.3934058329729904],
 [-0.20470765948471295, 0.47894333805754824, -0.5194387150567381],
 [-0.55573030434749, 1.9657805725027142, 1.3934058329729904],
 [-0.20470765948471295, 0.47894333805754824, -0.5194387150567381],
 [-0.55573030434749, 1.9657805725027142, 1.3934058329729904],
 [-0.20470765948471295, 0.47894333805754824, -0.5194387150567381],
 [-0.55573030434749, 1.9657805725027142, 1.3934058329729904],
 [-0.20470765948471295, 0.47894333805754824, -0.5194387150567381],
 [-0.55573030434749, 1.9657805

In [61]:
# concatenate
data_list + data_list

[[-0.20470765948471295, 0.47894333805754824, -0.5194387150567381],
 [-0.55573030434749, 1.9657805725027142, 1.3934058329729904],
 [-0.20470765948471295, 0.47894333805754824, -0.5194387150567381],
 [-0.55573030434749, 1.9657805725027142, 1.3934058329729904]]

In [62]:
# instead of ((2, 3), dtype('float64'))
len(data_list), type(data_list)

(2, list)

### Creating ndarrays
The easiest way to create an array is to use the `array` function

In [63]:
data1 = [6, 7.5, 8, 0, 1]
arr1 = np.array(data1)
arr1

array([6. , 7.5, 8. , 0. , 1. ])

lists of lists are automatically converted into a ndarray:

In [64]:
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]
arr2 = np.array(data2)
arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [65]:
print(arr2.ndim)
print(arr2.shape)

2
(2, 4)


### dtype
`np.array` tries to infer the data type for the array
- the data type is stored in a special `dtype` metadata object

In [66]:
print(arr1.dtype)
print(arr2.dtype)

float64
int64


### initialized ndarrays
We can initialize arrays with the dimesion we prefer with zeros, ones, or ranges

In [67]:
np.zeros(10)
np.zeros((3, 6))
np.empty((2, 3, 2))

array([[[0.0000e+000, 2.4703e-322],
        [0.0000e+000, 0.0000e+000],
        [2.1432e-312, 3.6978e-062]],

       [[7.7318e-091, 2.6507e-032],
        [1.5247e-052, 1.5362e-051],
        [3.9991e+252, 1.6092e+295]]])

In [68]:
np.arange(15)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### Data Types for ndarrays
The data type or `dtype` is a special object containing the metadata (i.e., data about data) the ndarray needs to interpret a chunk of memory as a particular type of data

- `dtypes` are a source of NumPy’s flexibility for interacting with data coming from other systems
- in most cases they provide a mapping directly onto an underlying disk or memory representation
    - makes it easy to read and write binary streams of data to disk 
    - makes it easy to connect to code written in a low-level language like C or Fortran

In [69]:
arr1 = np.array([1, 2, 3], dtype=np.float64)
arr2 = np.array([1, 2, 3], dtype=np.int32)
print(arr1.dtype)
print(arr2.dtype)

float64
int32


In [70]:
arr = np.array([1, 2, 3, 4, 5])
arr.dtype
float_arr = arr.astype(np.float64)
float_arr.dtype

dtype('float64')

In [71]:
arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])
print(arr)
print(arr.astype(np.int32))

[ 3.7 -1.2 -2.6  0.5 12.9 10.1]
[ 3 -1 -2  0 12 10]


In [72]:
numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.string_)
numeric_strings.astype(float)

array([ 1.25, -9.6 , 42.  ])

In [73]:
int_array = np.arange(10)
calibers = np.array([.22, .270, .357, .380, .44, .50], dtype=np.float64)
int_array.astype(calibers.dtype)

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

### reference copy
Remeber that as for non-scalar object, we are always copying only the reference

In [80]:
ref_copy = int_array
ref_copy[0]=1
int_array

array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9])

#### hard copy needs its command

In [84]:
hard_copy = int_array.copy()
hard_copy[0] = 222
int_array

array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9])

### Arithmetic with NumPy Arrays
- Arrays are important because they enable you to express batch operations on data **without writing any for loops**
- NumPy users call this *vectorization*
- Any arithmetic operations between equal-size arrays applies the operation element-wise

In [88]:
arr = np.array([[1., 2., 3.], [4., 5., 6.]])
print("arr:")
print(arr)
print()
print("arr * arr:")
print(arr * arr)
print()
print("arr - arr:")
print(arr - arr)

arr:
[[1. 2. 3.]
 [4. 5. 6.]]

arr * arr:
[[ 1.  4.  9.]
 [16. 25. 36.]]

arr - arr:
[[0. 0. 0.]
 [0. 0. 0.]]


In [90]:
arr ** 0.5

array([[1.    , 1.4142, 1.7321],
       [2.    , 2.2361, 2.4495]])

In [21]:
arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])
arr2

array([[False,  True, False],
       [ True, False,  True]])

In [91]:
arr2 > arr

array([[False, False, False],
       [False, False, False]])

### Basic Indexing and Slicing
One-dimensional arrays are simple:
- interface very similarly to Python lists

In [93]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

#### slices are views on the original array
- An important first distinction from Python’s built-in lists is that array slices are views on the original array

In [98]:
print(arr[5])
print(arr[5:8])
arr[5:8] = 12
arr

12
[12 12 12]


array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

#### broadcasting
- when we assign the same value to all the element of a slice

In [102]:
arr_slice = arr[5:8]
arr_slice

array([64, 64, 64])

In [103]:
arr_slice[1] = 12345
arr

array([    0,     1,     2,     3,     4,    64, 12345,    64,     8,
           9])

In [104]:
# broadcasting
arr_slice[:] = 64
arr

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

#### 2d array
In a two-dimensional array, the elements at each index are no longer scalars but rather one-dimensional arrays

In [107]:
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr2d[2]

array([7, 8, 9])

In [108]:
arr2d[0][2]
arr2d[0, 2]

3

In [109]:
# 3d array, and so forth
arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [110]:
arr3d[0]

array([[1, 2, 3],
       [4, 5, 6]])

In [113]:
old_values = arr3d[0].copy()
arr3d[0] = 42
arr3d

array([[[42, 42, 42],
        [42, 42, 42]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [114]:
arr3d[0] = old_values
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

#### Indexing with slices

In [115]:
print(arr)
arr[1:6]

[ 0  1  2  3  4 64 64 64  8  9]


array([ 1,  2,  3,  4, 64])

In [123]:
print(arr2d)
arr2d[:2]

[[1 2 3]
 [4 5 6]
 [7 8 9]]


array([[1, 2, 3],
       [4, 5, 6]])

In [124]:
arr2d[:2, 1:]

array([[2, 3],
       [5, 6]])

In [125]:
arr2d[1, :2]

array([4, 5])

In [126]:
arr2d[:2, 2]

array([3, 6])

### Boolean Indexing
- with inline expression we can create array of booleans

In [146]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
data = np.random.randn(7, 4)
names,data

(array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], dtype='<U4'),
 array([[ 0.6692, -1.649 , -2.2528, -1.1668],
        [ 0.3536,  0.7021, -0.2746, -0.1391],
        [ 0.1077, -0.6065, -0.4171, -0.017 ],
        [-1.2241, -1.8008,  1.6347,  0.989 ],
        [ 0.4579,  0.5552,  1.3067, -0.4406],
        [-0.3014,  0.4988, -0.824 ,  1.3206],
        [ 0.508 , -0.6534,  0.187 , -0.3917]]))

In [147]:
names == 'Bob'

array([ True, False, False,  True, False, False, False])

#### masks
- can create masks to filter data

In [153]:
data[names == 'Bob']

array([[ 0.6692, -1.649 , -2.2528, -1.1668],
       [-1.2241, -1.8008,  1.6347,  0.989 ]])

#### ~
`~` inverts teh condition

In [154]:
names != 'Bob'
data[~(names == 'Bob')]

array([[ 0.3536,  0.7021, -0.2746, -0.1391],
       [ 0.1077, -0.6065, -0.4171, -0.017 ],
       [ 0.4579,  0.5552,  1.3067, -0.4406],
       [-0.3014,  0.4988, -0.824 ,  1.3206],
       [ 0.508 , -0.6534,  0.187 , -0.3917]])

In [155]:
cond = names == 'Bob'
data[~cond]

array([[ 0.3536,  0.7021, -0.2746, -0.1391],
       [ 0.1077, -0.6065, -0.4171, -0.017 ],
       [ 0.4579,  0.5552,  1.3067, -0.4406],
       [-0.3014,  0.4988, -0.824 ,  1.3206],
       [ 0.508 , -0.6534,  0.187 , -0.3917]])

#### masks
- we can build "masks" with boolean operators

In [156]:
mask = (names == 'Bob') | (names == 'Will')
mask
data[mask]

array([[ 0.6692, -1.649 , -2.2528, -1.1668],
       [ 0.1077, -0.6065, -0.4171, -0.017 ],
       [-1.2241, -1.8008,  1.6347,  0.989 ],
       [ 0.4579,  0.5552,  1.3067, -0.4406]])

we can create a mask on the fly, without assigning a variable

In [158]:
data[data < 0] = 0
data

array([[0.6692, 0.    , 0.    , 0.    ],
       [0.3536, 0.7021, 0.    , 0.    ],
       [0.1077, 0.    , 0.    , 0.    ],
       [0.    , 0.    , 1.6347, 0.989 ],
       [0.4579, 0.5552, 1.3067, 0.    ],
       [0.    , 0.4988, 0.    , 1.3206],
       [0.508 , 0.    , 0.187 , 0.    ]])

In [159]:
data[names != 'Joe'] = 7
data

array([[7.    , 7.    , 7.    , 7.    ],
       [0.3536, 0.7021, 0.    , 0.    ],
       [7.    , 7.    , 7.    , 7.    ],
       [7.    , 7.    , 7.    , 7.    ],
       [7.    , 7.    , 7.    , 7.    ],
       [0.    , 0.4988, 0.    , 1.3206],
       [0.508 , 0.    , 0.187 , 0.    ]])

### Transposing Arrays and Swapping Axes
NumPy is commonly used also for matrix computatoin
- opearations on matrixes are very efficient
- provides seveal methods for handling operations

For instance:
- `.reshape` reshapes the array into a matrix with the specified dimensions
- `.T` returns the matrix transpose
- `np.dot` performs the matrix multiplication

In [166]:
arr = np.arange(15).reshape((3, 5))
print(arr)
print(arr.reshape((5,3)))

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]
[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [12 13 14]]


In [167]:
arr.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [168]:
arr = np.random.randn(6, 3)
arr
np.dot(arr.T, arr)

array([[ 4.5178,  0.5218, -2.2436],
       [ 0.5218,  6.9882, -5.3694],
       [-2.2436, -5.3694,  6.7075]])

## Universal Functions: Fast Element-Wise Array Functions
NumPy allows also to conveniently apply functions to each elenent of a ndarray

In [173]:
arr = np.arange(10)

print(arr)
print()
print(np.sqrt(arr))
print()
print(np.exp(arr))

[0 1 2 3 4 5 6 7 8 9]

[0.     1.     1.4142 1.7321 2.     2.2361 2.4495 2.6458 2.8284 3.    ]

[   1.        2.7183    7.3891   20.0855   54.5982  148.4132  403.4288
 1096.6332 2980.958  8103.0839]


### Mathematical and Statistical Methods
A set of mathematical functions that compute statistics about an entire array or about the data along an axis are accessible as methods of the array class
- we can use aggregations (often called reductions) like:
    - sum
    - mean
    - std (standard deviation) 

In [181]:
# arr = np.random.randn(5, 4)
arr = np.arange(20).reshape(5,4)
print(arr)
print(arr.mean())
# equivalent
print(np.mean(arr))

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]]
9.5
9.5


In [182]:
arr.sum()

190

we can even perform the aggregation on the axis:
- aggregation on the rows: axis=0
- aggregation on the columns: axis=1

In [187]:
arr.mean(axis=1)

array([ 1.5,  5.5,  9.5, 13.5, 17.5])

In [188]:
arr.sum(axis=0)

array([40, 45, 50, 55])

### Sorting
NumPy sorts **in-place**

In [201]:
arr = np.random.randn(6)
arr

array([-0.2589, -0.5816, -1.2604,  0.4646, -1.0702,  0.8042])

In [202]:
arr.sort()
arr

array([-1.2604, -1.0702, -0.5816, -0.2589,  0.4646,  0.8042])

In [203]:
arr = np.random.randn(5, 3)
arr

array([[-0.1567,  2.0104, -0.8871],
       [-0.9779, -0.2672,  0.4833],
       [-0.4003,  0.4499,  0.3996],
       [-0.1516, -2.5579,  0.1608],
       [ 0.0765, -0.2972, -1.2943]])

In [204]:
arr.sort(1)
arr

array([[-0.8871, -0.1567,  2.0104],
       [-0.9779, -0.2672,  0.4833],
       [-0.4003,  0.3996,  0.4499],
       [-2.5579, -0.1516,  0.1608],
       [-1.2943, -0.2972,  0.0765]])

## File Input and Output with Arrays
NumPy is able to save and load data to and from disk
- Arrays are saved by default in an uncompressed raw binary format with file extension `.npy`

In [210]:
arr = np.arange(10)
np.save('some_array', arr)

In [211]:
np.load('some_array.npy')

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [212]:
np.savez('array_archive.npz', a=arr, b=arr)

In [213]:
arch = np.load('array_archive.npz')
arch['b']

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [221]:
np.savez_compressed('arrays_compressed.npz', a=arr, b=arr)

In [224]:
a = np.load('arrays_compressed.npz')
a['b']

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [225]:
!rm some_array.npy
!rm array_archive.npz
!rm arrays_compressed.npz

rm: some_array.npy: No such file or directory
rm: array_archive.npz: No such file or directory


## Linear Algebra
Linear algebra, like matrix multiplication, decompositions, determinants, and other square matrix math, is an important part of any array library

In [226]:
x = np.array([[1., 2., 3.], [4., 5., 6.]])
y = np.array([[6., 23.], [-1, 7], [8, 9]])
x.dot(y)

array([[ 28.,  64.],
       [ 67., 181.]])

In [227]:
np.dot(x, y)

array([[ 28.,  64.],
       [ 67., 181.]])

In [228]:
np.dot(x, np.ones(3))

array([ 6., 15.])

#### The @ symbol
`@` works as an infix operator that performs matrix multiplication

In [229]:
x @ np.ones(3)

array([ 6., 15.])

### numpy.linalg 
`numpy.linalg ` library has a standard set of matrix decompositions and things like inverse and determinant
- implemented under the hood via the same industry-standard linear algebra libraries

In [231]:
from numpy.linalg import inv, qr
X = np.random.randn(5, 5)
mat = X.T.dot(X)
inv(mat)
mat.dot(inv(mat))
q, r = qr(mat)
r

array([[-9.6581,  2.9842,  4.0613,  2.4873,  3.939 ],
       [ 0.    , -3.2092, -0.0879, -0.537 , -2.3627],
       [ 0.    ,  0.    , -1.185 ,  0.1753, -0.1581],
       [ 0.    ,  0.    ,  0.    , -3.9604, -1.0656],
       [ 0.    ,  0.    ,  0.    ,  0.    ,  1.689 ]])