# NUMERICAL PYTHON

## Motivation to use numpy
- Dynamic typing in Python provides flexibility, but it costs memory and efficiency. NumPy and Pandas address these issues.
- Vectorized operations are not supported in plain Python because of the need for dynamic type inference. NumPy and Pandas support these operations. Vectorized operations are preferred over loops.


## Importing and verifying the version

In [4]:
import numpy

In [5]:
numpy.__version__

'1.15.1'

## Import as alias

In [6]:
import numpy as np

In [7]:
np.__version__

'1.15.1'

## Builtin documentation

In [7]:
# np.<tab>

In [8]:
np?

In [10]:
np.add?

## Dynamic type inference

Pros:
- A name can be rebound to a value of a different type, e.g. `x` is switched from an integer to a string below

Cons:
- The size of the object will change based on the inferred type
- Extra information should be stored in a data structure
- Simple integer requires 28 bytes

In [2]:
from sys import getsizeof as byte_size

In [3]:
x = 1
byte_size(x)

28

In [14]:
# Extra information stored for the integer object

dir (x)

['__abs__',
 '__add__',
 '__and__',
 '__bool__',
 '__ceil__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floor__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__le__',
 '__lshift__',
 '__lt__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rlshift__',
 '__rmod__',
 '__rmul__',
 '__ror__',
 '__round__',
 '__rpow__',
 '__rrshift__',
 '__rshift__',
 '__rsub__',
 '__rtruediv__',
 '__rxor__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__trunc__',
 '__xor__',
 'bit_length',
 'conjugate',
 'denominator',
 'from_bytes',
 'imag',
 'numerator',
 'real',
 'to_bytes']

In [15]:
x = 'twenty'

In [16]:
byte_size(x)

55

In [17]:
dir(x)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmod__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'capitalize',
 'casefold',
 'center',
 'count',
 'encode',
 'endswith',
 'expandtabs',
 'find',
 'format',
 'format_map',
 'index',
 'isalnum',
 'isalpha',
 'isascii',
 'isdecimal',
 'isdigit',
 'isidentifier',
 'islower',
 'isnumeric',
 'isprintable',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'ljust',
 'lower',
 'lstrip',
 'maketrans',
 'partition',
 'replace',
 'rfind',
 'rindex',
 'rjust',
 'rpartition',
 'rsplit',
 'rstrip',
 'split',
 'splitlines',
 'startswith',
 'strip',
 'swapcase',
 'title',
 'translate',
 'upper',
 'zfill']


In [19]:
my_list = [1,"two", 4.8, True]
my_list_types = [type(x) for x in my_list]

In [20]:
my_list_types

[int, str, float, bool]

In [21]:
byte_size(my_list)

96

In [22]:
sum?

In [40]:
sum([byte_size(x) for x in my_list])

132

## Memory: Python List vs NumPy Array

NumPy Array
- Contiguous memory space
- All items of the same type
- Less memory footprint. Redundant info is not stored.
- Efficient storage and manipulation.
- Control over how much memory each item will take(int8, int16)
- An array contains a single pointer to one contiguous block of data

Python List
- Dynamic type
- Non contiguous memory
- Items can be heterogeneous; each item is stored as a separate object
- Each item takes more memory to support dynamic typing. Each item contains its own information. type_info, reference count, etc
- Contains a pointer to a block of pointers. Each pointer points to an object.

In [41]:
plist = [1,2,3,4,5,6]
print(byte_size(plist))
print(sum([byte_size(x) for x in plist]))

112
168


In [42]:
print([byte_size(x) for x in plist])

[28, 28, 28, 28, 28, 28]


In [56]:
# int8, int16, int32, int64 are supported

nparray = np.array(plist, dtype='int8')

In [57]:
nparray

array([1, 2, 3, 4, 5, 6], dtype=int8)

In [58]:
[item.nbytes for item in nparray]

[1, 1, 1, 1, 1, 1]

In [60]:
# Get the id for every item in the np array

[id(item) for item in nparray]

[139907889945864,
 139907889945888,
 139907889945864,
 139907889945888,
 139907889945864,
 139907889945888]

## NumPy array attributes.

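- .ndim - number of dimensions
- .size - total number of elements in the array (np.size(a, axis = k) gives the number of elements along axis k)
- .shape - tuple of dimensions
- axis - not an attribute; a keyword argument (e.g. np.size(a, axis = 0)) that selects the dimension of interest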

In [8]:
l = [1,2,3,4,5,6]

In [9]:
a = np.array(l)

In [10]:
a

array([1, 2, 3, 4, 5, 6])

In [14]:
a.ndim

1

In [17]:
a.size

6

In [23]:
np.size(a, axis = 0)

6

In [24]:
a.shape

(6,)

In [30]:
b = a

In [31]:
b.reshape(3,2)

array([[1, 2],
       [3, 4],
       [5, 6]])

In [36]:
b = b.reshape(3,2)

In [37]:
b.ndim

2

In [38]:
b.size

6

In [39]:
b.shape

(3, 2)

In [46]:
np.size(b, axis = 0) # 0 - rows

3

In [45]:
np.size(b, axis = 1) # 1 - columns

2

In [25]:
arr = np.array([[1,2,3],[4,5,6]])

In [26]:
arr

array([[1, 2, 3],
       [4, 5, 6]])

In [27]:
print(arr.ndim)
print(np.ndim(arr))

2
2


In [67]:
print(arr.shape)
print(np.shape(arr))

(2, 3)
(2, 3)


In [70]:
print(arr.size)
print(np.size(arr))

6
6


In [47]:
np.array?

In [48]:
np.shape?

In [49]:
np.size?

In [50]:
np.ndim?

In [52]:
np.reshape?

# Creating a numpy array

## Empty
- uninitialized array with whatever contents that are in the memory

In [56]:
a = np.empty(shape = (2,3), dtype = float, order = 'F') 

In [57]:
a

array([[3.31033942e-033, 4.66263921e-086, 6.01433264e+175],
       [2.99645216e-066, 3.35759734e-143, 6.93885958e+218]])

## From scratch with predefined values

### Filled with zeros

In [61]:
a = np.zeros(10, dtype=int)

In [62]:
a

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [63]:
np.zeros?

### Filled with ones

In [64]:
b = np.ones(10)

In [65]:
b

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [66]:
b = np.ones(10, dtype = int)

In [67]:
b

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [68]:
b = np.ones(10, dtype = int, order = 'C')

In [69]:
b

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
np.ones?

### With default values

In [71]:
c = np.full?

In [72]:
c = np.full(shape=(10), fill_value=10)

In [73]:
c

array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10])

In [74]:
c = np.full(shape=(5,2), fill_value=10)

In [75]:
c

array([[10, 10],
       [10, 10],
       [10, 10],
       [10, 10],
       [10, 10]])

### Identity matrix

In [79]:
np.eye(3,dtype=int)

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])

In [78]:
np.eye?

## From list objects

In [80]:
a = np.array([1,2,3,'Hello'])

In [81]:
a

array(['1', '2', '3', 'Hello'], dtype='<U21')

In [82]:
b = np.array([1,2,3,4])

In [83]:
b

array([1, 2, 3, 4])

In [85]:
c = np.array([[1,2,3,4],[5,6,7,8]])

In [86]:
c

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [89]:
# This will create an array of lists not a matrix
# NOTE(review): the output shown is from the NumPy 1.15.1 used in this notebook;
# newer NumPy releases are expected to reject ragged input unless dtype=object
# is passed explicitly -- verify against the installed version.

c = np.array([[1,2,3,4],[5,6,7]])

In [88]:
c

array([list([1, 2, 3, 4]), list([5, 6, 7])], dtype=object)

In [92]:
d = [range(i, i + 3) for i in [2,4,6]]

In [93]:
e = np.array(d)

In [94]:
e

array([[2, 3, 4],
       [4, 5, 6],
       [6, 7, 8]])

## Array with a linear sequence of values in a range

#### np.arange() is the numpy equivalent of range()

In [95]:
np.arange?

In [102]:
a = np.arange(0,10,step = 2, dtype=int)

In [103]:
a

array([0, 2, 4, 6, 8])

In [104]:
b = np.arange(10,0,step = -1, dtype=int)

In [105]:
b

array([10,  9,  8,  7,  6,  5,  4,  3,  2,  1])

#### np.linspace() array of evenly spaced values in the range

In [106]:
np.linspace?

In [112]:
np.linspace(0,1,3, dtype = float)

array([0. , 0.5, 1. ])

## Array of random numbers

### Uniform distribution of floats from the half-open interval [0.0, 1.0)

In [116]:
np.random.random(10)

array([0.57971642, 0.55116972, 0.89785902, 0.2544817 , 0.42693444,
       0.56993594, 0.20633597, 0.04811851, 0.15344864, 0.12202174])

In [120]:
np.random.random(size = (2,3))

array([[0.80017612, 0.40287922, 0.42060412],
       [0.64690315, 0.91761552, 0.51589915]])

### Normal distribution
- loc - mean
- scale - std deviation(width)

In [124]:
np.random.normal(loc=2,scale = 2, size = 10)

array([3.52682395, 1.41599432, 1.57605524, 1.16437636, 0.97869334,
       3.01780074, 0.01008538, 2.80383792, 4.24019791, 1.98624762])

In [125]:
np.random.normal(loc=2,scale = 2, size = (3,2))

array([[ 0.04730499,  1.86003012],
       [ 0.47688471, -1.52356068],
       [ 0.14971043,  2.09571249]])

### Random numbers from an interval

In [126]:
np.random.randint?

In [128]:
np.random.randint(0,10, size = (2,1))

array([[1],
       [7]])

In [130]:
np.random.randint(0,10, size = 10)

array([0, 6, 6, 4, 7, 3, 2, 5, 8, 5])

## Using builtin ADTs

### Tuples

In [134]:
t = (1,2,3)

In [135]:
a = np.array(t)

In [136]:
a

array([1, 2, 3])

### Sets

In [138]:
s = {1,2,3,4,5}

In [139]:
a = np.array(s)

In [140]:
a

array({1, 2, 3, 4, 5}, dtype=object)

### Dictionary

In [141]:
d = {'language':'python'}

In [142]:
a = np.array(d)

In [143]:
a

array({'language': 'python'}, dtype=object)

# Order - Column wise Vs. Row wise
- Order = 'F' - Store multidimensional data in a Fortran contiguous memory, i.e. column major. Use this when access within a column is frequent
- Order = 'C' - Store multidimensional data in a C contiguous memory, i.e. row major. Use this when access within a row is frequent

In [157]:
a = [range(i,i + 4) for i in [2,3,5,4]]

In [158]:
a

[range(2, 6), range(3, 7), range(5, 9), range(4, 8)]

In [159]:
x = np.array(a, order = 'C')

In [160]:
x

array([[2, 3, 4, 5],
       [3, 4, 5, 6],
       [5, 6, 7, 8],
       [4, 5, 6, 7]])

In [161]:
y = np.array(a, order = 'F')

In [162]:
y

array([[2, 3, 4, 5],
       [3, 4, 5, 6],
       [5, 6, 7, 8],
       [4, 5, 6, 7]])

In [163]:
x.shape

(4, 4)

In [164]:
x.shape[0]

4

In [165]:
x.shape[1]

4

In [171]:
np.min(x, axis = 0)

array([2, 3, 4, 5])

In [172]:
np.max(x, axis = 0)

array([5, 6, 7, 8])

# Loops vs Vectorized operations

Vectorized operations:
- The same operation is applied to each element of the array or each subarray
- e.g., the sum of all elements in each row of the 2-d array
- Generally implemented through NumPy's universal functions (ufuncs):
- e.g., np.sum(), np.multiply()

Loops are slow for such operations:
- Orders of magnitude slower than the vectorized equivalent (compare the timings below)

In [173]:
from timeit import default_timer as time

In [192]:
x = 1000
# Row k (k = 1..x) holds the x consecutive integers k, k+1, ..., k+x-1.
# range objects are immutable, so the same list of rows can feed both arrays.
rows = [range(i, i + x) for i in range(1, x + 1)]
a = np.array(rows, order='F')  # Fortran-contiguous (column-major) copy
b = np.array(rows, order='C')  # C-contiguous (row-major) copy

In [193]:
# Axis 1 indexes columns, so the column count is the second entry of the shape.
ncol = a.shape[1]
ncol

1000

In [194]:
# Axis 0 indexes rows, so the row count is the first entry of the shape.
nrow = a.shape[0]
nrow

1000

In [203]:
# Time an explicit double loop that sums every row, element by element,
# first over `a` and then over `b`; the elapsed wall-clock time is printed
# for each array.
rsum = np.zeros(nrow, dtype=int)

# Pass over `a` (reported as the F-array).
start = time()
for i in range(nrow):
    rsum[i] = 0
    for j in range(ncol):
        rsum[i] += a[i, j]
end = time()
print("Row sum time for F-array: {0:1.2E} sec".format(end - start))


# Pass over `b` (reported as the C-array); each slot is reset before reuse.
start = time()
for i in range(nrow):
    rsum[i] = 0
    for j in range(ncol):
        rsum[i] += b[i, j]
end = time()
print("Row sum time for C-array: {0:1.2E} sec".format(end - start))

Row sum time for F-array: 4.68E-01 sec
Row sum time for C-array: 3.91E-01 sec


In [201]:
# Time an explicit double loop that sums every column, element by element,
# first over `a` and then over `b`; the elapsed wall-clock time is printed
# for each array.
csum = np.zeros(ncol, dtype=int)

# Pass over `a` (reported as the F-array).
start = time()
for j in range(ncol):
    csum[j] = 0
    for i in range(nrow):
        csum[j] += a[i, j]
end = time()
print("Column sum time for F-array: {0:1.2E} sec".format(end - start))


# Pass over `b` (reported as the C-array); each slot is reset before reuse.
start = time()
for j in range(ncol):
    csum[j] = 0
    for i in range(nrow):
        csum[j] += b[i, j]
end = time()
print("Column sum time for C-array: {0:1.2E} sec".format(end - start))
#csum

Column sum time for F-array: 4.54E-01 sec
Column sum time for C-array: 4.13E-01 sec


## Vectorized sum

In [211]:
how_big = 1000
# Row k (k = 1..how_big) holds the consecutive integers k .. k+how_big-1.
# range objects are immutable, so one list of rows can feed both arrays.
rows = [range(i, i + how_big) for i in range(1, how_big + 1)]
fa = np.array(rows, order='F')  # Fortran-contiguous (column-major) copy
ca = np.array(rows, order='C')  # C-contiguous (row-major) copy
# Axis 0 counts rows, axis 1 counts columns.
nrow, ncol = fa.shape

In [212]:
%timeit np.sum(fa, axis = 0)
%timeit np.sum(fa, axis = 1)

717 µs ± 30.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
536 µs ± 10.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [213]:
%timeit np.sum(ca, axis = 0)
%timeit np.sum(ca, axis = 1)

560 µs ± 66.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
732 µs ± 30.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Coercion: Upcasting
- Implicit coercion: if the types of the array elements do NOT match, NumPy will upcast them to a common type if possible.
- `dtype=`: explicitly sets the element type.

In [215]:
np.array([1,2,1.9])

array([1. , 2. , 1.9])

In [216]:
np.array([1.1,'2'])

array(['1.1', '2'], dtype='<U32')

# INDEXING, SLICING, STRIDING, NO COPY VIEW

## Key Points

#### Indexing
- Index is an integer
- Index counting starts from 0
- Negative indices are ok
- Specified in [] brackets
- range of valid index values along axis `axis_value`:
    - non-negative - 0 to np.size(a, axis = axis_value) - 1
    - negative - -1 down to -np.size(a, axis = axis_value)
- For multidimensional - comma separated indices for each axis


#### Slicing and striding
- [start:stop:step] square brackets specifying the pattern
- each element given above is optional 
- negative values are allowed
- rules for the range are the same as for an index
- for multidimensional slice: a comma-separated tuple of subarray access patterns along each axis
- [start, stop) half open interval

#### Subarray is a view and not a copy
- changes in the sub array change the main array
- use np.copy() to explicitly create a copy of the subarray, so that modifying the copy does not modify the original array.

## Indexing Single elements

### Zero based indexing

In [223]:
a = np.array([1,2,3,4,5,6,7,8,9])

In [224]:
a[0]

1

In [225]:
a[8]

9

In [227]:
b = a.reshape?

In [228]:
b = a.reshape((3,3))

In [229]:
b

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [230]:
b[0,0]

1

In [231]:
b[2,2]

9

### Negative indexing

In [236]:
b[-2,-3]

4

In [237]:
b[-1,-1]

9

### Index range

In [239]:
a = np.arange(1,10)

In [240]:
a

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [241]:
a.size

9

In [242]:
a[0]

1

In [243]:
a[np.size(a)-1]

9

In [244]:
a[-2] == a[np.size(a) - 2]

True

In [245]:
a[np.size(a)]

IndexError: index 9 is out of bounds for axis 0 with size 9

In [247]:
b[np.size(b, axis=0) - 1]

array([7, 8, 9])

In [248]:
b[-np.size(b, axis = 0)]

array([1, 2, 3])

### Multidimensional indexing

In [253]:
m = np.random.randint(1,10, size = (3,4))

In [254]:
m

array([[7, 3, 4, 7],
       [3, 5, 4, 2],
       [1, 7, 3, 8]])

In [255]:
m[0,0]

7

In [256]:
m[2,3]

8

In [260]:
m[np.size(m, axis = 0)-1, np.size(m, axis = 1) -1]

8

In [261]:
m[-1,-1]

8

In [262]:
m[-np.size(m, axis = 0), -np.size(m, axis = 1)]

7

## Slicing and Striding

### 1 D subarray

In [263]:
a = np.arange(10)

In [264]:
a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [266]:
a[4:9]

array([4, 5, 6, 7, 8])

In [267]:
a[::-1]

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

In [269]:
a[1::2]

array([1, 3, 5, 7, 9])

In [270]:
a[::2]

array([0, 2, 4, 6, 8])

### Half open interval

In [277]:
a = np.arange(0,5)
b = np.arange(5,11)

In [279]:
c = np.concatenate((a,b))

In [275]:
c = np.concatenate([a,b])

In [276]:
c

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

## Multidimensional subarray

In [280]:
a = np.arange(12)

In [281]:
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [283]:
b = a.reshape(4,3)

In [284]:
b

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [285]:
b[:,:1]

array([[0],
       [3],
       [6],
       [9]])

In [286]:
# One slice per axis, comma-separated: rows 1.. and every column.
# (Chaining `[1:][:]` would apply both slices to axis 0 instead.)
b[1:, :]

array([[ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [287]:
b[::-1,::-1]

array([[11, 10,  9],
       [ 8,  7,  6],
       [ 5,  4,  3],
       [ 2,  1,  0]])

### Accessing rows or columns

In [290]:
b[:,0]

array([0, 3, 6, 9])

In [291]:
b[0,:]

array([0, 1, 2])

## Subarray as No-Copy view

In [293]:
a = np.arange(12)

In [294]:
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [295]:
b = a.reshape(3,4)

In [296]:
b

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [306]:
c = b[1:,2:]

In [307]:
c

array([[ 6,  7],
       [10, 11]])

In [309]:
c[0,0] = 999

In [310]:
b

array([[  0,   1,   2,   3],
       [  4,   5, 999,   7],
       [  8,   9,  10,  11]])

In [311]:
d = np.copy?

In [316]:
d = np.copy(c)

In [317]:
d

array([[999,   7],
       [ 10,  11]])

In [318]:
d[0,0] = 1111

In [319]:
d

array([[1111,    7],
       [  10,   11]])

In [320]:
c

array([[999,   7],
       [ 10,  11]])

In [321]:
b

array([[  0,   1,   2,   3],
       [  4,   5, 999,   7],
       [  8,   9,  10,  11]])