# Programming for Data Science (Python)

In [10]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

<p style="font-family: Arial; font-size:1.75em;color:#2462C0; font-weight:bold"><br>

Create Rank 1 numpy arrays:
</p>

In [16]:
import numpy as np
# Build a rank 1 (one-dimensional) ndarray from a Python list
an_array = np.array([3,33,333])
an_array
# np.array returns a numpy.ndarray instance
type(an_array)

array([  3,  33, 333])

numpy.ndarray

In [7]:
# test the shape of the array we just created, it should have just one dimension (Rank 1)
an_array.shape
# because this is a rank 1 array, we need only one index to access each element
an_array[0]
# ndarrays are mutable, here we change an element of the array in place
an_array[0] = 88
an_array

(3,)

3

array([ 88,  33, 333])

<p style="font-family: Arial; font-size:1.75em;color:#2462C0; font-weight:bold"><br>

Create a Rank 2 numpy array:</p>

A rank 2 **ndarray** is one with two dimensions.  Notice the format below of [ [row] , [row] ].  2 dimensional arrays are great for representing matrices which are often useful in data science.

In [10]:
# Create a rank 2 array from a list of rows (2 rows x 3 columns)
another = np.array([[11,12,13],[21,22,23]])
another
another.shape
# Two indices are needed for a single element: [row, column], both zero-based
another[0,0]
another[1,2]

array([[11, 12, 13],
       [21, 22, 23]])

(2, 3)

11

23

In [9]:
# A single row wrapped in an outer list is still rank 2: shape (1, 3), not (3,)
another2 = np.array([[3,33,333]])
another2
another2.shape

array([[  3,  33, 333]])

(1, 3)

<p style="font-family: Arial; font-size:1.75em;color:#2462C0; font-weight:bold"><br>

There are many ways to create numpy arrays:
</p>

Here we create a number of arrays with different shapes and different pre-filled values.  NumPy has a number of built-in functions which help us quickly and easily create multidimensional arrays.

In [22]:
# create a 2x2 array of zeros
ex1 = np.zeros((2,2))
ex1
# create a 2x2 array filled with 9.0
ex2 = np.full((2,2),9.0)
# print() shows the plain-text form (no "array(...)" wrapper), unlike a bare expression
print(ex2)
# create a 1x2 array of ones
ex3 = np.ones((1,2))
ex3


array([[0., 0.],
       [0., 0.]])

[[9. 9.]
 [9. 9.]]


array([[1., 1.]])

In [14]:
# notice that the above ndarray (ex3) is actually rank 2, it is a 1x2 array
ex3.shape

# which means we need to use two indexes to access an element
ex3[0,0]

(1, 2)

1.0

In [15]:
# create a 2x2 array of random floats in the half-open interval [0, 1)
# (no seed has been set at this point, so the values differ from run to run)
ex4 = np.random.random((2,2))
ex4

array([[ 0.29421478,  0.28746733],
       [ 0.32953505,  0.82803003]])

# Datatypes

In [16]:
# NumPy (not Python) picks the data type when none is given; np.zeros produced float64 here
ex1.dtype

dtype('float64')

In [17]:
# NumPy infers the data type from the values: integer input gives the default integer type
# (int32 in the recorded output; typically int64 on other platforms)
ex5 = np.array([1,2,3])
ex5.dtype

dtype('int32')

In [18]:
# You can also tell NumPy the data type explicitly
ex5 = np.array([1,2,3], dtype = np.int64)
ex5.dtype

dtype('int64')

In [19]:
# you can use this to force floats into integers; the fractional part is
# discarded (truncation toward zero, not rounding), so 3.8 becomes 3
ex6 = np.array([1.5,2.2,3.8], dtype = np.int64)
ex6

array([1, 2, 3], dtype=int64)

In [None]:
# you can use this to force integers into floats if you anticipate
# the values may change to floats later
ex7 = np.array([1,2,3], dtype = np.float64)
ex7


# Array Indexing

<p style="font-family: Arial; font-size:1.75em;color:#2462C0; font-weight:bold"><br>
Slice indexing:
</p>

Similar to the use of slice indexing with lists and strings, we can use slice indexing to pull out sub-regions of ndarrays.

In [30]:
# Rank 2 array of shape (3, 4)
an_array = np.array([[11,12,13,14],[21,22,23,24],[31,32,33,34]])
an_array
# Use array slicing to get a subarray consisting of 2 rows x 2 columns
# (rows 0-1, columns 1-2).
a_slice = an_array[:2,1:3]
a_slice
# A slice is a view onto the same data: modifying it modifies the underlying array.
print("before: ", an_array[0,1])
a_slice[0,0] = 1000
print("after: ", an_array[0,1])
# To get an independent copy instead, pass the slice to np.array(), which copies the data.
another_slice = np.array(an_array[:2,1:3])

array([[11, 12, 13, 14],
       [21, 22, 23, 24],
       [31, 32, 33, 34]])

array([[12, 13],
       [22, 23]])

before:  12
after:  1000


array([[  11, 1000,   13,   14],
       [  21,   22,   23,   24]])

In [40]:
# You may generate an array of lower rank
row_rank1 = an_array[1]
row_rank1.shape
row_rank1
# Or an array of the same rank as the an_array
row_rank2= an_array[:2,:]
row_rank2.shape
row_rank2
#We can do the same thing for columns of an array:
# an integer column index drops a dimension (rank 1) ...
col_rank1 = an_array[:, 1]
col_rank1.shape
col_rank1
# ... while a slice keeps the result at rank 2 (a single column)
col_rank2 = an_array[:, 1:2]
col_rank2.shape
col_rank2



(4,)

array([21, 22, 23, 24])

(2, 4)

array([[  11, 1000,   13,   14],
       [  21,   22,   23,   24]])

# Fancy indexing: array of indices

Sometimes it's useful to use an array of indexes to access or change elements.

In [25]:
# Create a new 4x3 array to index into
array1 = np.array([[11,12,13],[21,22,23],[31,32,33],[41,42,43]])
array1

array([[11, 12, 13],
       [21, 22, 23],
       [31, 32, 33],
       [41, 42, 43]])

In [26]:
# Create the index arrays: one column index for each row ...
col_indices = np.array([0,1,2,0])
# ... and the matching row indices 0, 1, 2, 3
row_indices = np.arange(4)

In [27]:
# Examine the pairings of row_indices and col_indices.  These are the elements we'll change next.
# zip pairs them position by position: (row_indices[k], col_indices[k]).
for row, col in zip(row_indices, col_indices):
    print(row, ', ',col)

0 ,  0
1 ,  1
2 ,  2
3 ,  0


In [28]:
# Select one element from each row: element k is array1[row_indices[k], col_indices[k]]
print(array1[row_indices, col_indices])

[11 22 33 41]


In [29]:
# Change one element from each row using the indices selected.
# Note: this updates array1 in place, so re-running the cell adds another 1000.
array1[row_indices, col_indices]+=1000
array1

array([[1011,   12,   13],
       [  21, 1022,   23],
       [  31,   32, 1033],
       [1041,   42,   43]])

# Boolean Indexing

In [30]:
# create a filter which will be boolean values for whether each element meets this condition
# NOTE(review): the name `filter` shadows Python's built-in filter() for the rest of the
# session; a name such as `mask` would avoid that, but a later cell refers to `filter`.
filter = (array1 > 15)
filter

array([[ True, False, False],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]], dtype=bool)

Notice that the filter is an ndarray of the same shape as array1, filled with True for each element whose corresponding element in array1 is greater than 15, and False for each element whose value is 15 or less.

In [31]:
# we can now select just those elements which meet that criteria;
# the result is a flat (rank 1) array of the matching values
print(array1[filter])

[1011   21 1022   23   31   32 1033 1041   42   43]


In [33]:
# For short, we could have just used the approach below without the need for the separate filter array.
# Here the condition selects the even values.
array1[array1%2 == 0]

array([  12, 1022,   32,   42])

What is particularly useful is that we can actually change elements in the array applying a similar logical filter.  Let's add 100 to all the even values.

In [34]:
# Add 100 to every even element.  This modifies array1 in place, and the
# values stay even, so re-running the cell adds another 100 each time.
array1[array1%2 == 0]+=100
array1

array([[1011,  112,   13],
       [  21, 1122,   23],
       [  31,  132, 1033],
       [1041,  142,   43]])

# Arithmetic Array Operations:


In [41]:
# Two 2x2 operands for the element-wise examples: integers in x, floats in y.
# `np.int` was only an alias for the built-in `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use `int` directly (same resulting dtype).
x = np.array([[1,2],[12,22]], dtype=int)
y = np.array([[21.1,22.1],[1.1,2.1]], dtype=np.float64)
x
y

array([[ 1,  2],
       [12, 22]])

array([[21.1, 22.1],
       [ 1.1,  2.1]])

In [36]:
# plus: the + operator and np.add give the same element-wise sum
print(x+y)
print(np.add(x,y))

[[ 22.1  24.1]
 [ 13.1  24.1]]
[[ 22.1  24.1]
 [ 13.1  24.1]]


In [37]:
# subtract: the - operator and np.subtract give the same element-wise difference
print(x-y)
print(np.subtract(x,y))

[[-20.1 -20.1]
 [ 10.9  19.9]]
[[-20.1 -20.1]
 [ 10.9  19.9]]


In [38]:
# multiply: element-wise product (not a matrix product)
print(x*y)
print(np.multiply(x,y))

[[ 21.1  44.2]
 [ 13.2  46.2]]
[[ 21.1  44.2]
 [ 13.2  46.2]]


In [39]:
# divide: element-wise division; the / operator and np.divide give the same result
print(x/y)
print(np.divide(x,y))

[[  0.04739336   0.09049774]
 [ 10.90909091  10.47619048]]
[[  0.04739336   0.09049774]
 [ 10.90909091  10.47619048]]


In [40]:
# square root of each element
print(np.sqrt(x))

[[ 1.          1.41421356]
 [ 3.46410162  4.69041576]]


In [41]:
# exponent (e ** x), applied to each element
print(np.exp(x))

[[  2.71828183e+00   7.38905610e+00]
 [  1.62754791e+05   3.58491285e+09]]


<p style="font-family: Arial; font-size:1.75em;color:#2462C0; font-weight:bold"><br>

Let's explore the efficiency of universal functions

</p>

In [42]:
# Loop-based baseline: compute the reciprocal of each element one at a time
np.random.seed(0)
def compute_reciprocals(values):
    """Return a float array holding 1.0 / v for every element v of ``values``.

    Deliberately written as an explicit Python loop so that it can be timed
    against the vectorized expression ``1 / values``.
    """
    reciprocals = np.empty(len(values))
    for position, value in enumerate(values):
        reciprocals[position] = 1.0 / value
    return reciprocals

rarray = np.random.randint(1,10,size = 5)
compute_reciprocals(rarray)

array([ 0.16666667,  1.        ,  0.25      ,  0.25      ,  0.125     ])

In [43]:
# Time the Python-level loop on one million random integers
big_array = np.random.randint(1,100,size = 1000000)
%time compute_reciprocals(big_array)

Wall time: 1.27 s


array([ 0.1       ,  0.01190476,  0.04545455, ...,  0.01428571,
        0.01098901,  0.01149425])

In [44]:
# The same reciprocals as one vectorized expression; compare its wall time with the loop version
%time (1/big_array)

Wall time: 0 ns


array([ 0.1       ,  0.01190476,  0.04545455, ...,  0.01428571,
        0.01098901,  0.01149425])

In [None]:
# Tally the even and odd values with an explicit Python loop
numbers = [1,2,3,4,5,45,67,69]
count_even = 0
count_odd = 0
for i in numbers:
    if i % 2:  # non-zero remainder means the value is odd
        count_odd += 1
    else:
        count_even += 1

# Aggregation functions

In [43]:
# setup a random 2 x 4 matrix of samples drawn from the standard normal distribution
array2 = np.random.randn(2,4)
array2

array([[ 0.72435371,  1.27860881, -0.58547408,  1.56909645],
       [-0.62421091,  0.12853309,  0.54162094,  0.96211669]])

In [47]:
# compute the mean for all elements (no axis given, so the result is a single scalar)
array2.mean()

0.30422290864076057

In [49]:
# compute the means by row (axis = 1 averages across the columns, one value per row)
array2.mean(axis = 1)

array([ 0.45894956,  0.14949626])

In [50]:
# compute the means by column (axis = 0 averages down the rows, one value per column)
array2.mean(axis = 0)

array([ 1.00142515,  0.40591398,  0.41043206, -0.60087955])

In [44]:
# sum all the elements (no axis given, so the result is a single scalar)
array2.sum()

3.994644697387896

In [51]:
# compute the median of each row
np.median(array2, axis = 1)

array([ 0.47104397,  0.11887716])

In [45]:
# compute the median of each column
np.median(array2, axis = 0)

array([ 0.0500714 ,  0.70357095, -0.02192657,  1.26560657])

In [52]:
#sorting
# create a 10 element array of randoms
array3 = np.random.randn(10)
print(array3)
# ndarray.sort() sorts the array in place (ascending), so print it again to see the result
array3.sort()
print(array3)

[-0.81585044 -0.09495964 -0.11991924 -1.27360037 -0.20817263 -1.00912111
 -1.60382836 -1.22308313  0.42019376  0.49824839]
[-1.60382836 -1.27360037 -1.22308313 -1.00912111 -0.81585044 -0.20817263
 -0.11991924 -0.09495964  0.42019376  0.49824839]


In [53]:
#Find unique elements (duplicates removed, result in ascending order)
array4 = np.array([1,2,4,5,2,4,5])
np.unique(array4)

array([1, 2, 4, 5])

# Broadcasting:

Introduction to broadcasting. <br>
For more details, please see: <br>
https://numpy.org/doc/stable/user/basics.broadcasting.html

In [55]:
#Create a 4X3 array of zeros
start = np.zeros((4,3))
start

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [56]:
# create a rank 1 ndarray with 3 values (one per column of `start`)
add_rows = np.array([1,0,2])
add_rows.shape

(3,)

In [57]:
#Add together: add_rows (shape (3,)) is broadcast across each of the 4 rows of start.
# Note: this rebinds `y`, which earlier held the 2x2 float array.
y = start+add_rows
y

array([[ 1.,  0.,  2.],
       [ 1.,  0.,  2.],
       [ 1.,  0.,  2.],
       [ 1.,  0.,  2.]])

In [58]:
# create an ndarray which is 4 x 1 to broadcast across columns
add_cols = np.array([[0,1,2,3]])  # shape (1, 4)
add_cols = add_cols.transpose()  # now shape (4, 1)
add_cols

array([[0],
       [1],
       [2],
       [3]])

In [59]:
# add to each column of 'start' using broadcasting:
# the (4, 1) array is stretched across the 3 columns
y = start + add_cols
y

array([[ 0.,  0.,  0.],
       [ 1.,  1.,  1.],
       [ 2.,  2.,  2.],
       [ 3.,  3.,  3.]])

In [60]:
# a one-element array is broadcast in both dimensions, i.e. added to every element
add_scalar = np.array([1])
print(start+add_scalar)

[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]


In [62]:
# create our 3x4 matrix of zeros
start1 = np.zeros((3,4))
start1

array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])

In [63]:
# create our (4,) array; its length matches the number of columns in start1
addA = np.array([1,2,3,4])
addA

array([1, 2, 3, 4])

In [64]:
# add the two together using broadcasting: addA is added to each of the 3 rows
b = start1+addA
b

array([[ 1.,  2.,  3.,  4.],
       [ 1.,  2.,  3.,  4.],
       [ 1.,  2.,  3.,  4.]])

In [65]:
#Application of broadcasting - centering an array
X = np.random.random((10,3))
# per-column means, shape (3,)
Xmean = X.mean(axis = 0)
Xmean
# Xmean is broadcast across the 10 rows, subtracting each column's mean from that column
X_centered = X-Xmean
X_centered

array([ 0.6888451 ,  0.42462478,  0.60378153])

array([[ 0.30220871, -0.14324592, -0.58612966],
       [-0.0328593 ,  0.19302938, -0.4240501 ],
       [ 0.15865028,  0.04357788,  0.37080955],
       [-0.08581491, -0.11252546,  0.02627851],
       [ 0.25108006, -0.05571031,  0.26354846],
       [-0.23775539,  0.25351349,  0.2185732 ],
       [-0.47146868, -0.21004024,  0.15309102],
       [ 0.2031229 ,  0.49359209,  0.25567847],
       [-0.18165762, -0.31801883,  0.03716178],
       [ 0.09449395, -0.14417208, -0.31496123]])