### CIS 9 ###
### Numpy, Basic Statistics -- Solution ###

`Optional Reading`: 
<br>- Python Data Science Handbook: Ch 2 except Fancy Indexing, Structured Arrays sections
<br>- Think Stats: Ch 1; Ch 2 up to and including Variance; Ch 3: Percentiles

To use numpy, we first need to import the `numpy` module:

In [1]:
import numpy as np

**Initializing an array, size of an array**

In [2]:
# 1. Build a 1D array from a plain Python list
oneD = np.array([1, 2, 3, 4])
print(oneD)

# Size of the array, obtained two ways:
# the first entry of the shape tuple, then the built-in len()
print(oneD.shape[0], len(oneD), sep='\n')

[1 2 3 4]
4
4


In [3]:
# 2. Build a 2D array from a list of lists (one inner list per row)
twoD = np.array([
    [1, 2, 3],
    [4, 5.1, 6],
    [7, 8, 9],
    [10, 11, 12],
])

# The single float (5.1) makes numpy store every number as a float.
# The size of a 2D array is its shape: (number of rows, number of columns)
print(twoD, twoD.shape, sep='\n')

# Arrays with 2 or more dimensions must be rectangular:
# in a 2D array every row has the same number of elements and
# every column has the same number of elements

[[ 1.   2.   3. ]
 [ 4.   5.1  6. ]
 [ 7.   8.   9. ]
 [10.  11.  12. ]]
(4, 3)


In [4]:
# 3. array of int zeros
zeros = np.zeros(8, dtype=int)
print(zeros)
print(type(zeros[0]))

# what data type does the array store?
#  we give numpy the Python type int, which numpy maps to its default integer
#  type (np.int_). That type is platform dependent: np.int64 on most systems,
#  but np.int32 on Windows with NumPy older than 2.0 (the saved output of this
#  cell shows np.int32)
# how to have the array store a specific numpy int?
#  put the numpy type directly: dtype=np.int32   or  np.int8, np.int16, etc.

[0 0 0 0 0 0 0 0]
<class 'numpy.int32'>


In [5]:
# 4. array of zeros, no dtype given
floatZeros = np.zeros(shape=(2, 2))
print(floatZeros)

# numpy's default data type is float64.
# The whole array and a single row are both ndarrays,
# while one individual element is a numpy.float64
print(type(floatZeros), type(floatZeros[0]), type(floatZeros[0, 0]), sep='\n')

[[0. 0.]
 [0. 0.]]
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.float64'>


In [6]:
# 5. 1D array holding three ones (float by default)
ones = np.ones(shape=3)
print(ones)

[1. 1. 1.]


In [7]:
# 6. array where every element has the same value
# the first argument, (2,3), is the shape of the array: 2 rows x 3 cols
# the second argument is the value stored in every position
filled = np.full(shape=(2, 3), fill_value=-2)
print(filled)

[[-2 -2 -2]
 [-2 -2 -2]]


In [8]:
# 7. array of integers, in a range
# np.arange takes (start, stop, step) like Python's range(),
# but the result is a numpy array
countingNums = np.arange(1, 11)
every2 = np.arange(10, 0, -2)
print(countingNums, every2, sep='\n')

# are the upper and lower limits of np.arange() the same
# as Python's range()?   yes: start is included, stop is excluded

[ 1  2  3  4  5  6  7  8  9 10]
[10  8  6  4  2]


In [9]:
# 8. 3 rows x 2 cols array of random floats, each one >= 0 and < 1
randNums = np.random.random(size=(3, 2))
print(randNums)

[[0.16557489 0.22780385]
 [0.22250265 0.89246046]
 [0.97147116 0.39392909]]


In [10]:
# 9. array of 80 random integers from 10 to 19 (the upper bound 20 is excluded)
intRandNums = np.random.randint(low=10, high=20, size=80)
print(intRandNums)
print(intRandNums.shape, end=' \n\n')

# a 3 rows x 4 cols array of random integers from 10 to 19:
# pass the shape as the size, then print the array and its shape
intRandNums2 = np.random.randint(low=10, high=20, size=(3, 4))
print(intRandNums2, intRandNums2.shape, sep='\n')

[19 13 18 13 18 16 16 12 12 17 13 11 10 18 12 15 18 18 10 16 10 12 17 19
 10 15 15 10 12 19 15 12 19 13 15 13 12 12 12 17 14 14 10 18 18 16 19 15
 13 16 13 15 16 19 19 10 12 12 18 12 11 14 15 17 12 18 19 18 12 12 11 17
 14 17 13 14 16 13 18 16]
(80,) 

[[10 14 19 13]
 [12 10 11 17]
 [17 11 14 12]]
(3, 4)


In [11]:
# 10. array from csv file
import csv
import random


def write_random_rows(path, n_rows=3, n_cols=4, extra_rows=()):
    """Create a CSV file of random integers.

    Writes n_rows rows, each with n_cols random integers from 1 to 11
    (both ends included), to the file at path, replacing any existing file.
    Each row in extra_rows is then appended unchanged.
    """
    with open(path, "w", newline='') as f:
        writer = csv.writer(f)
        for _ in range(n_rows):
            writer.writerow([random.randint(1, 11) for _ in range(n_cols)])
        writer.writerows(extra_rows)


write_random_rows("sample.csv")

# default dtype: the values are read in as floats
data = np.loadtxt("sample.csv", delimiter=",")
print(data.shape)
print(data, '\n')

# same file, but stored as small integers
data = np.loadtxt("sample.csv", delimiter=",", dtype=np.int8)
print(data.shape)
print(data, '\n')

with open("sample.txt", "w") as f:
    f.write("one two three")
data = np.genfromtxt("sample.txt", dtype=str)   # space delimiter
print(data)
print(type(data))
print(type(data[0]), '\n')

with open("sample.txt", "w") as f:
    f.write("1,2,3")

# read sample.txt into a numpy array with 3 integer elements and print the array?
data = np.genfromtxt("sample.txt", dtype=int, delimiter=',')
print(data)

data = np.loadtxt("sample.txt", dtype=int, delimiter=",")
print(data.shape)
print(data, '\n')

# np.loadtxt works with a completely filled array, with no missing data
# np.genfromtxt has options to fill in any missing data with a default value

# same random rows as before, plus a last row whose final 2 fields are empty
write_random_rows("sample.csv", extra_rows=[[1, 2, '', '']])

# np.loadtxt("sample.csv", delimiter=',') would fail here because of the
# empty fields; genfromtxt replaces each of them with filling_values
data = np.genfromtxt("sample.csv", delimiter=',', filling_values=0)
print(data)

(3, 4)
[[10.  8.  4.  2.]
 [ 9.  1.  8.  3.]
 [ 9.  2.  9.  7.]] 

(3, 4)
[[10  8  4  2]
 [ 9  1  8  3]
 [ 9  2  9  7]] 

['one' 'two' 'three']
<class 'numpy.ndarray'>
<class 'numpy.str_'> 

[1 2 3]
(3,)
[1 2 3] 

[[ 7.  2.  1.  5.]
 [ 6.  3. 11.  2.]
 [ 8.  7. 10.  8.]
 [ 1.  2.  0.  0.]]


**Array indexing**

In [12]:
# 11. numeric indexing
arr = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12]])
print(arr, end=' \n\n')
print(arr[2], end=' \n\n')        # one index selects a whole row
print(arr[2, 3], end=' \n\n')     # two indices select one element: [row,col]
print(arr[-1], end=' \n\n')       # negative indices count from the end
print(arr[-2, -2])

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]] 

[ 9 10 11 12] 

12 

[ 9 10 11 12] 

7


In [13]:
# 12. slice indexing
arr = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12]])
print(arr, end=' \n\n')
print(arr[:2], end=' \n\n')          # first 2 rows, all columns
print(arr[1:, 1:3], end=' \n\n')     # rows 1 onward, columns 1 and 2

# integer indexing can be mixed with slice indexing;
# each integer index removes one dimension from the result
print(arr[-1, :3], end=' \n\n')      # 1D, lower rank than original arr
print(arr[:-2, 1:-1])                # 2D, same rank as arr

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]] 

[[1 2 3 4]
 [5 6 7 8]] 

[[ 6  7]
 [10 11]] 

[ 9 10 11] 

[[2 3]]


In [14]:
# 13. Each time we create a slice of an array we get a view into the same array.
# We're not creating a new array, so modifying the view will modify the original array.
arr = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
print(arr)
view = arr[:2,1:3]
print(view)
view[0,0] = 100
print(view)
print(arr, '\n')      # arr[0,1] is now 100 too

# to actually make a copy, use the copy() method:
copy = arr[:2,1:3].copy()
copy[0,0] = -1
print(copy)
print(arr)            # arr is not affected by the change to copy

# copying takes up memory, so:
#    a) use a view if data is for analysis only (no modification needed)
#    b) use a copy only if data need to be changed but the original array
#       must remain the same

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
[[2 3]
 [6 7]]
[[100   3]
 [  6   7]]
[[  1 100   3   4]
 [  5   6   7   8]
 [  9  10  11  12]] 

[[-1  3]
 [ 6  7]]
[[  1 100   3   4]
 [  5   6   7   8]
 [  9  10  11  12]]


In [15]:
# 14. index with a sequence of positions
arr = np.array([1, 12, 3, 4, 8, 5])
print(arr)

# the result holds the elements at the listed positions, in the listed order
index = [0, 2, 1]
print(arr[[0, 2, 5]], arr[index], sep='\n')

[ 1 12  3  4  8  5]
[1 3 5]
[ 1  3 12]


In [16]:
# 15. boolean indexing
arr = np.array([[1, 12, 3, 4],
                [15, 6, 7, 10],
                [2, 20, 8, 1]])
print(arr, end=' \n\n')
print(arr[arr < 10], end=' \n\n')         # values where the condition is True
print(arr < 10, end=' \n\n')              # the boolean array itself
print(arr[arr % 2 == 1], end=' \n\n')

# describe what the last print statement will print for any general array?
# all elements that are odd numbers, as a 1D array

[[ 1 12  3  4]
 [15  6  7 10]
 [ 2 20  8  1]] 

[1 3 4 6 7 2 8 1] 

[[ True False  True  True]
 [False  True  True False]
 [ True False  True  True]] 

[ 1  3 15  7  1] 



**Changing array shape**

In [17]:
# 16. change the shape of an array; this works as long as the new shape
# has the same number of elements
arr = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12]])    # 12 elements
newArr1 = arr.reshape(6, 2)
newArr2 = arr.reshape(12)
print(arr, end=' \n\n')
print(newArr1, end=' \n\n')
print(newArr2)

# will the following work? why or why not?
# newArr3 = arr.reshape((1,))     No, arr of 1 element can't fit 12 values
# newArr4 = arr.reshape((2,5))    No, arr of 10 elements can't fit 12 values
# newArr5 = arr.reshape((3,5))    No, arr of 15 elements is too large

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]] 

[[ 1  2]
 [ 3  4]
 [ 5  6]
 [ 7  8]
 [ 9 10]
 [11 12]] 

[ 1  2  3  4  5  6  7  8  9 10 11 12]


In [18]:
# checking to see if the reshaped arrays are still views of the original array:
# change one value through arr, then look at all three arrays
arr[0, 0] = 100
print(
    arr,
    newArr1,   # yes
    newArr2,   # yes
    sep='\n',
)

[[100   2   3   4]
 [  5   6   7   8]
 [  9  10  11  12]]
[[100   2]
 [  3   4]
 [  5   6]
 [  7   8]
 [  9  10]
 [ 11  12]]
[100   2   3   4   5   6   7   8   9  10  11  12]


In [19]:
# 17. transpose a 2D array (matrix): rows become columns
arr = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12]])
print(arr, arr.T, sep='\n', end=' \n\n')

# a 1 row x 3 cols array becomes 3 rows x 1 col
arr = np.array([[1, 2, 3]])
print(arr, arr.T, sep='\n', end=' \n\n')

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
[[ 1  5  9]
 [ 2  6 10]
 [ 3  7 11]
 [ 4  8 12]] 

[[1 2 3]]
[[1]
 [2]
 [3]] 



In [20]:
# checking to see if the transposed array is still a view of the original array
arr = np.array([[1, 2, 3]])
arrT = arr.T
print(arr, arrT, sep='\n', end='\n\n')

# change the last element through arr, then look at both arrays
arr[-1, -1] = 100
print(arr, arrT, sep='\n')      # yes, arrT shows the change

[[1 2 3]]
[[1]
 [2]
 [3]]

[[  1   2 100]]
[[  1]
 [  2]
 [100]]


In [21]:
# 18. reverse a 1D array with a step of -1
# printed: the array, its reverse, then the array again (unchanged)
arr = np.array([1, 2, 3, 4, 5])
print(arr, arr[::-1], arr, sep='\n')

[1 2 3 4 5]
[5 4 3 2 1]
[1 2 3 4 5]


In [22]:
# keep the reversed array under its own name
arr1 = arr[::-1]
print(arr1)

[5 4 3 2 1]


In [23]:
# what goes on here?
# writing to the first element of arr1 also changes arr:
# the reversed array is still a view of the original array
arr1[0] = 100
print(arr, arr1, sep='\n')

[  1   2   3   4 100]
[100   4   3   2   1]


**Array math**
<br>Take advantage of numpy's ufuncs below and do not loop.<br>
Do the optional reading so you can see the speed difference between ufuncs and loops.

In [24]:
# 19. basic math works element by element on arrays; each operation exists
# both as an operator and as a function in the numpy module

x = np.array([[1, 2],
              [3, 4]], dtype=float)
y = np.array([[5, 6],
              [7, 8]], dtype=float)

print(x)
print(y, end=' \n\n')
print(x + y)

[[1. 2.]
 [3. 4.]]
[[5. 6.]
 [7. 8.]] 

[[ 6.  8.]
 [10. 12.]]


In [25]:
# 20. other arithmetic operations between 2 arrays, all element by element
print(
    x - y,     # subtraction
    x * y,     # multiplication
    x / y,     # division
    x // y,    # floor division
    x % y,     # remainder
    sep='\n',
)

[[-4. -4.]
 [-4. -4.]]
[[ 5. 12.]
 [21. 32.]]
[[0.2        0.33333333]
 [0.42857143 0.5       ]]
[[0. 0.]
 [0. 0.]]
[[1. 2.]
 [3. 4.]]


In [26]:
# 21. arithmetic operations between an array and a scalar:
# the scalar is applied to every element
x = np.array([[1, 2],
              [3, 4]], dtype=int)
print(
    x - 1,
    x * 2,
    x / 3,     # division always gives floats
    x // 4,
    x % 3,
    x ** 2,
    sep='\n',
)

[[0 1]
 [2 3]]
[[2 4]
 [6 8]]
[[0.33333333 0.66666667]
 [1.         1.33333333]]
[[0 0]
 [0 1]]
[[1 2]
 [0 1]]
[[ 1  4]
 [ 9 16]]


In [27]:
# 22. square root of every element
print(np.sqrt(x), end=' \n\n')

# absolute value of every element (a plain list is accepted too)
print(np.abs([-1, 2]), end=' \n\n')

[[1.         1.41421356]
 [1.73205081 2.        ]] 

[1 2] 



**Aggregate functions:**

In [28]:
# 23. Math
# sum
arr = np.array([[1, 2],
                [3, 4]])
print(np.sum(arr))
print(np.sum(arr, axis=0))                  # by column
print(np.sum(arr, axis=1), end=' \n\n')     # by row

# min
print(arr.min(), arr.min(axis=0), arr.min(axis=1), sep='\n', end=' \n\n')

# describe what the 3 values above are?
# min of all values in array
# min of each column
# min of each row

# max: same 3 forms as min
print(arr.max(), arr.max(axis=0), arr.max(axis=1), sep='\n', end=' \n\n')

10
[4 6]
[3 7] 

1
[1 2]
[1 3] 

4
[3 4]
[2 4] 



In [29]:
# 24. Statistics
arr = np.array([[1, 4, -3, 2],
                [7, -1, 3, 8]])

# mean: central tendency
# (all values, then each column, then each row)
print(arr.mean(), arr.mean(axis=0), arr.mean(axis=1), sep='\n', end=' \n\n')

# variance: spread
# standard deviation: spread from the mean
print(arr.std(), arr.std(axis=0), arr.std(axis=1), sep='\n', end=' \n\n')

# median: mid-point
print(np.median(arr))
print(np.median(arr, axis=0))
print(np.median(arr, axis=1), end=' \n\n')

# percentile rank: percentage of values that are less than or equal to a given value
# percentile: value with a given percentile rank
# the 75th, 50th and 25th percentiles are the quartiles
print(*np.percentile(arr, [75, 50, 25]), sep='\n')

2.625
[4.  1.5 0.  5. ]
[1.   4.25] 

3.4977671449083054
[3.  2.5 3.  3. ]
[2.54950976 3.56195171] 

2.5
[4.  1.5 0.  5. ]
[1.5 5. ] 

4.75
2.5
0.5


**Broadcasting**

In [30]:
# 25. broadcasting: when 2 arrays of different sizes are used in a computation,
# the smaller one is extended to match, as long as the dimensions can be matched

arr = np.array([[1, 2, 3],
                [4, 5, 6]])
# here the 2 is broadcasted to [[2,2,2], [2,2,2]] so it can be added to arr
print(arr + 2)

[[3 4 5]
 [6 7 8]]


**Boolean operations**

In [31]:
# 26. checking data in an array
arr = np.array([[1, 2, -2],
                [-3, 1, 0]])
print(arr)

print(arr < 0)

# review: boolean indexing
print(arr[arr < 0], end=' \n\n')

print((arr < 0).sum())
print((arr < 0).sum(axis=0), end=' \n\n')

# describe the output of the last 2 print statements above?
# number of True values in boolean array, which means number of values < 0 in array
# number of True values in each column of the boolean array,
# which is the number of values < 0 in each column

print((arr < 0).any())                 # is at least one value < 0?
print((arr < 0).all(), end=' \n\n')    # are all values < 0?
print((arr < 0).all(axis=1))           # are all values < 0, checked for each row

[[ 1  2 -2]
 [-3  1  0]]
[[False False  True]
 [ True False False]]
[-2 -3] 

2
[1 0 1] 

True
False 

[False False]


In [None]:
# check to see if using boolean indexing will result in a view:
# select with a boolean condition, change arr, then compare
arr1 = arr[arr < 1]
print(arr, arr1, sep='\n', end='\n\n')

arr[0, 0] = 100
print(arr, arr1, sep='\n')      # no, arr1 does not show the change

**Sorting**

In [32]:
# 27. sort values in the array (np.sort returns a sorted copy)
arr = np.array([5, -2, 0, 2, -1, -2, 4])
print(np.sort(arr), end=' \n\n')

arr = np.array([[2, 0, -1],
                [1, 8, 3],
                [7, 1, 0]])
print(np.sort(arr), end=' \n\n')            # no axis given
print(np.sort(arr, axis=0), end=' \n\n')    # each column is sorted
print(np.sort(arr, axis=1), end=' \n\n')    # each row is sorted

# which axis is the default when no axis is specified?
# the last axis (axis=-1); for a 2D array that is axis = 1 (or across the rows)

[-2 -2 -1  0  2  4  5] 

[[-1  0  2]
 [ 1  3  8]
 [ 0  1  7]] 

[[ 1  0 -1]
 [ 2  1  0]
 [ 7  8  3]] 

[[-1  0  2]
 [ 1  3  8]
 [ 0  1  7]] 



**Get index values**

In [33]:
# 28. get the index of the sorted values

arr = np.array([5, -2, 0, 2, -1, -2, 4])
ind = np.argsort(arr)
print(ind)
print(arr[ind], end=' \n\n')     # indexing arr with ind gives the sorted values


arr = np.array([[2, 0, -1],
                [1, 8, 3],
                [7, 1, 0]])
print("original arr:")
print(arr, end=' \n\n')

# for a 2D array, each row of ind holds the column indices that sort that row
ind = np.argsort(arr)
print("argsort's ind:")
print(ind, end=' \n\n')

print("sorted array:")
print(np.sort(arr), end=' \n\n')


[1 5 4 2 3 6 0]
[-2 -2 -1  0  2  4  5] 

original arr:
[[ 2  0 -1]
 [ 1  8  3]
 [ 7  1  0]] 

argsort's ind:
[[2 1 0]
 [0 2 1]
 [2 1 0]] 

sorted array:
[[-1  0  2]
 [ 1  3  8]
 [ 0  1  7]] 



In [34]:
# print the smallest value of each row by using arr and ind?
# column 0 of ind is the column position of the smallest value in each row
print(*(arr[row, ind[row, 0]] for row in range(3)))
#       arr[0,2]   arr[1,0]   arr[2,2]

# a more general solution for any size array:
# pair every row number with that row's first entry in ind
print(arr[np.arange(arr.shape[0]), ind[:, 0]])
#               [0,1,2]             [2,0,2]

-1 1 0
[-1  1  0]


In [35]:
# 29. get the indices that match a boolean condition
arr = np.array([5, -2, 0, 2, -1, -2, 4, -3, 1])

# with only a condition, np.where returns a tuple of index arrays,
# one index array per dimension of arr
ind = np.where(arr > 0)
print(ind)

# print the positive values in arr by using ind?
print("positive values:", arr[ind])

(array([0, 3, 6, 8], dtype=int64),)
positive values: [5 2 4 1]
