# Machine Learning Tools

## NumPy

### 1. Data Types and Attributes

* NumPy's core data type is the __'ndarray'__, which stands for __n-dimensional array__.
* It is used in place of a Python list for numerical work because it is faster than a Python list and has many useful features needed for machine learning.

In [1]:
# Let's import everything, just in case
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [2]:
a1 = np.array([1,2,3.2])
a1

array([1. , 2. , 3.2])

In [3]:
# Lets check the type of the a1 array
type(a1)

numpy.ndarray

In [4]:
a2 = np.array([[1,2,3],[4,5,6]])
a2

array([[1, 2, 3],
       [4, 5, 6]])

In [5]:
a3 = np.array([[[1,2,3],
                [4,5,6],
                [7,8,9]],
               [[10,11,12],
                [13,14,15],
                [16,17,18]]])
a3

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9]],

       [[10, 11, 12],
        [13, 14, 15],
        [16, 17, 18]]])

In [6]:
# The following are some of the attributes of an ndarray
a1.shape, a2.shape , a3.shape

((3,), (2, 3), (2, 3, 3))

![Anatomy of ndarray](./images/anatomy_of_ndarray.png "Anatomy of ndarrays")

In [78]:
# To see which type of data an ndarray contains, we can access the .dtype attribute
a1.dtype, a2.dtype, a3.dtype 

(dtype('float64'), dtype('int32'), dtype('int32'))

In [79]:
# The .size attribute gives the total number of elements contained in the ndarray
a1.size, a2.size, a3.size

(3, 6, 18)

In [80]:
# The .ndim attribute gives the number of dimensions of the array
a1.ndim, a2.ndim, a3.ndim

(1, 2, 3)

### 2. Creating arrays

In [81]:
# Normal Array
arry = np.array([1,2,3])
arry

array([1, 2, 3])

In [82]:
# Create arrays that are already filled with values

# Array filled with ones
ones =np.ones((2,3))
ones

array([[1., 1., 1.],
       [1., 1., 1.]])

In [83]:
ones.shape # it has the shape we gave it

(2, 3)

In [84]:
# array filled with zeroes
zeros = np.zeros((2,3))
zeros 

array([[0., 0., 0.],
       [0., 0., 0.]])

In [85]:
zeros.shape

(2, 3)

In [86]:
# get a range array
arry2 = np.arange(0,20,5)
arry2

array([ 0,  5, 10, 15])

In [87]:
# Filling arrays with random numbers

# method 1

ran1 =np.random.randint(1,10,size=20)
ran1

array([2, 6, 8, 1, 6, 4, 8, 5, 7, 1, 4, 3, 7, 6, 3, 9, 7, 8, 4, 6])

In [88]:
ran1.size

20

In [89]:
# method 2
ran2 =np.random.rand(3,3)
ran2

array([[0.96990863, 0.00890753, 0.91106163],
       [0.73807665, 0.60559044, 0.51655363],
       [0.76869246, 0.41940279, 0.70827176]])

In [90]:
ran2.shape

(3, 3)

In [91]:
# method 3
ran3 = np.random.random((3,5))
ran3

array([[0.40238618, 0.35666473, 0.13534987, 0.80562204, 0.22055787],
       [0.90425282, 0.10567676, 0.54492841, 0.31494888, 0.94255656],
       [0.46425659, 0.81058633, 0.82377743, 0.7139506 , 0.45049476]])

In [92]:
# These random numbers are pseudo-random numbers.
# They change on every run, so if we share our notebook with someone, they won't get the same results unless we fix the seed.

np.random.seed(4)
ran4 =np.random.randint(1,10,size=(4,5))
ran4

# No matter how many times I run this cell, it gives the same random numbers; they only change if the seed changes

array([[8, 6, 2, 9, 8],
       [9, 3, 8, 8, 8],
       [9, 5, 3, 7, 5],
       [4, 1, 8, 6, 6]])

### 3. Viewing Arrays

In [93]:
a1

array([1. , 2. , 3.2])

In [94]:
a2

array([[1, 2, 3],
       [4, 5, 6]])

In [95]:
a3

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9]],

       [[10, 11, 12],
        [13, 14, 15],
        [16, 17, 18]]])

In [96]:
# We can use normal array notation to access elements as follows
a1[1]

2.0

In [97]:
a2[0]

array([1, 2, 3])

In [98]:
a2[0][2]

3

In [99]:
# we can use slicing as usual
# It gets tricky when dimensions are higher
rand_arr1 = np.random.randint(0,20,15)
rand_arr1

array([ 9,  6,  6,  2, 14,  8, 17,  2,  0,  8, 19, 10, 13,  1,  0])

In [100]:
rand_arr1.shape

(15,)

In [101]:
rand_arr1[2:10:2]

array([ 6, 14, 17,  0])

In [102]:
rand_arr1 = np.random.randint(0,20,(5,5))
rand_arr1

array([[13,  3, 18, 19, 15],
       [ 2, 18, 17, 16, 17],
       [ 9, 16, 12, 18,  3],
       [11, 18,  6,  7, 13],
       [10,  7,  9, 18, 19]])

In [103]:
rand_arr1.shape

(5, 5)

In [104]:
# Then we can do slicing in each dimension
rand_arr1[:3]

array([[13,  3, 18, 19, 15],
       [ 2, 18, 17, 16, 17],
       [ 9, 16, 12, 18,  3]])

In [105]:
rand_arr1[:3:2]

array([[13,  3, 18, 19, 15],
       [ 9, 16, 12, 18,  3]])

In [106]:
rand_arr1[:3:2,:2]

array([[13,  3],
       [ 9, 16]])

In [107]:
rand_arr1[:3:2,::2]

array([[13, 18, 15],
       [ 9, 12,  3]])

In [108]:
# we can do the same for any dimension array just need to practice

### 4. Manipulating and Comparing arrays

#### Arithmetic Operations

In [109]:
# Let's try arithmetic operations
a1

array([1. , 2. , 3.2])

In [110]:
ones =np.ones(3)
ones

array([1., 1., 1.])

In [111]:
# + Operation
a1+ones

array([2. , 3. , 4.2])

In [112]:
a1

array([1. , 2. , 3.2])

In [113]:
ones

array([1., 1., 1.])

In [114]:
# - Operation
a1-ones

array([0. , 1. , 2.2])

In [115]:
a2

array([[1, 2, 3],
       [4, 5, 6]])

In [116]:
ones

array([1., 1., 1.])

In [117]:
a2-ones

array([[0., 1., 2.],
       [3., 4., 5.]])

In [118]:
# Interesting: in ordinary matrix arithmetic, matrices must have the same shape to be added or subtracted, yet this worked (NumPy calls this broadcasting)

In [119]:
a1

array([1. , 2. , 3.2])

In [120]:
a2

array([[1, 2, 3],
       [4, 5, 6]])

In [121]:
a1+a2

array([[2. , 4. , 6.2],
       [5. , 7. , 9.2]])

In [122]:
a1

array([1. , 2. , 3.2])

In [123]:
a2

array([[1, 2, 3],
       [4, 5, 6]])

In [124]:
a2-a1

array([[ 0. ,  0. , -0.2],
       [ 3. ,  3. ,  2.8]])

In [125]:
a1

array([1. , 2. , 3.2])

In [126]:
a2

array([[1, 2, 3],
       [4, 5, 6]])

In [127]:
a1-a2

array([[ 0. ,  0. ,  0.2],
       [-3. , -3. , -2.8]])

In [128]:
a1.shape, a2.shape

((3,), (2, 3))

In [129]:
# * operator
a1*a2

array([[ 1. ,  4. ,  9.6],
       [ 4. , 10. , 19.2]])

In [130]:
# / operator
a1/a2

array([[1.        , 1.        , 1.06666667],
       [0.25      , 0.4       , 0.53333333]])

In [131]:
a1 

array([1. , 2. , 3.2])

In [132]:
a2

array([[1, 2, 3],
       [4, 5, 6]])

In [133]:
# % operator
a1%a2

array([[0. , 0. , 0.2],
       [1. , 2. , 3.2]])

In [134]:
a2

array([[1, 2, 3],
       [4, 5, 6]])

In [135]:
# .sum() method
a2.sum()

21

In [136]:
# .mean() method
a2.mean()

3.5

In [137]:
# .max() & .min() methods
a2.max(), a2.min()

(6, 1)

In [138]:
a2

array([[1, 2, 3],
       [4, 5, 6]])

In [139]:
a3

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9]],

       [[10, 11, 12],
        [13, 14, 15],
        [16, 17, 18]]])

In [140]:
a2+a3

ValueError: operands could not be broadcast together with shapes (2,3) (2,3,3) 

In [141]:
# Hmm why is this
# lets check the shape
a2.shape, a3.shape

((2, 3), (2, 3, 3))

In [142]:
# We have to reshape a2 into a shape that is compatible with a3

* "NumPy operations are usually done on pairs of arrays on an __element-by-element basis__. In the simplest case, the two arrays __must have exactly the same shape__." ~NumPy Documentation

* When operating on two arrays, NumPy compares their shapes element-wise. It starts with the trailing (i.e. rightmost) dimension and works its way left. Two dimensions are compatible when
    1. they are equal, or
    2. one of them is 1


* If these conditions are not met, a `ValueError: operands could not be broadcast together` exception is thrown, indicating that the arrays have incompatible shapes

~NumPy Documentation

In [143]:
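# a2 has shape (2,3); reshaping it to (2,3,1) makes it broadcast-compatible with a3's shape (2,3,3).
# (It cannot be reshaped to (2,3,3) directly, because a2 has only 6 elements, not 18.)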
a2_reshaped1 = a2.reshape((2,3,1))

In [144]:
a2_reshaped1

array([[[1],
        [2],
        [3]],

       [[4],
        [5],
        [6]]])

In [145]:
a3

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9]],

       [[10, 11, 12],
        [13, 14, 15],
        [16, 17, 18]]])

In [146]:
a2_reshaped1 * a3

array([[[  1,   2,   3],
        [  8,  10,  12],
        [ 21,  24,  27]],

       [[ 40,  44,  48],
        [ 65,  70,  75],
        [ 96, 102, 108]]])

#### Aggregation

Aggregation = performing the same operation over many elements at once to produce a summary value (e.g. sum, mean, max, min)

* __Rule of thumb :__ Always use Python's built-in functions (`sum()`) on Python data structures and NumPy's functions (`np.sum()`) on NumPy data structures.

In [147]:
# lets see this in action

test_arry = np.random.random(100000)
test_arry[:10]

array([0.35876637, 0.08316909, 0.25290961, 0.0474055 , 0.92940665,
       0.23614132, 0.81090494, 0.12618166, 0.07570637, 0.68558821])

In [148]:
%timeit sum(test_arry) # Python's sum Method
%timeit np.sum(test_arry) # NumPy's sum Method

8.38 ms ± 523 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
44.3 µs ± 765 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


#### Reshaping and Transpose

In [149]:
# We discussed Reshape in above section

In [150]:
# Transpose
a2

array([[1, 2, 3],
       [4, 5, 6]])

In [151]:
a2.shape

(2, 3)

In [152]:
a2_trans= a2.transpose()
a2_trans

array([[1, 4],
       [2, 5],
       [3, 6]])

In [153]:
a2_trans.shape

(3, 2)

In [154]:
a3

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9]],

       [[10, 11, 12],
        [13, 14, 15],
        [16, 17, 18]]])

In [155]:
a3.shape

(2, 3, 3)

In [156]:
a3_trans = a3.transpose()
a3_trans

array([[[ 1, 10],
        [ 4, 13],
        [ 7, 16]],

       [[ 2, 11],
        [ 5, 14],
        [ 8, 17]],

       [[ 3, 12],
        [ 6, 15],
        [ 9, 18]]])

In [157]:
a3_trans.shape

(3, 3, 2)

In [158]:
# Dot Product VS Element wise Product

np.random.seed(0) # So that the random values won't change between runs
mat1 = np.random.randint(0,20,(5,3))
mat2 = np.random.randint(0,20,(5,3))

In [159]:
mat1

array([[12, 15,  0],
       [ 3,  3,  7],
       [ 9, 19, 18],
       [ 4,  6, 12],
       [ 1,  6,  7]])

In [160]:
mat2

array([[14, 17,  5],
       [13,  8,  9],
       [19, 16, 19],
       [ 5, 15, 15],
       [ 0, 18,  3]])

In [161]:
np.dot(mat1,mat2)

ValueError: shapes (5,3) and (5,3) not aligned: 3 (dim 1) != 5 (dim 0)

In [164]:
# The inner dimensions of the two shapes must match: (5,3) dot (3,5) works, so we transpose mat2
mat3 = np.dot(mat1,mat2.T)
mat3

array([[423, 276, 468, 285, 270],
       [128, 126, 238, 165,  75],
       [539, 431, 817, 600, 396],
       [218, 208, 400, 290, 144],
       [151, 124, 248, 200, 129]])

In [165]:
mat3.shape

(5, 5)

In [166]:
mat1

array([[12, 15,  0],
       [ 3,  3,  7],
       [ 9, 19, 18],
       [ 4,  6, 12],
       [ 1,  6,  7]])

In [167]:
mat2

array([[14, 17,  5],
       [13,  8,  9],
       [19, 16, 19],
       [ 5, 15, 15],
       [ 0, 18,  3]])

In [168]:
# Element wise
np.multiply(mat1,mat2)

array([[168, 255,   0],
       [ 39,  24,  63],
       [171, 304, 342],
       [ 20,  90, 180],
       [  0, 108,  21]])

In [169]:
# Nutty Butter store example
np.random.seed(0)
daily_sales_arry =np.random.randint(0,20,(5,3))
daily_sales_arry

array([[12, 15,  0],
       [ 3,  3,  7],
       [ 9, 19, 18],
       [ 4,  6, 12],
       [ 1,  6,  7]])

In [170]:
# Lets create a dataframe using this

daily_sales_df = pd.DataFrame(daily_sales_arry,index=['Mon','Tues','Wen','Thur','Fri'],columns=['Almond Butter', 'Peanut Butter','Cashew Butter'])
daily_sales_df

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter
Mon,12,15,0
Tues,3,3,7
Wen,9,19,18
Thur,4,6,12
Fri,1,6,7


In [171]:
# shape of the data frame
daily_sales_df.shape

(5, 3)

In [172]:
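# Let's create a price array (note: np.ndarray is the low-level constructor and treats the list as a shape, which leads to the error below)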
price_arry = np.ndarray([22,15,18])

# Creating a price dataframe
price_df = pd.DataFrame(price_arry,index=['price'], columns=['Almond Butter', 'Peanut Butter','Cashew Butter'])
price_df

ValueError: Must pass 2-d input. shape=(22, 15, 18)

In [173]:
# Let's create the price array with np.array this time
price_arry = np.array([22,15,18])
price_arry.shape

# Since the shape is the 1-D (3,), we have to reshape it into (1,3) before building the dataframe

(3,)

In [174]:
# Creating a price dataframe
price_df = pd.DataFrame(price_arry.reshape((1,3)),index=['price'], columns=['Almond Butter', 'Peanut Butter','Cashew Butter'])
price_df

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter
price,22,15,18


In [175]:
# Let's try adding the totals as a Series (this first attempt does not work as expected)
price_arry

array([22, 15, 18])

In [176]:
daily_sales_arry

array([[12, 15,  0],
       [ 3,  3,  7],
       [ 9, 19, 18],
       [ 4,  6, 12],
       [ 1,  6,  7]])

In [177]:
total_sales_arr = daily_sales_arry.dot(price_arry.T)
total_sales_arr, type(total_sales_arr)

(array([489, 237, 807, 394, 238]), numpy.ndarray)

In [178]:
total_sales_series = pd.Series(total_sales_arr)
total_sales_series

0    489
1    237
2    807
3    394
4    238
dtype: int32

In [179]:
daily_sales_df

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter
Mon,12,15,0
Tues,3,3,7
Wen,9,19,18
Thur,4,6,12
Fri,1,6,7


In [180]:
daily_sales_df['new']=total_sales_series
daily_sales_df

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter,new
Mon,12,15,0,
Tues,3,3,7,
Wen,9,19,18,
Thur,4,6,12,
Fri,1,6,7,


In [181]:
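# No error is raised, but the new column is empty (NaN): the Series has a default 0-4 index that doesn't match the dataframe's day labels.
# Let's rebuild the Series with the same index as the dataframe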
total_sales_series_indexed = pd.Series(total_sales_arr,index=['Mon','Tues','Wen','Thur','Fri'])
total_sales_series_indexed

Mon     489
Tues    237
Wen     807
Thur    394
Fri     238
dtype: int32

In [182]:
daily_sales_df.drop('new',inplace=True, axis=1)
daily_sales_df

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter
Mon,12,15,0
Tues,3,3,7
Wen,9,19,18
Thur,4,6,12
Fri,1,6,7


In [183]:
# Lets try adding the new series
daily_sales_df['new']=total_sales_series_indexed
daily_sales_df

# Boom it works, we just have to have same indexes

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter,new
Mon,12,15,0,489
Tues,3,3,7,237
Wen,9,19,18,807
Thur,4,6,12,394
Fri,1,6,7,238


In [184]:
daily_sales_arry.shape

(5, 3)

In [185]:
price_arry.shape

(3,)

In [186]:
total_sales = daily_sales_arry.dot(price_arry.T)
total_sales, type(total_sales)

(array([489, 237, 807, 394, 238]), numpy.ndarray)

In [187]:
daily_sales_df['Total Sales ($)'] = daily_sales_arry.dot(price_arry.T)
daily_sales_df

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter,new,Total Sales ($)
Mon,12,15,0,489,489
Tues,3,3,7,237,237
Wen,9,19,18,807,807
Thur,4,6,12,394,394
Fri,1,6,7,238,238


In [188]:
price_df

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter
price,22,15,18


In [189]:
daily_sales_df.drop('Total Sales ($)',inplace=True, axis=1)
daily_sales_df

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter,new
Mon,12,15,0,489
Tues,3,3,7,237
Wen,9,19,18,807
Thur,4,6,12,394
Fri,1,6,7,238


In [190]:
daily_sales_df.shape, price_df.shape

((5, 4), (1, 3))

In [198]:
# We can do the same thing directly with dataframes as well
daily_sales_df.drop('new',axis=1,inplace=True)
total_sales_df = daily_sales_df.dot(price_df.T)
total_sales_df

Unnamed: 0,price
Mon,489
Tues,237
Wen,807
Thur,394
Fri,238


In [199]:
daily_sales_df['Total Sales ($)']=total_sales_df
daily_sales_df

Unnamed: 0,Almond Butter,Peanut Butter,Cashew Butter,Total Sales ($)
Mon,12,15,0,489
Tues,3,3,7,237
Wen,9,19,18,807
Thur,4,6,12,394
Fri,1,6,7,238


#### Comparison and Sorting

In [200]:
a1

array([1. , 2. , 3.2])

In [201]:
a2

array([[1, 2, 3],
       [4, 5, 6]])

In [202]:
a1 > a2

array([[False, False,  True],
       [False, False, False]])

In [203]:
a2>a1

array([[False, False, False],
       [ True,  True,  True]])

In [204]:
a1 == a2

array([[ True,  True, False],
       [False, False, False]])

In [205]:
a1 >= a2

array([[ True,  True,  True],
       [False, False, False]])

In [206]:
rand_arr1

array([[13,  3, 18, 19, 15],
       [ 2, 18, 17, 16, 17],
       [ 9, 16, 12, 18,  3],
       [11, 18,  6,  7, 13],
       [10,  7,  9, 18, 19]])

In [207]:
# Lets Sort
rand_arr1.sort()
rand_arr1

array([[ 3, 13, 15, 18, 19],
       [ 2, 16, 17, 17, 18],
       [ 3,  9, 12, 16, 18],
       [ 6,  7, 11, 13, 18],
       [ 7,  9, 10, 18, 19]])

In [208]:
rand_arr1.sort(axis=0)
rand_arr1

array([[ 2,  7, 10, 13, 18],
       [ 3,  9, 11, 16, 18],
       [ 3,  9, 12, 17, 18],
       [ 6, 13, 15, 18, 19],
       [ 7, 16, 17, 18, 19]])

### Importing Images as NumPy Arrays

In [209]:
from matplotlib.image import imread 

In [210]:
# imread reads images into ndarrays

<img src="images/panda.png"/>

In [211]:
#Lets turn the panda into an array
panda = imread('images/panda.png')

In [212]:
panda.size, panda.ndim, panda.shape, type(panda)

(24465000, 3, (2330, 3500, 3), numpy.ndarray)

In [213]:
panda[:2]

array([[[0.05490196, 0.10588235, 0.06666667],
        [0.05490196, 0.10588235, 0.06666667],
        [0.05490196, 0.10588235, 0.06666667],
        ...,
        [0.16470589, 0.12941177, 0.09411765],
        [0.16470589, 0.12941177, 0.09411765],
        [0.16470589, 0.12941177, 0.09411765]],

       [[0.05490196, 0.10588235, 0.06666667],
        [0.05490196, 0.10588235, 0.06666667],
        [0.05490196, 0.10588235, 0.06666667],
        ...,
        [0.16470589, 0.12941177, 0.09411765],
        [0.16470589, 0.12941177, 0.09411765],
        [0.16470589, 0.12941177, 0.09411765]]], dtype=float32)