#### Loading Dataset

Dataset has two components:
- Features
    - Feature Matrix
    - Feature Names
- Response
    - Response Vector
    - Target Names 

In [4]:
from sklearn.datasets import load_iris

# Load the Iris dataset; the returned object carries both the numeric arrays
# and the human-readable labels that describe them.
iris = load_iris()

# Features: the feature matrix together with the name of each of its columns.
X, feature_names = iris.data, iris.feature_names
# Response: the response vector together with the name of each class.
y, target_names = iris.target, iris.target_names

print("Feature names:", feature_names)
print("Target names:", target_names)
# Only the first ten samples are shown to keep the output short.
print("\nFirst 10 rows of X:\n", X[:10])

Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target names: ['setosa' 'versicolor' 'virginica']

First 10 rows of X:
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]]


**Splitting the dataset into a training set and a testing set**

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the samples for testing: with the 150 rows of X that gives
# 150 * 0.3 = 45 test rows and 105 training rows. The fixed random_state makes
# the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Print the shape of each split: the two feature matrices first, then the two
# response vectors.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(105, 4)
(45, 4)
(105,)
(45,)


##### Do some training

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

iris = load_iris()
X, y = iris.data, iris.target

# Keep 40% of the samples aside for evaluation; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1
)

# Fit a k-nearest-neighbours classifier that looks at the 3 closest training
# samples (fit returns the estimator itself, so the call can be chained).
classifier_knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Evaluate on the held-out set by comparing predicted and true responses.
y_pred = classifier_knn.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Classify two hand-written samples; each row has one value per feature.
sample = [[5, 5, 3, 2], [2, 4, 3, 5]]
preds = classifier_knn.predict(sample)
# Translate the predicted class indices into species names.
pred_species = list(iris.target_names[preds])

print("Predictions:", preds)
print("Prediction Species:", pred_species)

Accuracy: 0.9833333333333333
Predictions: [1 2]
Prediction Species: [np.str_('versicolor'), np.str_('virginica')]


##### Persist trained model

In [10]:
import os

import joblib

# joblib.dump opens the target path for writing and does not create missing
# parent directories, so make sure the output folder exists first. Without
# this the cell raises FileNotFoundError on a fresh checkout.
os.makedirs('trained_models', exist_ok=True)

# Serialise the fitted classifier to disk. dump returns the list of file names
# it wrote, which is displayed as the cell's output.
joblib.dump(classifier_knn, 'trained_models/iris_classifier_knn.joblib')

['trained_models/iris_classifier_knn.joblib']

**Reload the model from filesystem**

In [14]:
# Restore the classifier from the file written by joblib.dump. joblib.load
# unpickles the file, which can execute arbitrary code, so only load model
# files that come from a trusted source.
# NOTE: this cell relies on `joblib` and `iris` being defined by earlier cells.
classifier_knn = joblib.load('trained_models/iris_classifier_knn.joblib')

# Predict the same two hand-written samples with the reloaded model.
sample = [[5, 5, 3, 2], [2, 4, 3, 5]]
preds = classifier_knn.predict(sample)
# Translate the predicted class indices into species names.
pred_species = list(iris.target_names[preds])

print("Prediction Species:", pred_species)

Prediction Species: [np.str_('versicolor'), np.str_('virginica')]


#### Preprocessing Data

##### Binarisation

With `threshold=0.5`, all values strictly greater than 0.5 are converted to 1, and all values less than or equal to 0.5 (including 0.5 itself) are converted to 0.

In [18]:
import numpy as np
from sklearn import preprocessing

# Small demo matrix: 4 samples (rows) by 3 features (columns).
input_data = np.array(
    [
        [2.1, -1.9, 5.5],
        [-1.5, 2.4, 3.5],
        [0.5, -7.9, 5.6],
        [5.9, 2.3, -5.8],
    ]
)

# Binarizer maps every value strictly greater than the threshold to 1 and
# every other value (the threshold itself included) to 0. It is stateless, so
# transform can be called without fitting first.
binarizer = preprocessing.Binarizer(threshold=0.5)
data_binarized = binarizer.transform(input_data)

print("\nBinarized data:\n", data_binarized)


Binarized data:
 [[1. 0. 1.]
 [0. 1. 1.]
 [0. 0. 1.]
 [1. 1. 0.]]


##### Mean Removal

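Subtract the mean from each feature so that every feature is centred on zero. `preprocessing.scale` also divides each feature by its standard deviation, so the result has unit variance as well.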

In [None]:
import numpy as np
from sklearn import preprocessing

# Small demo matrix: 4 samples (rows) by 3 features (columns).
input_data = np.array(
    [
        [2.1, -1.9, 5.5],
        [-1.5, 2.4, 3.5],
        [0.5, -7.9, 5.6],
        [5.9, 2.3, -5.8],
    ]
)

# Per-feature statistics of the raw data; axis=0 aggregates down each column.
print("Mean =", np.mean(input_data, axis=0))
print("Standard deviation = ", np.std(input_data, axis=0))

# Standardise each column: subtract its mean and divide by its standard
# deviation, so every feature ends up with (numerically) zero mean and unit
# standard deviation.
data_scaled = preprocessing.scale(input_data)
print("Mean_removed =", np.mean(data_scaled, axis=0))
print("Stddev_removed =", np.std(data_scaled, axis=0))

Mean = [ 1.75  -1.275  2.2  ]
Standard deviation =  [2.71431391 4.20022321 4.69414529]
Mean_removed = [1.11022302e-16 0.00000000e+00 0.00000000e+00]
Stddev_removed = [1. 1. 1.]


##### Scaling the Feature Vectors

In [None]:
import numpy as np
from sklearn import preprocessing

# Small demo matrix: 4 samples (rows) by 3 features (columns).
input_data = np.array(
    [
        [2.1, -1.9, 5.5],
        [-1.5, 2.4, 3.5],
        [0.5, -7.9, 5.6],
        [5.9, 2.3, -5.8],
    ]
)

# Learn each column's minimum and maximum, then rescale the column linearly
# so that its minimum maps to 0 and its maximum maps to 1.
minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(input_data)
data_scaled_minmax = minmax_scaler.transform(input_data)

print("\nMin max scaled data:\n", data_scaled_minmax)


Min max scaled data:
 [[0.48648649 0.58252427 0.99122807]
 [0.         1.         0.81578947]
 [0.27027027 0.         1.        ]
 [1.         0.99029126 0.        ]]


##### L1 Normalisation

Also called Least Absolute Deviations. It rescales each row so that the sum of the absolute values of its entries is exactly 1.

In [None]:
# numpy is imported here, as in the other preprocessing cells, so that this
# cell does not depend on `np` having been defined by an earlier cell.
import numpy as np
from sklearn import preprocessing

# Small demo matrix: 4 samples (rows) by 3 features (columns).
input_data = np.array([
    [2.1, -1.9, 5.5],
    [-1.5, 2.4, 3.5],
    [0.5, -7.9, 5.6],
    [5.9, 2.3, -5.8]
])

# Divide every row by its L1 norm so the absolute values in each row sum to 1.
data_normalized_l1 = preprocessing.normalize(input_data, norm='l1')
print("\nL1 normalized data:\n", data_normalized_l1)


L1 normalized data:
 [[ 0.22105263 -0.2         0.57894737]
 [-0.2027027   0.32432432  0.47297297]
 [ 0.03571429 -0.56428571  0.4       ]
 [ 0.42142857  0.16428571 -0.41428571]]


##### L2 Normalisation

Also called Least Squares. It rescales each row so that the sum of the squares of its entries is exactly 1, i.e. each row has unit Euclidean length.

In [None]:
import numpy as np
from sklearn import preprocessing

# Small demo matrix: 4 samples (rows) by 3 features (columns).
input_data = np.array(
    [
        [2.1, -1.9, 5.5],
        [-1.5, 2.4, 3.5],
        [0.5, -7.9, 5.6],
        [5.9, 2.3, -5.8],
    ]
)

# Divide every row by its L2 (Euclidean) norm so the squares of the entries in
# each row sum to 1.
data_normalized_l2 = preprocessing.normalize(input_data, norm='l2')

print("\nL2 normalized data:\n", data_normalized_l2)


L2 normalized data:
 [[ 0.33946114 -0.30713151  0.88906489]
 [-0.33325106  0.53320169  0.7775858 ]
 [ 0.05156558 -0.81473612  0.57753446]
 [ 0.68706914  0.26784051 -0.6754239 ]]
