# Classification
Where regression was used to predict a numeric value, classification is used to separate data points into classes with different labels.

### Dataset
This specific dataset separates flowers into 3 different classes of species.
- Setosa
- Versicolor
- Virginica

The information about each flower is the following.
- sepal length
- sepal width
- petal length
- petal width

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
# The __future__ imports opt in to behaviour that newer Python versions make standard (including
# newly introduced keywords), so code can adapt gradually to otherwise incompatible changes.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
# six is a package that helps in writing code that is compatible with both Python 2 and Python 3.

import tensorflow.compat.v2.feature_column as fc

import tensorflow as tf

In [2]:
# Let's define some constants to help us later on: the CSV column order and the
# species names, listed so that a species' position matches its integer label.
CSV_COLUMN_NAMES = [
    'SepalLength',
    'SepalWidth',
    'PetalLength',
    'PetalWidth',
    'Species',
]
SPECIES = [
    'Setosa',
    'Versicolor',
    'Virginica',
]

In [3]:
# Use the Keras download helper (Keras ships inside TensorFlow) to fetch the Iris
# training and test CSV files, then read each one into a pandas DataFrame with
# our own column names.
train_path = tf.keras.utils.get_file(
    fname="iris_training.csv",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/iris_training.csv",
)
test_path = tf.keras.utils.get_file(
    fname="iris_test.csv",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/iris_test.csv",
)

train = pd.read_csv(train_path, header=0, names=CSV_COLUMN_NAMES)
test = pd.read_csv(test_path, header=0, names=CSV_COLUMN_NAMES)

The `names` argument supplies the column names of the dataframe, and `header=0` tells pandas that row zero of the file is its existing header, so that row is replaced by our names instead of being read as data.

In [4]:
# Preview the first rows of the training DataFrame
train.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
0,6.4,2.8,5.6,2.2,2
1,5.0,2.3,3.3,1.0,1
2,4.9,2.5,4.5,1.7,2
3,4.9,3.1,1.5,0.1,0
4,5.7,3.8,1.7,0.3,0


In [5]:
# Detach the label column from both DataFrames: pop() removes 'Species' in place
# and hands back the removed column, leaving only the feature columns behind.
train_y, test_y = (frame.pop('Species') for frame in (train, test))
train.head()  # the Species column no longer appears

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth
0,6.4,2.8,5.6,2.2
1,5.0,2.3,3.3,1.0
2,4.9,2.5,4.5,1.7
3,4.9,3.1,1.5,0.1
4,5.7,3.8,1.7,0.3


In [6]:
train.shape  # we have 120 entries with 4 features

(120, 4)

## Input Function

In [7]:
def input_fn(features, labels, training=True, batch_size=256):
    """Build a batched ``tf.data.Dataset`` of (features, labels) pairs.

    Args:
        features: Mapping-like object (here a DataFrame) converted with ``dict()``.
        labels: Labels aligned with ``features``.
        training: When true, the data is shuffled and repeated indefinitely.
        batch_size: Number of examples per batch.

    Returns:
        The dataset, batched into groups of ``batch_size``.
    """
    slices = (dict(features), labels)
    dataset = tf.data.Dataset.from_tensor_slices(slices)

    # Evaluation: one ordered pass over the data.
    if not training:
        return dataset.batch(batch_size)

    # Training: shuffle, then repeat so the dataset never runs out.
    return dataset.shuffle(1000).repeat().batch(batch_size)

## Feature Columns

In [8]:
# Feature columns describe how the model should use its input:
# create one numeric column for every column left in the training DataFrame.
my_feature_columns = [
    tf.feature_column.numeric_column(key=column_name)
    for column_name in train.keys()
]
print(my_feature_columns)

[NumericColumn(key='SepalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='SepalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='PetalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='PetalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]


In [9]:
# Alternatively, the same feature columns can be built from an explicit list of names.
# CSV_COLUMN_NAMES is redefined here without 'Species', because the label is not a feature.
CSV_COLUMN_NAMES = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']
my_feature_columns = [
    tf.feature_column.numeric_column(key=column_name, dtype=tf.float32)
    for column_name in CSV_COLUMN_NAMES
]
print(my_feature_columns)

[NumericColumn(key='SepalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='SepalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='PetalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='PetalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]


## Building the Model
And now we are ready to choose a model. For classification tasks there is a variety of different estimators/models that we can pick from. Some options are listed below.
- ```DNNClassifier``` (Deep Neural Network)
- ```LinearClassifier```

We can choose either model but the DNN seems to be the best choice. This is because we may not be able to find a linear correspondence in our data.

In [10]:
# The estimator module contains the premade models from TensorFlow;
# here we build a deep neural network classifier from it.
classifier = tf.estimator.DNNClassifier(
    # Number of possible outputs the model must choose between:
    # our 3 classes of flowers (the default value of this argument is 2).
    n_classes=3,
    # Architecture of the model: two hidden layers of 30 and 10 nodes respectively.
    hidden_units=[30, 10],
    # Feature columns describing the inputs the model receives.
    feature_columns=my_feature_columns,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\SHLOKR~1\\AppData\\Local\\Temp\\tmp7a551bg2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [11]:
classifier.train(
    # Number of training steps to run. With steps=None the model trains forever, or until
    # `input_fn` generates the `tf.errors.OutOfRange` error or `StopIteration` exception.
    steps=5000,
    # Our input_fn takes arguments and is not wrapped inside another function, so a lambda
    # is used to hand train() a callable. training=True because we are training the model.
    input_fn=lambda: input_fn(features=train, labels=train_y, training=True),
)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into C:\Users\SHLOKR~1\AppData\Local\Temp\tmp7a551bg2\model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 1.576372, step = 0
INFO:tensorflow:global_step/sec: 279.193
INFO:tensorflow:loss = 1.220871, step = 100 (0.367 sec)
INFO:tensorflow:global_step/sec: 351.998
INFO:tensorflow:loss = 1.1570055, step = 200 (0.280 sec)
I

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x1eda8cb9460>

While the model is training we can see the **loss** after each step. Finally after the model is trained fully we can see the final **loss**. **The lower the loss is, the better the model.**

The only thing to explain here is the **steps** argument. This simply tells the classifier to run for 5000 steps.

## Evaluation

In the statement below **eval_result** is a *dictionary*, so writing <code>**eval_result</code> unpacks it into keyword arguments for <code>format()</code>. That is why the format string can refer to <code>accuracy</code> by name instead of spelling out <code>eval_result['accuracy']</code>: the placeholder name is matched against the keys of the unpacked dictionary, in this case accuracy, and the corresponding value is substituted.

In [12]:
# Score the trained model on the test set. training=False is passed to input_fn
# because we are evaluating here, not training.
eval_result = classifier.evaluate(
    input_fn=lambda: input_fn(features=test, labels=test_y, training=False),
)

# The metrics dictionary is unpacked so the placeholder can name its key directly.
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-05-10T22:59:44Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\SHLOKR~1\AppData\Local\Temp\tmp7a551bg2\model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 0.65938s
INFO:tensorflow:Finished evaluation at 2021-05-10-22:59:45
INFO:tensorflow:Saving dict for global step 5000: accuracy = 0.9, average_loss = 0.5219545, global_step = 5000, loss = 0.5219545
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5000: C:\Users\SHLOKR~1\AppData\Local\Temp\tmp7a551bg2\model.ckpt-5000

Test set accuracy: 0.900



In [13]:
# Show the whole dictionary returned by evaluate()
print(eval_result)

{'accuracy': 0.9, 'average_loss': 0.5219545, 'loss': 0.5219545, 'global_step': 5000}


In [14]:
# Any key of eval_result can be used as the placeholder name;
# for example, with average_loss we get that value instead.
print('\nTest set average loss: {average_loss:0.3f}\n'.format(**eval_result))


Test set average loss: 0.522



**Alternatively we could have used the following statement**

In [15]:
# Same output, produced with an f-string and an explicit dictionary lookup
print(f'\nTest set average loss: {eval_result["average_loss"]:0.3f}\n')


Test set average loss: 0.522



## Prediction

Although we could predict for every data point in the dataset, as shown for **linear regression**, here we only do it for one entry.

In [16]:
def input_fn(features, batch_size=256):
    """Build a batched, label-free ``tf.data.Dataset`` for prediction.

    No labels are passed in because, when predicting, the labels are what we
    want the model to give us.

    Args:
        features: Mapping of feature name to values, converted with ``dict()``.
        batch_size: Number of examples per batch.

    Returns:
        The dataset of feature slices, batched into groups of ``batch_size``.
    """
    unlabeled = tf.data.Dataset.from_tensor_slices(dict(features))
    return unlabeled.batch(batch_size)

# These are all the features available to us
features = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']
# Dictionary that stores the user's response for each feature
predict = {}

print("Please type numeric values as prompted.")
for feature in features:
    # Keep asking until the typed text can actually be converted to a float.
    # str.isdigit() is not a usable check here: it is False for decimal input
    # such as "1.5", so it cannot separate valid measurements from invalid text.
    while True:
        val = input(feature + ": ")
        try:
            number = float(val)
        except ValueError:
            print("That is not a number, please try again.")
        else:
            break

    # Store the valid response for that particular feature (key).
    # The value is wrapped in a list because .predict() works on multiple examples at once:
    # even for a single flower it expects a list, with each entry representing
    # a different flower to predict for.
    predict[feature] = [number]

# The input function used here is the prediction-only one defined above.
predictions = classifier.predict(input_fn=lambda: input_fn(predict))

# predictions is a generator that yields one dictionary of results per example,
# so the loop below runs once per input row (three inputs would mean three iterations;
# in our case there is only one).
for pred_dict in predictions:
    print(pred_dict)

    # 'class_ids' stores the predicted species as an integer, which corresponds
    # to the index position of that species in the SPECIES list.
    class_id = pred_dict['class_ids'][0]
    species = SPECIES[class_id]

    # 'probabilities' stores, for this example, the probability of belonging to each
    # species (for example setosa 0.80, versicolor 0.10 & virginica 0.10), in the same
    # order as the SPECIES list. Since class_id tells us which species was chosen,
    # we look up the matching probability and output it with the species name.
    probability = pred_dict['probabilities'][class_id]

    print('Prediction is "{}" ({:.1f}%)'.format(species, 100 * probability))

Please type numeric values as prompted.
SepalLength: 1.5
SepalWidth: 0.2
PetalLength: 0.9
PetalWidth: 0.5
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\SHLOKR~1\AppData\Local\Temp\tmp7a551bg2\model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
{'logits': array([-0.19346035, -0.21450466, -0.43543082], dtype=float32), 'probabilities': array([0.36176112, 0.35422766, 0.2840112 ], dtype=float32), 'class_ids': array([0], dtype=int64), 'classes': array([b'0'], dtype=object), 'all_class_ids': array([0, 1, 2]), 'all_classes': array([b'0', b'1', b'2'], dtype=object)}
Prediction is "Setosa" (36.2%)


In [17]:
# Example inputs you can try above, together with the class expected for each:
# position i of every feature list describes the flower whose species is expected[i].
expected = ['Setosa', 'Versicolor', 'Virginica']
predict_x = dict(
    SepalLength=[5.1, 5.9, 6.9],
    SepalWidth=[3.3, 3.0, 3.1],
    PetalLength=[1.7, 4.2, 5.4],
    PetalWidth=[0.5, 1.5, 2.1],
)

In [18]:
# Run the prediction again, this time on the example inputs in predict_x.
predictions = classifier.predict(input_fn=lambda: input_fn(predict_x))
for pred_dict in predictions:
    print(pred_dict)

    # Predicted class as an integer index into SPECIES, and the probability at that index.
    class_id = pred_dict['class_ids'][0]
    species = SPECIES[class_id]
    probability = pred_dict['probabilities'][class_id]

    print('Prediction is "{}" ({:.1f}%)'.format(species, 100 * probability))


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\SHLOKR~1\AppData\Local\Temp\tmp7a551bg2\model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
{'logits': array([ 1.4472338 ,  0.43695745, -1.2846324 ], dtype=float32), 'probabilities': array([0.69968426, 0.25476786, 0.04554782], dtype=float32), 'class_ids': array([0], dtype=int64), 'classes': array([b'0'], dtype=object), 'all_class_ids': array([0, 1, 2]), 'all_classes': array([b'0', b'1', b'2'], dtype=object)}
Prediction is "Setosa" (70.0%)
{'logits': array([-2.3480186, -1.1065042, -1.41342  ], dtype=float32), 'probabilities': array([0.14271358, 0.4939104 , 0.36337608], dtype=float32), 'class_ids': array([1], dtype=int64), 'classes': array([b'1'], dtype=object), 'all_class_ids': array([0, 1, 2]), 'all_classes': array([b'0', b'1', b'2'], dtype=object)}
Prediction is "Versicolor" (49.4