# PP5: deep learning intro

<a href="https://colab.research.google.com/github/PauliusU/PP5-deep-learning-intro/blob/master/PP5_deep_learning_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Choosing dataset and problem to solve

In [None]:


# Project will use Iris dataset for polynomial non-linearly separable classification.
# In Iris dataset there are three classes, Setosa is linearly separable from the other two classes. Versicolor and Virginica classes are not linearly separable.
# Source: https://www.researchgate.net/figure/Iris-data-set-There-are-three-classes-Setosa-class-is-linearly-separable-from-the-other_fig2_300643220
# Therefore, the project will use only Versicolor and Virginica classes from the Iris dataset.

In [3]:
# Load the dataset

from pprint import pprint
from sklearn.datasets import load_iris

# Initiate Iris flower dataset
dataset = load_iris()
pprint(dataset)

print(dataset["target"])

{'DESCR': '.. _iris_dataset:\n'
          '\n'
          'Iris plants dataset\n'
          '--------------------\n'
          '\n'
          '**Data Set Characteristics:**\n'
          '\n'
          '    :Number of Instances: 150 (50 in each of three classes)\n'
          '    :Number of Attributes: 4 numeric, predictive attributes and the '
          'class\n'
          '    :Attribute Information:\n'
          '        - sepal length in cm\n'
          '        - sepal width in cm\n'
          '        - petal length in cm\n'
          '        - petal width in cm\n'
          '        - class:\n'
          '                - Iris-Setosa\n'
          '                - Iris-Versicolour\n'
          '                - Iris-Virginica\n'
          '                \n'
          '    :Summary Statistics:\n'
          '\n'
          '                    Min  Max   Mean    SD   Class Correlation\n'
          '    sepal length:   4.3  7.9   5.84   0.83    0.7826\n'
          '    sepal wid

In [8]:
import pandas as pd

# Initiate pandas DataFrame with first 4 attributes
df = pd.DataFrame(dataset["data"], columns=[
                  "setal length", "setal width", "petal length", "petal width"])
# Add last (5th) attribute
df["species"] = dataset["target"]
df["species"] = df["species"].apply(lambda x: dataset["target_names"][x])
# Remove rows containing linear "setosa" species
df = df.drop(df[df.species == 'setosa' ].index)
# Reindex
df.reset_index(drop=True, inplace=True)

# Get basic info about DataFrame
print(df.info)  # Concise summary of a DataFrame.
print(df.shape)  # (150, 5)
print(df.head())  # First 5 records
print(df.describe())  # Descriptive statistics
print(df.isnull().sum())
print(df.groupby("species").size())  # Group sizes for each species (class)

<bound method DataFrame.info of     setal length  setal width  petal length  petal width     species
0            7.0          3.2           4.7          1.4  versicolor
1            6.4          3.2           4.5          1.5  versicolor
2            6.9          3.1           4.9          1.5  versicolor
3            5.5          2.3           4.0          1.3  versicolor
4            6.5          2.8           4.6          1.5  versicolor
..           ...          ...           ...          ...         ...
95           6.7          3.0           5.2          2.3   virginica
96           6.3          2.5           5.0          1.9   virginica
97           6.5          3.0           5.2          2.0   virginica
98           6.2          3.4           5.4          2.3   virginica
99           5.9          3.0           5.1          1.8   virginica

[100 rows x 5 columns]>
(100, 5)
   setal length  setal width  petal length  petal width     species
0           7.0          3.2          

In [16]:
# Before implementing any model we need to split the dataset to train and test sets

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Split the data into X and y
X = df.iloc[:,0:4].values
y = df.iloc[:,4].values

# Verify splitting
print(X.shape) # (100, 4)
print(y.shape) # (100,)

# Convert target into LabelEncoder
# Encode target labels with value between 0 and n_classes-1.
# This transformer should be used to encode target values, i.e. y, and not the input X.
encoder =  LabelEncoder()
y1 = encoder.fit_transform(y) # Fit label encoder and return encoded labels.
print(y1)

# Convert X and Y into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Verify splitting again
print(X_train[0:3])
print(X_test[0:3])
print(y_train[0:3])
print(y_test[0:3])

# # Split the dataset to 70% as train and 30% as test datasets
# train, test = train_test_split(df, test_size=0.3)

# # Verify splitting results
# print(train.shape)  # (105, 5)
# print(test.shape)  # (45, 5)

# # Split the train and test sets further as input and output sets
# train_X = train[["setal length", "setal width", "petal length", "petal width"]]
# train_y = train.species
# test_X = test[["setal length", "setal width", "petal length", "petal width"]]
# test_y = test.species

# # Verify splitting results again
# print(train_X.shape)
# print(train_y.shape)
# print(test_X.shape)
# print(test_y.shape)

(100, 4)
(100,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[[5.  2.3 3.3 1. ]
 [6.8 3.  5.5 2.1]
 [5.5 2.3 4.  1.3]]
[[6.8 2.8 4.8 1.4]
 [6.3 3.4 5.6 2.4]
 [6.9 3.1 4.9 1.5]]
['versicolor' 'virginica' 'versicolor']
['versicolor' 'virginica' 'versicolor']


## Keras

In [None]:
# Define a model
import tensorflow as tf
print(tf.__version__)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
  ])


## PyTorch