In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch

from torch.utils.data import DataLoader  ,TensorDataset

## 1. Create Data

In [2]:
import seaborn as sns

In [3]:
# Load the iris dataset through seaborn; the cells below inspect it as a
# pandas DataFrame (four float measurement columns plus a `species` column).
iris = sns.load_dataset("iris")

In [4]:
iris.shape

(150, 5)

In [5]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [6]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## 2. Converting to tensor 

In [7]:
# Take the first four columns (the numeric measurements) positionally and
# expose them as a 2-D NumPy array; the trailing `species` column is excluded.
arr = iris.iloc[:, :4].to_numpy()

#### Predictors (independent features)

In [8]:
# Copy the feature array into a 32-bit float tensor (torch.float is an alias
# of torch.float32).
data = torch.tensor(arr, dtype=torch.float32)

# Preview only the first five rows rather than dumping the whole tensor.
data[:5]

tensor([[5.1000, 3.5000, 1.4000, 0.2000],
        [4.9000, 3.0000, 1.4000, 0.2000],
        [4.7000, 3.2000, 1.3000, 0.2000],
        [4.6000, 3.1000, 1.5000, 0.2000],
        [5.0000, 3.6000, 1.4000, 0.2000]])

#### Labels

In [9]:
# One integer class label per row, initialised to 0 (the code used for
# setosa); the other species are overwritten in the following cells.
labels = torch.zeros(len(iris), dtype=torch.long)

In [10]:
labels

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])

In [11]:
iris["species"].value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [12]:
# Rows whose species is versicolor get class label 1.
labels[iris["species"].eq("versicolor")] = 1

In [13]:
# Rows whose species is virginica get class label 2.
labels[iris["species"].eq("virginica")] = 2

    setosa       0
    versicolor   1
    virginica    2

In [14]:
labels

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])

## 3. Splitting data into Train, Dev and Test sets

#### STEP 1: Splitting into train and dev_test datasets

In [15]:
# Fractions of the full dataset assigned to each partition.
# Order is: train, dev set, test
partitions = [.8, .1, .1]

# First split: training set vs. the combined dev+test hold-out.
# train_test_split shuffles before splitting; fixing random_state makes that
# shuffle deterministic, so re-running the notebook reproduces the same
# partitions instead of a different random split on every run.
train_data, Dev_test_data, train_labels, Dev_test_labels = train_test_split(
    data, labels, train_size=partitions[0], random_state=42
)

In [16]:
train_data

tensor([[6.4000, 2.7000, 5.3000, 1.9000],
        [7.7000, 3.8000, 6.7000, 2.2000],
        [5.9000, 3.0000, 5.1000, 1.8000],
        [6.0000, 2.2000, 5.0000, 1.5000],
        [5.7000, 2.8000, 4.5000, 1.3000],
        [5.1000, 3.4000, 1.5000, 0.2000],
        [5.0000, 3.2000, 1.2000, 0.2000],
        [4.7000, 3.2000, 1.3000, 0.2000],
        [5.2000, 3.4000, 1.4000, 0.2000],
        [7.0000, 3.2000, 4.7000, 1.4000],
        [6.8000, 3.0000, 5.5000, 2.1000],
        [6.7000, 3.3000, 5.7000, 2.1000],
        [6.7000, 3.1000, 4.7000, 1.5000],
        [5.1000, 3.5000, 1.4000, 0.2000],
        [5.6000, 2.5000, 3.9000, 1.1000],
        [5.7000, 4.4000, 1.5000, 0.4000],
        [7.2000, 3.0000, 5.8000, 1.6000],
        [5.8000, 2.8000, 5.1000, 2.4000],
        [6.2000, 2.9000, 4.3000, 1.3000],
        [6.0000, 2.2000, 4.0000, 1.0000],
        [5.9000, 3.0000, 4.2000, 1.5000],
        [5.5000, 4.2000, 1.4000, 0.2000],
        [5.5000, 2.6000, 4.4000, 1.2000],
        [6.0000, 2.9000, 4.5000, 1

In [20]:
len(train_data)

120

In [17]:
train_labels

tensor([2, 2, 2, 2, 1, 0, 0, 0, 0, 1, 2, 2, 1, 0, 1, 0, 2, 2, 1, 1, 1, 0, 1, 1,
        2, 0, 1, 2, 0, 2, 0, 0, 0, 1, 1, 2, 0, 1, 0, 2, 2, 0, 1, 1, 0, 2, 2, 0,
        0, 2, 2, 2, 2, 1, 0, 1, 2, 1, 0, 2, 2, 0, 1, 1, 2, 1, 0, 2, 1, 0, 1, 0,
        1, 0, 2, 0, 1, 0, 2, 0, 2, 0, 0, 2, 0, 1, 2, 2, 1, 0, 1, 1, 1, 2, 1, 2,
        2, 2, 1, 2, 1, 1, 2, 0, 0, 2, 0, 1, 1, 2, 1, 0, 2, 1, 1, 2, 1, 1, 1, 1])

In [21]:
len(train_labels)

120

In [18]:
Dev_test_data

tensor([[4.9000, 3.1000, 1.5000, 0.2000],
        [6.3000, 2.5000, 5.0000, 1.9000],
        [6.1000, 2.9000, 4.7000, 1.4000],
        [6.6000, 2.9000, 4.6000, 1.3000],
        [6.7000, 3.0000, 5.0000, 1.7000],
        [4.9000, 2.4000, 3.3000, 1.0000],
        [4.4000, 2.9000, 1.4000, 0.2000],
        [6.9000, 3.1000, 4.9000, 1.5000],
        [4.9000, 3.1000, 1.5000, 0.1000],
        [6.2000, 3.4000, 5.4000, 2.3000],
        [6.8000, 3.2000, 5.9000, 2.3000],
        [5.1000, 3.7000, 1.5000, 0.4000],
        [6.1000, 3.0000, 4.6000, 1.4000],
        [5.0000, 3.3000, 1.4000, 0.2000],
        [5.6000, 2.8000, 4.9000, 2.0000],
        [5.2000, 4.1000, 1.5000, 0.1000],
        [5.0000, 3.0000, 1.6000, 0.2000],
        [4.8000, 3.4000, 1.9000, 0.2000],
        [7.7000, 2.8000, 6.7000, 2.0000],
        [5.5000, 2.4000, 3.7000, 1.0000],
        [5.0000, 3.5000, 1.3000, 0.3000],
        [5.4000, 3.9000, 1.3000, 0.4000],
        [5.0000, 3.6000, 1.4000, 0.2000],
        [4.8000, 3.4000, 1.6000, 0

In [22]:
len(Dev_test_data)

30

In [19]:
Dev_test_labels

tensor([0, 2, 1, 1, 1, 1, 0, 1, 0, 2, 2, 0, 1, 0, 2, 0, 0, 0, 2, 1, 0, 0, 0, 0,
        0, 2, 2, 2, 0, 2])

In [23]:
len(Dev_test_labels)

30

#### STEP 2: Splitting the dev_test set equally into dev and test sets

In [25]:
# Share of the dev+test hold-out that goes to the dev set:
# partitions[1] / (partitions[1] + partitions[2]), i.e. 50% for [.8, .1, .1].
split = partitions[1] / np.sum(partitions[1:])

# Second split: divide the hold-out into dev and test sets.
# random_state is fixed so this shuffle is reproducible across runs.
dev_data, test_data, dev_labels, test_labels = train_test_split(
    Dev_test_data, Dev_test_labels, train_size=split, random_state=42
)

In [26]:
dev_data

tensor([[7.7000, 2.8000, 6.7000, 2.0000],
        [4.9000, 2.4000, 3.3000, 1.0000],
        [4.9000, 2.5000, 4.5000, 1.7000],
        [6.8000, 3.2000, 5.9000, 2.3000],
        [4.8000, 3.4000, 1.9000, 0.2000],
        [5.1000, 3.8000, 1.6000, 0.2000],
        [5.8000, 2.7000, 5.1000, 1.9000],
        [6.2000, 3.4000, 5.4000, 2.3000],
        [4.9000, 3.1000, 1.5000, 0.2000],
        [5.4000, 3.9000, 1.3000, 0.4000],
        [6.1000, 3.0000, 4.6000, 1.4000],
        [5.5000, 2.4000, 3.7000, 1.0000],
        [6.7000, 3.0000, 5.0000, 1.7000],
        [6.6000, 2.9000, 4.6000, 1.3000],
        [5.0000, 3.6000, 1.4000, 0.2000]])

In [27]:
len(dev_data)

15

In [28]:
dev_labels

tensor([2, 1, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 1, 1, 0])

In [29]:
len(dev_labels)

15

In [30]:
test_data

tensor([[6.1000, 2.9000, 4.7000, 1.4000],
        [5.0000, 3.3000, 1.4000, 0.2000],
        [4.9000, 3.0000, 1.4000, 0.2000],
        [5.6000, 2.8000, 4.9000, 2.0000],
        [6.3000, 2.5000, 5.0000, 1.9000],
        [5.0000, 3.5000, 1.3000, 0.3000],
        [5.7000, 2.5000, 5.0000, 2.0000],
        [4.8000, 3.4000, 1.6000, 0.2000],
        [5.2000, 4.1000, 1.5000, 0.1000],
        [5.1000, 3.7000, 1.5000, 0.4000],
        [4.9000, 3.1000, 1.5000, 0.1000],
        [4.4000, 2.9000, 1.4000, 0.2000],
        [5.0000, 3.0000, 1.6000, 0.2000],
        [6.4000, 2.8000, 5.6000, 2.2000],
        [6.9000, 3.1000, 4.9000, 1.5000]])

In [31]:
len(test_data)

15

In [32]:
test_labels

tensor([1, 0, 0, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 2, 1])

In [33]:
len(test_labels)

15