### User Defined Dataset
- iris.csv ==> 사용자 정의 데이터셋
- DNN 모델 --> 사용자 정의 모델

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optima
from torch.utils.data import DataLoader, Dataset
from torchmetrics.classification import F1Score
from torchinfo import summary

# LabelEncoding
from sklearn.preprocessing import LabelEncoder

In [3]:
torch.manual_seed(46)

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"device: {DEVICE}")

device: cpu


In [4]:
FILE_PATH = '../../EX_ML/data/iris.csv'

In [5]:
iris_df = pd.read_csv(FILE_PATH)
iris_df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [6]:
encoder = LabelEncoder()
encoder.fit(iris_df['variety'])
iris_df['variety'] = encoder.transform(iris_df['variety'])
iris_df['variety'].value_counts()

variety
0    50
1    50
2    50
Name: count, dtype: int64

- 사용자 정의 데이터셋 클래스 생성
---
- 클래스 목적: 학습용 데이터셋 텐서화 및 전처리 진행
- 부모 클래스: torch.utils.data.DataSet
- 클래스 이름: CustomDataSet
- 매개변수: feature_df, label_df
---

In [7]:
class CustomDataSet(Dataset):
    # data load & preprocessing
    def __init__(self, featuer_df, label_df):
        super().__init__()
        self.feature_df = featuer_df
        self.label_df = label_df
        self.n_rows = featuer_df.shape[0]
        self.n_features = featuer_df.shape[1]
    
    # len(data)
    def __len__(self):
        
        return self.n_rows
    
    # index & return target into tensor
    def __getitem__(self, idx):
        feature_ts = torch.FloatTensor(self.feature_df.iloc[idx].values)
        label_ts = torch.FloatTensor(self.label_df.iloc[idx].values)
        
        return feature_ts, label_ts

- create dataset instance
---

In [8]:
feature_df = iris_df[iris_df.columns[:-1]]
label_df = iris_df[[iris_df.columns[-1]]]

print(f"feature & label shape: {feature_df.shape}, {label_df.shape}")
print(f"feature & label ndim: {feature_df.ndim}D, {label_df.ndim}D")

feature & label shape: (150, 4), (150, 1)
feature & label ndim: 2D, 2D


In [9]:
# iris datasets
iris_ds = CustomDataSet(feature_df, label_df)

In [10]:
iris_ds.feature_df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [11]:
iris_ds.label_df.head()

Unnamed: 0,variety
0,0
1,0
2,0
3,0
4,0


In [12]:
iris_ds[0]

(tensor([5.1000, 3.5000, 1.4000, 0.2000]), tensor([0.]))

- connect DataLoader
- create DataLoader instance
---

In [13]:
# need: DataLoader instance, batch_size: default 1
# a lots of hyperparameters
iris_dl = DataLoader(iris_ds, batch_size=32)    # usually used batch_size is 32

In [14]:
for data_ts, lable_ts in iris_dl:
    print(data_ts.shape, lable_ts.shape)
    break

torch.Size([32, 4]) torch.Size([32, 1])


- model instance
---

- basic create ANN model
    - input layer: input node is len(feature)
    - output layer: output node is len(label)
    - hidden layer: fixed

- model structure
    - datasets: feature 4, label 1, classification
    - learning algorhthm: dacisiontree?, linear?? logistic??
    - input layer   : input 4,   output 20,  activation function: ReLU, sigmoid (To solve gradient vanishing problem)
    - hidden layer  : input 20,  output 10,  activation function: ReLu, sigmoid
    - output layer  : input 10,  output 3,   activation function: softmax 
        - (마지막에는 확률값을 기반으로 결과를 도출하기에, 분류는 필요함. sigmoid, softmax)
        - decision_function의 방법에 따라 output의 결과도 여러개가 나옴. ovr 방식을 일단 채택

In [15]:
class CustomModel(nn.Module):
    # callback function
    def __init__(self):
        super().__init__()
        
        self.input_layer = nn.Linear(4, 20)
        self.hidden_layer = nn.Linear(20, 10)
        self.output_layer = nn.Linear(10, 3)
        
    # forward learning
    # callback funtion
    # 전달인자: 학습용 데이터셋
    def forward(self, x):
        y = F.relu(self.input_layer(x))
        y = F.relu(self.hidden_layer(y))
        y = self.output_layer(y) # classification 이라서.. 
        return y

    

In [16]:
model = CustomModel()
print(model)
summary(model)

CustomModel(
  (input_layer): Linear(in_features=4, out_features=20, bias=True)
  (hidden_layer): Linear(in_features=20, out_features=10, bias=True)
  (output_layer): Linear(in_features=10, out_features=3, bias=True)
)


Layer (type:depth-idx)                   Param #
CustomModel                              --
├─Linear: 1-1                            100
├─Linear: 1-2                            210
├─Linear: 1-3                            33
Total params: 343
Trainable params: 343
Non-trainable params: 0

- learning
---

In [17]:
optimizer = optima.Adam(model.parameters(), lr=0.001)

In [18]:
# 배치크기 만큼 학습 진행
ts_loss, ts_score = [[], []], [[], []]

for epoch in range(10):
    total_loss, total_score = 0, 0
    for data_ts, lable_ts in iris_dl:
        batch_cnt = feature_df.shape[0] // 30
        
        y_pred = model(data_ts)
        # print(y_pred.shape, lable_ts.shape)
        
        lable_ts= lable_ts.reshape(-1).long()
        
        loss = nn.CrossEntropyLoss()(y_pred, lable_ts)
        # 이 친구는 nn에 있음 그 밑에 있는게 아님
        total_loss += loss
        
        score = F1Score(task='multiclass', num_classes=3)(y_pred, lable_ts)
        total_score += score
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        ts_loss[0].append((total_loss/len(iris_dl)).item())
        ts_score[0].append((total_score/len(iris_dl)).item())
        
    

In [19]:
ts_loss[0][:5]

[0.1599808633327484,
 0.39596009254455566,
 0.7305001020431519,
 0.9111267924308777,
 1.07379150390625]

In [20]:
ts_score[0][:5]

[0.20000000298023224, 0.3125, 0.3125, 0.48750001192092896, 0.6875]