## CH02

In [1]:
import torch

In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
dataset = pd.read_csv("data/car_evaluation.csv")

In [4]:
dataset.head()

Unnamed: 0,price,maint,doors,persons,lug_capacity,safety,output
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
categorical_columns = ["price", "maint", "doors", "persons", "lug_capacity", "safety"] # 임의로 나눠 놓음, ex) 폴더로 다 구분해 놓음

In [6]:
for category in categorical_columns:
    dataset[category] = dataset[category].astype("category")

In [7]:
price = dataset["price"].cat.codes.values
maint = dataset["maint"].cat.codes.values
doors = dataset["doors"].cat.codes.values
persons = dataset["persons"].cat.codes.values
lug_capacity = dataset["lug_capacity"].cat.codes.values
safety = dataset["safety"].cat.codes.values

categorical_data = np.stack([price, maint, doors, persons, lug_capacity, safety], 1)
categorical_data[:10]

array([[3, 3, 0, 0, 2, 1],
       [3, 3, 0, 0, 2, 2],
       [3, 3, 0, 0, 2, 0],
       [3, 3, 0, 0, 1, 1],
       [3, 3, 0, 0, 1, 2],
       [3, 3, 0, 0, 1, 0],
       [3, 3, 0, 0, 0, 1],
       [3, 3, 0, 0, 0, 2],
       [3, 3, 0, 0, 0, 0],
       [3, 3, 0, 1, 2, 1]], dtype=int8)

In [8]:
categorical_data = torch.tensor(categorical_data, dtype=torch.int64)
categorical_data[:10]

tensor([[3, 3, 0, 0, 2, 1],
        [3, 3, 0, 0, 2, 2],
        [3, 3, 0, 0, 2, 0],
        [3, 3, 0, 0, 1, 1],
        [3, 3, 0, 0, 1, 2],
        [3, 3, 0, 0, 1, 0],
        [3, 3, 0, 0, 0, 1],
        [3, 3, 0, 0, 0, 2],
        [3, 3, 0, 0, 0, 0],
        [3, 3, 0, 1, 2, 1]])

In [9]:
outputs = pd.get_dummies(dataset.output)
outputs = outputs.values
outputs = torch.tensor(outputs).flatten()

print(categorical_data.shape)
print(outputs.shape)

torch.Size([1728, 6])
torch.Size([6912])


In [10]:
categorical_column_sizes = [
    len(dataset[column].cat.categories) for column in categorical_columns
]


categorical_embedding_sizes = [
    (col_size, min(50, (col_size + 1) // 2)) for col_size in categorical_column_sizes
]


print(categorical_embedding_sizes)

[(4, 2), (4, 2), (4, 2), (3, 2), (3, 2), (3, 2)]


In [11]:
total_records = 1728
test_records = int(total_records * 0.2)

categorical_train_data = categorical_data[: total_records - test_records]
categorical_test_data = categorical_data[total_records - test_records : total_records]
train_outputs = outputs[: total_records - test_records]
test_outputs = outputs[total_records - test_records : total_records]

In [12]:
print(len(categorical_train_data)) # 훈련데이터
print(len(train_outputs))
print(len(categorical_test_data)) # 테스트 데이터
print(len(test_outputs))

1383
1383
345
345


In [13]:
class Model(nn.Module):
    def __init__(self, embedding_size, output_size, layers, p=0.4):
        super().__init__()
        self.all_embeddings = nn.ModuleList(
            [nn.Embedding(ni, nf) for ni, nf in embedding_size] # 레이어를 여러개 만듦
        )
        self.embedding_dropout = nn.Dropout(p)
        all_layers = []
        num_categorical_cols = sum((nf for ni, nf in embedding_size))
        input_size = num_categorical_cols
        for i in layers:
            all_layers.append(nn.Linear(input_size, i)) # 입력(input_size), 출력(i)
            all_layers.append(nn.ReLU(inplace=True)) # ReLU : 활성화 함수 - 입력으로 들어감, 출력값을 변화시키는 비선형 함수(기울기 소실 문제 해결방법)
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p)) # dropout : 과적합을 해결하기 위한 방법
            input_size = i
        all_layers.append(nn.Linear(layers[-1], output_size))
        self.layers = nn.Sequential(*all_layers) # 레이어들을 하나로 합침

    def forward(self, x_categorical): # 실제 GPU에서 계산되는 함수
        embeddings = []
        for i, e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:, i]))
        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)
        x = self.layers(x)
        return x

In [14]:
model = Model(categorical_embedding_sizes, 4, [200, 100, 50], p=0.4) # 3개의 레이어가층을 돎
print(model)

Model(
  (all_embeddings): ModuleList(
    (0-2): 3 x Embedding(4, 2)
    (3-5): 3 x Embedding(3, 2)
  )
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=4, bias=True)
  )
)


In [15]:
loss_function = nn.CrossEntropyLoss() # 분류 문제 - 원-핫 인코딩 했을 때만 사용할 수 있는 오차 계산법
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # 아담 optimizer, lr(조금씩 내려가는 양)

In [16]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [17]:
epochs = 500
aggregated_losses = []
train_outputs = train_outputs.to(device=device, dtype=torch.int64)
for i in range(epochs):
    i += 1
    # 미니 배치 경사 하강법
    for X_train, y_label in categorical_train_data:
        y_pred = model(X_train)
        single_loss = loss_function(y_pred, train_outputs)
        aggregated_losses.append(single_loss)

    if i % 25 == 1:
        print(f"epoch: {i:3} loss: {single_loss.item():10.8f}")

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print(f"epoch: {i:3} loss: {single_loss.item():10.10f}")

ValueError: too many values to unpack (expected 2)

In [None]:
epochs = 500
aggregated_losses = []
train_outputs = train_outputs.to(device=device, dtype=torch.int64)
for i in range(epochs):
    i += 1
    y_pred = model(categorical_train_data)
    single_loss = loss_function(y_pred, train_outputs)
    aggregated_losses.append(single_loss)

    if i % 25 == 1:
        print(f"epoch: {i:3} loss: {single_loss.item():10.8f}")

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print(f"epoch: {i:3} loss: {single_loss.item():10.10f}")

In [None]:
test_outputs = test_outputs.to(device=device, dtype=torch.int64)
with torch.no_grad():
    y_val = model(categorical_test_data)
    loss = loss_function(y_val, test_outputs)
print(f"Loss: {loss:.8f}")

In [None]:
print(y_val[:5])

In [None]:
y_val = np.argmax(y_val, axis=1)
print(y_val[:5])

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(test_outputs, y_val))
print(classification_report(test_outputs, y_val))
print(accuracy_score(test_outputs, y_val))