In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn

In [29]:
def preprocessing():
    dataset = pd.read_csv("data/car_evaluation.csv")
    
    categorical_columns = ["price", "maint", "doors", "persons", "lug_capacity", "safety"]
    for category in categorical_columns:
        dataset[category] = dataset[category].astype("category")

    price = dataset["price"].cat.codes.values
    maint = dataset["maint"].cat.codes.values
    doors = dataset["doors"].cat.codes.values
    persons = dataset["persons"].cat.codes.values
    lug_capacity = dataset["lug_capacity"].cat.codes.values
    safety = dataset["safety"].cat.codes.values
    
    categorical_data =  np.stack([price, maint, doors, persons, lug_capacity, safety],1)
    categorical_data = torch.tensor(categorical_data, dtype=torch.int64)
    
    categorical_colum_sizes = [len(dataset[column].cat.categories) for column in categorical_columns]
    categorical_embedding_sizes = [(col_size, min(50, (col_size)+1) // 2) for col_size in categorical_colum_sizes]

    outputs = pd.get_dummies(dataset.output).values
    outputs = torch.tensor(outputs).flatten()

    return categorical_data, outputs, categorical_embedding_sizes
    
categorical_data, outputs, categorical_embedding_sizes = preprocessing()

In [11]:
total_records = 1728
test_records = int(total_records * 0.2)

In [12]:
categorical_train_data = categorical_data[:total_records-test_records]
categorical_test_data = categorical_data[total_records-test_records:total_records]
train_outputs = outputs[:total_records-test_records]
test_outputs = outputs[total_records-test_records:total_records]
print(len(categorical_train_data), len(train_outputs), len(categorical_test_data), len(test_outputs))

1383 1383 345 345


In [13]:
class Model(nn.Module):
    def __init__(self, embedding_size, output_size, layers, p=0.4):
        super().__init__()
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p)
        all_layers = []
        num_categorical_cols = sum((nf for ni, nf in embedding_size))
        input_size = num_categorical_cols
        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = i
        all_layers.append(nn.Linear(layers[-1], output_size))
        self.layers = nn.Sequential(*all_layers)

    def forward(self, x_categorical):
        embeddings = []
        for i, e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:,i]))
    
        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)
        x = self.layers(x)
        return x          

In [14]:
model = Model(categorical_embedding_sizes, 4, [200, 100, 50], p=0.4)
model

Model(
  (all_embeddings): ModuleList(
    (0-2): 3 x Embedding(4, 2)
    (3-5): 3 x Embedding(3, 2)
  )
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=4, bias=True)
  )
)

In [15]:
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [16]:
categorical_train_data

tensor([[3, 3, 0, 0, 2, 1],
        [3, 3, 0, 0, 2, 2],
        [3, 3, 0, 0, 2, 0],
        ...,
        [1, 3, 3, 0, 1, 1],
        [1, 3, 3, 0, 1, 2],
        [1, 3, 3, 0, 1, 0]])

In [17]:
epochs = 500
aggregated_losses = []
train_outputs = train_outputs.to(device="cpu", dtype=torch.int64)
for i in range(epochs):
    i += 1
    y_pred = model(categorical_train_data)
    single_loss = loss_function(y_pred, train_outputs)
    aggregated_losses.append(single_loss)
    if i % 25 == 1:
        print(f"epoch: {i:3} loss: {single_loss.item():10.8f}")
    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()
print(f"epoch: {i:3} loss: {single_loss.item():10.10f}")

epoch:   1 loss: 1.64861071
epoch:  26 loss: 1.42616439
epoch:  51 loss: 1.31912148
epoch:  76 loss: 1.21651208
epoch: 101 loss: 1.08108544
epoch: 126 loss: 0.96144873
epoch: 151 loss: 0.82941926
epoch: 176 loss: 0.75922519
epoch: 201 loss: 0.69237274
epoch: 226 loss: 0.66341090
epoch: 251 loss: 0.64297658
epoch: 276 loss: 0.62694794
epoch: 301 loss: 0.61340195
epoch: 326 loss: 0.60012197
epoch: 351 loss: 0.59955049
epoch: 376 loss: 0.58979625
epoch: 401 loss: 0.58858925
epoch: 426 loss: 0.58040482
epoch: 451 loss: 0.57515669
epoch: 476 loss: 0.57506543
epoch: 500 loss: 0.5739462376
