In [1]:
%load_ext autoreload
%autoreload 2

# fix system path
import sys
sys.path.append("/home/jovyan/work")

In [2]:
import pandas as pd
import numpy as np

In [3]:
file_url = 'https://raw.githubusercontent.com/aso-uts/applied_ds/master/unit3/dataset/Car%20Evaluation.csv'

In [4]:
df = pd.read_csv(file_url)

In [5]:
df.head()

Unnamed: 0,buying_price,maintenance_cost,doors,persons_capacity,luggage_boot,safety,evaluation
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [6]:
df.shape

(1728, 7)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying_price        1728 non-null object
maintenance_cost    1728 non-null object
doors               1728 non-null object
persons_capacity    1728 non-null object
luggage_boot        1728 non-null object
safety              1728 non-null object
evaluation          1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [8]:
df.describe()

Unnamed: 0,buying_price,maintenance_cost,doors,persons_capacity,luggage_boot,safety,evaluation
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,5more,2,big,med,unacc
freq,432,432,432,576,576,576,1210


In [9]:
df.to_csv('../data/raw/car_evaluation.csv', index=False)

In [10]:
df_cleaned = df.copy()

In [11]:
cats_dict = {
    'buying_price': [['low', 'med', 'high', 'vhigh']],
    'maintenance_cost': [['low', 'med', 'high', 'vhigh']],
    'doors': [['2', '3', '4', '5more']],
    'persons_capacity': [['2', '4', 'more']],
    'luggage_boot': [['small', 'med', 'big']],
    'safety': [['low', 'med', 'high']],
    'evaluation': [['unacc', 'acc', 'good', 'vgood']],
}

In [12]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [13]:
for col, cats in cats_dict.items():
    col_encoder = OrdinalEncoder(categories=cats)
    df_cleaned[col] = col_encoder.fit_transform(df_cleaned[[col]])

In [14]:
num_cols = ['buying_price', 'maintenance_cost', 'doors', 'persons_capacity', 'luggage_boot', 'safety']

In [15]:
sc = StandardScaler()

In [16]:
df_cleaned[num_cols] = sc.fit_transform(df_cleaned[num_cols])

In [17]:
df_cleaned['evaluation'] = df_cleaned['evaluation'].astype(int)

In [22]:
df_cleaned.describe()

Unnamed: 0,buying_price,maintenance_cost,doors,persons_capacity,luggage_boot,safety,evaluation
count,1728.0,1728.0,1728.0,1728.0,1728.0,1728.0,1728.0
mean,0.0,0.0,1.6447750000000002e-17,0.0,0.0,0.0,0.414931
std,1.000289,1.000289,1.000289,1.000289,1.000289,1.000289,0.7407
min,-1.341641,-1.341641,-1.341641,-1.224745,-1.224745,-1.224745,0.0
25%,-0.67082,-0.67082,-0.6708204,-1.224745,-1.224745,-1.224745,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.67082,0.67082,0.6708204,1.224745,1.224745,1.224745,1.0
max,1.341641,1.341641,1.341641,1.224745,1.224745,1.224745,3.0


In [19]:
from src.data.sets import split_sets_random, save_sets

In [23]:
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(df_cleaned, target_col='evaluation', test_ratio=0.2, to_numpy=True)

In [25]:
save_sets(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test, path='../data/processed/car_evaluation/')

In [26]:
# Solution:
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [27]:
from src.models.null import NullModel

In [28]:
baseline_model = NullModel(target_type='classification')
y_base = baseline_model.fit_predict(y_train)

In [29]:
from src.models.performance import print_class_perf

In [30]:
print_class_perf(y_base, y_train, set_name='Training', average='weighted')

Accuracy Training: 0.6988416988416989
F1 Training: 0.5749561249561249


In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [32]:
from src.models.pytorch import PytorchMultiClass

model = PytorchMultiClass(X_train.shape[1])

In [33]:
from src.models.pytorch import get_device

device = get_device()
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=6, out_features=32, bias=True)
  (layer_out): Linear(in_features=32, out_features=4, bias=True)
  (softmax): Softmax(dim=1)
)

In [34]:
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [36]:
def train_classification(train_data, model, criterion, optimizer, batch_size, device, scheduler=None, generate_batch=None):
    """Train a Pytorch multi-class classification model

    Parameters
    ----------
    train_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    optimizer: torch.optim
        Optimizer
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    scheduler : torch.optim.lr_scheduler
        Pytorch Scheduler used for updating learning rate
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """
    
    # Set model to training mode
    model.train()
    train_loss = 0
    train_acc = 0
    
    # Create data loader
    data = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:

        # Reset gradients
        optimizer.zero_grad()
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Make predictions
        output = model(feature)
        
        # Calculate loss for given batch
        loss = criterion(output, target_class.long())

        # Calculate global loss
        train_loss += loss.item()
        
        # Calculate gradients
        loss.backward()

        # Update Weights
        optimizer.step()
        
        # Calculate global accuracy
        train_acc += (output.argmax(1) == target_class).sum().item()

    # Adjust the learning rate
    if scheduler:
        scheduler.step()

    return train_loss / len(train_data), train_acc / len(train_data)

In [37]:
def test_classification(test_data, model, criterion, batch_size, device, generate_batch=None):
    """Calculate performance of a Pytorch multi-class classification model

    Parameters
    ----------
    test_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """    
    
    # Set model to evaluation mode
    model.eval()
    test_loss = 0
    test_acc = 0
    
    # Create data loader
    data = DataLoader(test_data, batch_size=batch_size, collate_fn=generate_batch)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Set no update to gradients
        with torch.no_grad():
            
            # Make predictions
            output = model(feature)
            
            # Calculate loss for given batch
            loss = criterion(output, target_class.long())

            # Calculate global loss
            test_loss += loss.item()
            
            # Calculate global accuracy
            test_acc += (output.argmax(1) == target_class).sum().item()

    return test_loss / len(test_data), test_acc / len(test_data)

In [38]:
N_EPOCHS = 50
BATCH_SIZE = 32

In [40]:
from torch.utils.data import DataLoader

In [41]:
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

Epoch: 0
	(train)	|	Loss: 0.0315	|	Acc: 74.3%
	(valid)	|	Loss: 0.0290	|	Acc: 82.9%
Epoch: 1
	(train)	|	Loss: 0.0293	|	Acc: 82.2%
	(valid)	|	Loss: 0.0275	|	Acc: 87.9%
Epoch: 2
	(train)	|	Loss: 0.0287	|	Acc: 83.9%
	(valid)	|	Loss: 0.0277	|	Acc: 87.3%
Epoch: 3
	(train)	|	Loss: 0.0292	|	Acc: 82.6%
	(valid)	|	Loss: 0.0291	|	Acc: 82.7%
Epoch: 4
	(train)	|	Loss: 0.0295	|	Acc: 81.5%
	(valid)	|	Loss: 0.0281	|	Acc: 85.8%
Epoch: 5
	(train)	|	Loss: 0.0292	|	Acc: 82.7%
	(valid)	|	Loss: 0.0279	|	Acc: 86.7%
Epoch: 6
	(train)	|	Loss: 0.0290	|	Acc: 82.9%
	(valid)	|	Loss: 0.0274	|	Acc: 88.2%
Epoch: 7
	(train)	|	Loss: 0.0293	|	Acc: 82.3%
	(valid)	|	Loss: 0.0300	|	Acc: 79.5%
Epoch: 8
	(train)	|	Loss: 0.0292	|	Acc: 82.6%
	(valid)	|	Loss: 0.0283	|	Acc: 85.5%
Epoch: 9
	(train)	|	Loss: 0.0290	|	Acc: 83.7%
	(valid)	|	Loss: 0.0281	|	Acc: 85.8%
Epoch: 10
	(train)	|	Loss: 0.0289	|	Acc: 83.7%
	(valid)	|	Loss: 0.0281	|	Acc: 85.8%
Epoch: 11
	(train)	|	Loss: 0.0291	|	Acc: 83.2%
	(valid)	|	Loss: 0.0282	|	Acc: 85.5%
Ep

In [42]:
torch.save(model, "../models/pytorch_multi_car_evaluation.pt")

In [43]:
test_loss, test_acc = test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\tLoss: {test_loss:.4f}\t|\tAccuracy: {test_acc:.1f}')

	Loss: 0.0283	|	Accuracy: 0.9
