In [1]:
import numpy as np
import pandas as pd

import itertools

import torch
import torch.nn as nn
import torch.optim as optim

from hypll import nn as hnn
from hypll.tensors import TangentTensor
from hypll.optim import RiemannianAdam
from hypll.manifolds.poincare_ball import Curvature, PoincareBall

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [2]:
import util

In [3]:
# Input CSVs, given relative to the notebook's working directory.
TRAIN_FILE = '../data/tomato_samples_big.csv'
VAL_FILE = '../data/tomato_val_dataset.csv'

# The first CSV column is used as the row index of each frame.
data = pd.read_csv(TRAIN_FILE, index_col=0)
val_data = pd.read_csv(VAL_FILE, index_col=0)

# Bare last expression so the notebook renders a preview of the training frame.
data

Unnamed: 0,Overall Liking,Texture liking,Sweetness,Sourness,Salty,Umami,Tomato Flavor Intenstity,glucose,fructose,Soluble solids,...,citric:malic,3-methyl-1-pentanol,2-ethylfuran,isopentyl acetate,cis-3-hexenyl acetate,benzothiazole,benzyl alcohol,citric acid,3-methyl-2-butenal,p-anisaldehyde
0,0.338914,0.340171,0.256928,0.181355,0.131396,0.130262,0.335056,2.289596,1.929062,2.592386,...,2.520910,1.093684,0.136266,-0.571344,1.210977,1.037262,1.072747,0.980180,-0.146406,-0.119881
1,0.336514,0.340055,0.256104,0.179985,0.131889,0.126163,0.333145,2.290067,1.939249,2.581130,...,2.528238,1.017167,-0.366412,-0.571344,1.290722,0.932396,0.002128,0.923562,-0.089412,0.212553
2,0.336584,0.339619,0.254152,0.180641,0.127664,0.129144,0.333956,2.294738,1.918264,2.566390,...,2.527202,1.181431,-0.219492,-0.571344,1.270815,-0.680615,-1.743087,0.945401,-0.039227,0.479200
3,0.333920,0.339588,0.256608,0.182085,0.127090,0.128752,0.333464,2.286382,1.930326,2.546053,...,2.515032,1.176457,-0.201208,-0.571344,1.235020,0.632055,-0.813754,0.949416,-0.203380,-0.200393
4,0.337627,0.339244,0.254991,0.182370,0.129525,0.129901,0.335189,2.296162,1.925542,2.543229,...,2.513928,1.056964,-0.232769,-0.571344,1.131131,0.481864,0.064032,0.875690,0.082542,-0.653343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78995,0.037238,0.000000,0.135424,0.089726,0.084163,0.091139,0.191412,-1.257119,-1.677233,-0.141795,...,-1.036429,0.475485,0.489022,0.524832,-2.284104,0.213054,0.853422,-0.918645,-0.385336,-0.182016
78996,0.031069,0.015169,0.135655,0.088629,0.081030,0.096139,0.189275,-1.275015,-1.672763,-0.162537,...,-1.037527,0.299981,0.464641,1.805531,-0.768846,0.050725,-0.115696,-0.946242,-0.126320,0.545937
78997,0.046658,0.043162,0.133811,0.090758,0.082977,0.098215,0.193270,-1.282447,-1.667745,-0.202092,...,-1.055045,-0.715392,1.269892,-2.362999,-0.416522,0.351719,-0.257527,-0.990672,0.347381,0.233870
78998,0.043135,0.005880,0.133449,0.089429,0.083958,0.099179,0.192197,-1.283625,-1.683112,-0.124910,...,-1.047269,1.128747,1.664543,-1.977505,-1.196415,0.026865,0.249716,-1.005686,-0.240351,0.298191


In [4]:
# Number of leading label (target) columns in the frame; every column after
# them is treated as an input feature.
NUM_LABEL_COLS = 7
# Slice with the named constant so the label/feature boundary is defined once.
FEATURE_COLS = data.columns[NUM_LABEL_COLS:]
# Only the first label column is used as the regression target. Indexing with
# a list keeps an Index (not a scalar), so data[LABEL_COLS] stays a 2-D frame.
LABEL_COLS = data.columns[[0]]
print(FEATURE_COLS)
print(LABEL_COLS)


def get_fold_indices(size, k):
    """Split ``range(size)`` into ``k`` contiguous folds of near-equal length.

    When ``size`` is not a multiple of ``k`` the leading ``size % k`` folds
    each receive one extra element.

    Args:
        size: Total number of items to partition.
        k: Number of folds.

    Returns:
        A list of ``k`` ``(start, stop)`` pairs (NumPy integers) describing
        half-open index ranges, in order.
    """
    base, extra = divmod(size, k)

    # The first `extra` folds absorb the remainder, one element each.
    sizes = np.array([base + 1 if fold < extra else base for fold in range(k)])

    # Each fold stops at the running total and starts one fold-length earlier.
    stops = np.cumsum(sizes)
    starts = stops - sizes

    return list(zip(starts, stops))


# Number of folds the sample types are split into.
FOLDS = 3
# The validation frame's row count is taken as the number of sample types.
NUM_SAMPLE_TYPES = len(val_data)
# NOTE(review): integer division assumes the training frame holds the same
# number of rows for every sample type, grouped in the same order as the rows
# of val_data -- confirm against how the CSVs were generated.
NUM_SAMPLES_PER_TYPE = len(data) // NUM_SAMPLE_TYPES

# Not used below; retained in case later cells reference it.
fold_nums = list(range(FOLDS))

# (start, stop) boundaries of each fold, expressed in sample-type units.
FOLD_INDICIES = get_fold_indices(NUM_SAMPLE_TYPES, FOLDS)

print(FOLD_INDICIES)

# Plain NumPy arrays so folds can be sliced by position in the training loop.
ALL_TRAIN_FEATURES = data[FEATURE_COLS].values
ALL_TRAIN_LABELS = data[LABEL_COLS].values
ALL_VAL_FEATURES = val_data[FEATURE_COLS].values
ALL_VAL_LABELS = val_data[LABEL_COLS].values

Index(['glucose', 'fructose', 'Soluble solids', '1-penten-3-one',
       'isovaleronitrile', 'trans-2-pentenal', 'trans-2-heptenal',
       'trans-3-hexen-1-ol', '6-methyl-5-hepten-2-ol', 'nonyl aldehyde',
       'cis-4-decenal', 'sugar:acid', 'isovaleraldehyde', '3-methyl-1-butanol',
       'methional', '2,5-dimethyl-4-hydroxy-3(2H)-furanone', '3-pentanone',
       '1-pentanol', 'benzyl cyanide', 'isovaleric acid', '2-isobutylthiazole',
       '1-nitro-3-methylbutane', 'benzaldehyde', '6-methyl-5-hepten-2-one',
       'b-ionone', 'b-cyclocitral', 'geranial', 'phenylacetaldehyde',
       'eugenol', 'geranylacetone', '2-phenyl ethanol', 'neral',
       'salicylaldehyde', 'isobutyl acetate', 'butyl acetate',
       'cis-3-hexen-1-ol', '1-nitro-2-phenylethane', '1-penten-3-ol',
       '2-methylbutyl acetate', 'heptaldehyde', 'trans,trans-2,4-decadienal',
       'malic acid', '2-methylbutanal', '4-carene', 'hexyl alcohol',
       'guaiacol', 'propyl acetate', 'hexanal', 'cis-2-penten-1-ol'

In [5]:
# Define custom PyTorch dataset
class CustomDataset(Dataset):
    """In-memory dataset that pairs each feature row with its label row.

    Args:
        features: Array-like accepted by ``torch.tensor``; stored as float32.
        labels: Array-like accepted by ``torch.tensor``; stored as float32.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length along the
            first dimension.
    """

    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

        # Fail here rather than yielding misaligned pairs or an IndexError
        # in the middle of an epoch. Comparing shape[:1] (instead of len())
        # keeps 0-d inputs constructible, as they were before.
        if self.features.shape[:1] != self.labels.shape[:1]:
            raise ValueError(
                "features and labels must have the same number of rows, got "
                f"{tuple(self.features.shape)} and {tuple(self.labels.shape)}"
            )

    def __len__(self):
        """Return the number of samples (rows of ``features``)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, labels)`` pair at position ``idx``."""
        return self.features[idx], self.labels[idx]

<h1> Hyperbolic </h1>

In [6]:
# Hyperbolic multilayer perceptron
class HYP_MLP(nn.Module):
    """MLP built from hypll ``HLinear`` layers with ``HReLU`` activations.

    Layout: input layer -> activation -> ``num_hidden_layers`` x (hidden layer
    -> activation) -> output layer. No activation follows the output layer.

    Args:
        input_size: Width of the input layer.
        output_size: Width of the output layer.
        layer_size: Width of every hidden representation.
        num_hidden_layers: Number of ``layer_size -> layer_size`` layers
            between the input and output layers (may be 0).
        manifold: Manifold object handed to every hypll layer.
    """

    def __init__(self, input_size, output_size, layer_size, num_hidden_layers, manifold):
        super().__init__()
        # Reseeds torch's global RNG on every construction, so models with the
        # same configuration start from identical initial weights.
        torch.manual_seed(42)

        # Layers are created in forward order so the seeded initialisation is
        # consumed in a fixed sequence.
        self.fc_in = hnn.HLinear(input_size, layer_size, manifold=manifold)
        self.relu = hnn.HReLU(manifold=manifold)

        hidden_layers = []
        for _ in range(num_hidden_layers):
            hidden_layers.append(hnn.HLinear(layer_size, layer_size, manifold=manifold))
        self.hidden_fcs = nn.ModuleList(hidden_layers)

        self.fc_out = hnn.HLinear(layer_size, output_size, manifold=manifold)

    def forward(self, x):
        """Run ``x`` through the network and return the output layer's result.

        NOTE(review): ``x`` is presumably a hypll manifold tensor on the same
        manifold as the layers (the training function maps inputs with
        ``manifold.expmap`` first) -- confirm against the hypll docs.
        """
        out = self.relu(self.fc_in(x))
        for hidden in self.hidden_fcs:
            out = self.relu(hidden(out))
        return self.fc_out(out)


# Training step for the hyperbolic model
def hyp_train_model(model, train_loader, criterion, optimizer, manifold, device):
    """Train ``model`` for one pass over ``train_loader``.

    Each batch is wrapped in a ``TangentTensor`` and mapped with
    ``manifold.expmap`` before being fed to the model; the loss is computed on
    the raw ``.tensor`` of the model output.

    Args:
        model: Network called on the mapped inputs.
        train_loader: Iterable of ``(inputs, targets)`` batches exposing a
            ``dataset`` attribute with a length.
        criterion: Loss callable taking ``(predictions, targets)``.
        optimizer: Optimizer updating ``model``'s parameters.
        manifold: Manifold used for the tangent tensor and the exponential map.
        device: Device the batches are moved to.

    Returns:
        Sum of per-batch loss times batch size, divided by
        ``len(train_loader.dataset)``.
    """
    model.train()
    loss_sum = 0.0

    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()

        # NOTE(review): no base point is passed to TangentTensor, so this is
        # presumably a tangent vector at the origin -- confirm in hypll docs.
        tangent_x = TangentTensor(data=batch_x, man_dim=-1, manifold=manifold)
        manifold_x = manifold.expmap(tangent_x)

        predictions = model(manifold_x)

        batch_loss = criterion(predictions.tensor, batch_y)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        loss_sum += batch_loss.item() * batch_x.size(0)

    return loss_sum / len(train_loader.dataset)

<h1> Euclidean </h1>

In [7]:
# Euclidean multilayer perceptron
class EUC_MLP(nn.Module):
    """Plain fully connected network with ReLU activations.

    Layout: input layer -> ReLU -> ``num_hidden_layers`` x (hidden layer ->
    ReLU) -> output layer. No activation follows the output layer.

    Args:
        input_size: Number of input features.
        output_size: Number of outputs.
        layer_size: Width of every hidden representation.
        num_hidden_layers: Number of ``layer_size -> layer_size`` layers
            between the input and output layers (may be 0).
    """

    def __init__(self, input_size, output_size, layer_size, num_hidden_layers):
        super().__init__()
        # Reseeds torch's global RNG on every construction, so models with the
        # same configuration start from identical initial weights.
        torch.manual_seed(42)

        # Layers are created in forward order so the seeded initialisation is
        # consumed in a fixed sequence.
        self.fc_in = nn.Linear(input_size, layer_size)
        self.relu = nn.ReLU()

        hidden_layers = []
        for _ in range(num_hidden_layers):
            hidden_layers.append(nn.Linear(layer_size, layer_size))
        self.hidden_fcs = nn.ModuleList(hidden_layers)

        self.fc_out = nn.Linear(layer_size, output_size)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        out = self.relu(self.fc_in(x))
        for hidden in self.hidden_fcs:
            out = self.relu(hidden(out))
        return self.fc_out(out)

# Training step for the Euclidean model
def euc_train_model(model, train_loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Network called directly on each input batch.
        train_loader: Iterable of ``(inputs, targets)`` batches exposing a
            ``dataset`` attribute with a length.
        criterion: Loss callable taking ``(predictions, targets)``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batches are moved to.

    Returns:
        Sum of per-batch loss times batch size, divided by
        ``len(train_loader.dataset)``.
    """
    model.train()
    loss_sum = 0.0

    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        loss_sum += batch_loss.item() * batch_x.size(0)

    return loss_sum / len(train_loader.dataset)

In [11]:
# --- Earlier search grids, kept commented out for reference ---------------

# Wide hyperbolic sweep (50 epochs).
# param_grid = {
#     'model_type': ['hyp'],
#     'num_hidden_layers': [0,2,8,12,14,16,18,20],
#     'layer_size': [2,8,16,32,48,64,72,80,96,128,256,512],
#     'lr': [0.018,0.02,0.022],
#     'weight_decay': [0.001],
#     'batch_size': [1024],
#     'epochs': [50],
#     'curvature': [-1]
# }

# Wide Euclidean sweep (50 epochs).
# param_grid = {
#     'model_type': ['euc'],
#     'num_hidden_layers': [0,1,2,3,4,5,8,12],
#     'layer_size': [2,8,16,64,128,192,256,320,448,480,512,544,576],
#     'lr': [0.003,0.004,0.005],
#     'weight_decay': [0.001],
#     'batch_size': [1024],
#     'epochs': [50],
#     'curvature': [-1]
# }

# Euclidean counterpart of the active grid below (same depths and widths).
# param_grid = {
#     'model_type': ['euc'],
#     'num_hidden_layers': [0,1,2,4],
#     'layer_size': [2,4,8,16,32,64,128],
#     'lr': [0.003],
#     'weight_decay': [0.0005],
#     'batch_size': [1024],
#     'epochs': [100],
#     'curvature': [-1]
# }

# --- Active grid ------------------------------------------------------------
# Key order matters: the training loop unpacks each combination positionally
# in exactly this order.
# NOTE(review): 'curvature' is passed as the raw value to hypll's Curvature;
# hypll may apply a constraining function to it, so -1 may not be the
# effective curvature -- confirm against the hypll documentation.
param_grid = {
    'model_type': ['hyp'],
    'num_hidden_layers': [0,1,2,4],
    'layer_size': [2,4,8,16,32,64,128],
    'lr': [0.005],
    'weight_decay': [0.001],
    'batch_size': [1024],
    'epochs': [100],
    'curvature': [-1]
}


# Cartesian product of all value lists, one tuple per configuration, with
# values in the same order as the dict's keys.
param_combinations = list(itertools.product(*param_grid.values()))
# Bare last expression so the notebook shows the number of configurations.
len(param_combinations)

28

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One stats dict is appended per (parameter combination, fold), in loop order.
param_eval_stats = []

for i, params in enumerate(param_combinations):
    print(f'----- Combination {i} -----')
    print(*zip(param_grid.keys(), params))
    # Positional unpacking: must match the key order of param_grid.
    model_type, num_hidden_layers, layer_size, lr, weight_decay, batch_size, epochs, curvature = params

    # Fail fast on an unrecognised model type instead of silently reusing the
    # model/optimizer left over from a previous iteration.
    if model_type not in ('hyp', 'euc'):
        raise ValueError(f"Unknown model_type {model_type!r}; expected 'hyp' or 'euc'")
    is_hyperbolic = model_type == 'hyp'

    for fold, (fold_start, fold_stop) in enumerate(FOLD_INDICIES):
        print(f'Fold {fold}')

        # Fold boundaries are in sample-type units; the training arrays hold
        # NUM_SAMPLES_PER_TYPE rows per type, so scale to get row positions.
        # NOTE(review): training and validation both use the SAME fold's
        # sample types (no held-out fold). If this is meant to be k-fold
        # cross-validation, training should use the other folds -- confirm
        # the intended protocol.
        train_features = ALL_TRAIN_FEATURES[fold_start*NUM_SAMPLES_PER_TYPE:fold_stop*NUM_SAMPLES_PER_TYPE]
        train_labels   =   ALL_TRAIN_LABELS[fold_start*NUM_SAMPLES_PER_TYPE:fold_stop*NUM_SAMPLES_PER_TYPE]
        val_features   = ALL_VAL_FEATURES[fold_start:fold_stop]
        val_labels     =   ALL_VAL_LABELS[fold_start:fold_stop]

        train_dataset = CustomDataset(train_features, train_labels)
        val_dataset = CustomDataset(val_features, val_labels)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # Build manifold, model and optimizer together for the chosen geometry.
        if is_hyperbolic:
            manifold = PoincareBall(c=Curvature(curvature))
            model = HYP_MLP(input_size=len(FEATURE_COLS),
                            output_size=len(LABEL_COLS),
                            layer_size=layer_size,
                            num_hidden_layers=num_hidden_layers,
                            manifold=manifold).to(device)
            optimizer = RiemannianAdam(model.parameters(), lr=lr, weight_decay=weight_decay)
        else:
            manifold = None
            model = EUC_MLP(input_size=len(FEATURE_COLS),
                            output_size=len(LABEL_COLS),
                            layer_size=layer_size,
                            num_hidden_layers=num_hidden_layers).to(device)
            optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

        criterion = nn.MSELoss()

        # Per-epoch history for this (combination, fold).
        eval_stats = {'loss': {'train': [], 'val': []}, 'mae': {'train': [], 'val': []}}

        for epoch in range(epochs):
            # Call order is kept fixed (train, val loss, train MAE, val MAE):
            # iterating the shuffled train loader draws from the RNG.
            if is_hyperbolic:
                eval_stats['loss']['train'].append(hyp_train_model(model, train_loader, criterion, optimizer, manifold, device))
                eval_stats['loss']['val'].append(util.h_evaluate_loss(model, val_loader, criterion, manifold, device))

                eval_stats['mae']['train'].append(util.h_evaluate_mae(model, train_loader, manifold, device))
                eval_stats['mae']['val'].append(util.h_evaluate_mae(model, val_loader, manifold, device))
            else:
                eval_stats['loss']['train'].append(euc_train_model(model, train_loader, criterion, optimizer, device))
                eval_stats['loss']['val'].append(util.evaluate_loss(model, val_loader, criterion, device))

                eval_stats['mae']['train'].append(util.evaluate_mae(model, train_loader, device))
                eval_stats['mae']['val'].append(util.evaluate_mae(model, val_loader, device))

        print(eval_stats['mae']['val'])
        param_eval_stats.append(eval_stats)

----- Combination 0 -----
('model_type', 'hyp') ('num_hidden_layers', 0) ('layer_size', 2) ('lr', 0.005) ('weight_decay', 0.001) ('batch_size', 1024) ('epochs', 100) ('curvature', -1)
Fold 0
[0.3666735, 0.24763982, 0.20159322, 0.16367118, 0.14097694, 0.12265714, 0.103694685, 0.08165606, 0.05926245, 0.04077773, 0.03378935, 0.030922748, 0.030086419, 0.029816201, 0.029611444, 0.029443307, 0.029330874, 0.029212343, 0.029071327, 0.029015116, 0.028907286, 0.028875073, 0.028839787, 0.028694794, 0.028663697, 0.028618649, 0.028508127, 0.028532483, 0.028435102, 0.028428085, 0.028367428, 0.028249847, 0.028204007, 0.028153649, 0.028071932, 0.028071377, 0.027945586, 0.027911022, 0.027901521, 0.027877752, 0.027787456, 0.02778592, 0.027786605, 0.02770563, 0.027696623, 0.027662745, 0.02764553, 0.027559737, 0.027579026, 0.027543338, 0.027535584, 0.02747341, 0.027445767, 0.027518211, 0.02740056, 0.027408777, 0.027419435, 0.027360015, 0.027353274, 0.027366187, 0.027330112, 0.027338095, 0.02732241, 0.0273