In [2]:
import os

# Subject folder and the experiment sub-folder used by this notebook.
subj, exp = "Subj1", "exp_ae"

# Make sure `<subj>/<exp>` exists; exist_ok keeps re-running this cell harmless.
os.makedirs(f"{subj}/{exp}", exist_ok = True)

In [3]:
import abc
import typing
import random
import warnings

import SDA
import SDA.analytics
import SDA.clustquality

import tqdm
import torch
import wandb
import numpy
import pandas
import torch.utils.data
import sklearn.manifold
import sklearn.preprocessing
import sklearn.decomposition

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Backend priority: CUDA first, then MPS, and CPU as the fallback.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

RANDOM_STATE = 42
def set_random_seed(seed = RANDOM_STATE):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) generators with one value.

    Also writes the seed to the PYTHONHASHSEED environment variable and turns
    on the cuDNN deterministic flag.

    Args:
        seed: Value handed to every seeding function; defaults to RANDOM_STATE.
    """
    seeders = (random.seed, numpy.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.backends.cudnn.deterministic = True

cuda


In [4]:
# Reference edges loaded from a text file and cast to int32; they are passed to
# SDA.clustquality.cluster_metrics_ground further down (presumably the
# ground-truth stage boundaries — confirm against the SDA package).
edges_true = numpy.loadtxt(f"{subj}/reproduction/internal/best_edges.txt").astype(numpy.int32)
# Feature table produced by an earlier experiment, stored in Feather format.
features = pandas.read_feather(f'{subj}/exp_final/all_features.feather')
print(features.shape)

# Standardize with StandardScaler. Note that `features` is rebound here from the
# loaded DataFrame to the scaler's output.
features = sklearn.preprocessing.StandardScaler().fit_transform(features)
print(features.shape)

(1046, 19563)
(1046, 19563)


### Autoencoder (AE)

In [5]:
class BaseEncoder(torch.nn.Module, abc.ABC):
    """Shared base for encoders.

    The base class only records the latent size. ``forward`` delegates to
    ``self.model``, which is not created here and must be assigned by the
    subclass.
    """

    def __init__(self, latent_size: int):
        """Store ``latent_size`` on the module."""
        super().__init__()
        self.latent_size = latent_size

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Pass ``input`` through ``self.model`` and return the result."""
        embedding = self.model(input)
        return embedding

class BaseDecoder(torch.nn.Module, abc.ABC):
    """Shared base for decoders.

    The base class only records the latent size. ``forward`` delegates to
    ``self.model``, which is not created here and must be assigned by the
    subclass.
    """

    def __init__(self, latent_size: int):
        """Store ``latent_size`` on the module."""
        super().__init__()
        self.latent_size = latent_size

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Pass ``input`` through ``self.model`` and return the result."""
        recovered = self.model(input)
        return recovered
    
class BaseAutoEncoder(torch.nn.Module, abc.ABC):
    """Autoencoder assembled from an encoder class and a decoder class.

    Each class is instantiated with ``latent_size`` as its only argument.
    Passing ``None`` for either class skips creating that sub-module.
    """

    def __init__(self, latent_size: int, encoder_class = BaseEncoder, decoder_class = BaseDecoder):
        super().__init__()
        self.latent_size = latent_size
        # Encoder first, then decoder — the same creation order as the arguments.
        for attribute, factory in (("encoder", encoder_class), ("decoder", decoder_class)):
            if factory is None:
                continue
            setattr(self, attribute, factory(latent_size))

    def forward(self, input: torch.Tensor) -> typing.Tuple[torch.Tensor, torch.Tensor]:
        """Encode ``input``, decode the embedding, and return ``(reconstruction, embedding)``."""
        latent = self.encoder(input)
        return self.decoder(latent), latent

In [6]:
def test_encoder(encoder: BaseEncoder):
    """Check that a random (10, 19563) batch is encoded to shape (10, latent_size)."""
    batch_size, n_features = 10, 19563
    sample = torch.randn((batch_size, n_features))
    encoded = encoder(sample)
    assert encoded.shape == (batch_size, encoder.latent_size)

def test_decoder(decoder: BaseDecoder):
    """Check that a random (10, latent_size) batch is decoded to shape (10, 19563)."""
    batch_size, n_features = 10, 19563
    latent = torch.randn((batch_size, decoder.latent_size))
    decoded = decoder(latent)
    assert decoded.shape == (batch_size, n_features)

def test_autoencoder(autoencoder: BaseAutoEncoder):
    """Shape-check an autoencoder: first its two halves, then the full forward pass."""
    # Each half is checked on its own before the combined model.
    test_encoder(autoencoder.encoder)
    test_decoder(autoencoder.decoder)

    batch = torch.randn((10, 19563))
    reconstruction, latent = autoencoder(batch)
    assert reconstruction.shape == batch.shape
    assert latent.shape == (10, autoencoder.latent_size)

In [7]:
def autoencoder_loss(input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """Reconstruction loss: squared error summed over all elements (no averaging).

    Args:
        input: Tensor to score (the training loop passes the model's reconstruction).
        target: Tensor it is compared against.

    Returns:
        Scalar tensor with the summed squared error.
    """
    criterion = torch.nn.MSELoss(reduction = 'sum')
    return criterion(input, target)

In [8]:
def train(
    model: BaseAutoEncoder,
    dataloader: torch.utils.data.DataLoader,
    name: str,
    learning_rate: float = 1e-3,
    n_epochs: int = 25,
    lr_lambda: typing.Optional[typing.Callable[[int], float]] = None
) -> BaseAutoEncoder:
    """Train an autoencoder with AdamW and log the run to Weights & Biases.

    Args:
        model: Autoencoder whose forward pass returns ``(reconstruction, embedding)``.
        dataloader: Yields input batches; each batch is also its own target.
        name: Name of the W&B run.
        learning_rate: Initial AdamW learning rate.
        n_epochs: Number of passes over ``dataloader``.
        lr_lambda: Function mapping the epoch index to a learning-rate multiplier
            (given to ``LambdaLR``). When omitted, the notebook-level
            ``lr_scheduler`` function is used, which was the only option before.

    Returns:
        The trained model, moved to the CPU.
    """
    if lr_lambda is None:
        # Backward-compatible default: the schedule defined at notebook level.
        lr_lambda = lr_scheduler

    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    wandb.init(project = "CourseProject", name = name, anonymous = "allow")
    try:
        wandb.watch(model, log = "all")
        model.train() # Nothing in the loop leaves train mode, so entering it once is enough
        for _epoch in tqdm.trange(n_epochs):
            train_loss = 0
            for batch in dataloader:
                batch = batch.to(device) # Move to the device once; used as both input and target
                optimizer.zero_grad() # Zero gradients
                output, _embedding = model(batch) # Get predictions
                loss = autoencoder_loss(output, batch) # Calculate loss (summed over the batch)
                loss.backward() # Calculate gradients
                optimizer.step() # Update weights
                batch_loss = loss.item() # Read the scalar once
                wandb.log({ 'Train batch loss': batch_loss / batch.shape[0] }) # Per-sample batch loss
                train_loss += batch_loss

            scheduler.step()
            new_lr = optimizer.param_groups[0]['lr']
            wandb.log({ 'Train loss': train_loss / len(dataloader.dataset), 'Learning rate': new_lr })
    finally:
        # Close the W&B run even when training raises, so the next call starts a fresh run.
        wandb.finish()
    return model.cpu()

In [9]:
class SimpleEncoder(BaseEncoder):
    """Encoder with one hidden layer: Linear -> BatchNorm1d -> GELU -> Linear.

    Args:
        latent_size: Width of the produced embedding.
        in_features: Width of the input vectors; defaults to 19563, the value
            that used to be hard-coded.
        hidden_size: Width of the hidden layer; defaults to 4096, the value
            that used to be hard-coded.
    """

    def __init__(self, latent_size: int, in_features: int = 19563, hidden_size: int = 4096):
        super().__init__(latent_size)
        # Layer creation order is unchanged, so seeded initialisation stays the same.
        self.model = torch.nn.Sequential(
            torch.nn.Linear(in_features = in_features, out_features = hidden_size),
            torch.nn.BatchNorm1d(hidden_size),
            torch.nn.GELU(),
            torch.nn.Linear(in_features = hidden_size, out_features = latent_size)
        )

class SimpleDecoder(BaseDecoder):
    """Decoder with one hidden layer: Linear -> BatchNorm1d -> GELU -> Linear.

    Args:
        latent_size: Width of the embedding fed to the decoder.
        out_features: Width of the reconstructed vectors; defaults to 19563,
            the value that used to be hard-coded.
        hidden_size: Width of the hidden layer; defaults to 4096, the value
            that used to be hard-coded.
    """

    def __init__(self, latent_size: int, out_features: int = 19563, hidden_size: int = 4096):
        super().__init__(latent_size)
        # Layer creation order is unchanged, so seeded initialisation stays the same.
        self.model = torch.nn.Sequential(
            torch.nn.Linear(in_features = latent_size, out_features = hidden_size),
            torch.nn.BatchNorm1d(hidden_size),
            torch.nn.GELU(),
            torch.nn.Linear(in_features = hidden_size, out_features = out_features)
        )

# Shape checks for the concrete encoder, the concrete decoder, and the autoencoder
# built from them, all with a latent size of 128.
test_encoder(SimpleEncoder(128))
test_decoder(SimpleDecoder(128))
test_autoencoder(BaseAutoEncoder(128, SimpleEncoder, SimpleDecoder))

In [18]:
# Reseed the RNGs before the dataloader and the model below are created.
set_random_seed()

def lr_scheduler(epoch: int):
    """Learning-rate multiplier per epoch: 1 below epoch 400, 0.01 from epoch 400 on.

    ``train`` hands this function to ``LambdaLR`` as the schedule.
    """
    return 1 if epoch < 400 else 0.01

# Standardized features as a float32 tensor; the dataloader yields rows of it.
torch_features = torch.tensor(features, dtype = torch.float32)
print(torch_features.shape)

# batch_size equals the number of samples printed above, so each epoch is a
# single full-batch step (len(dataloader) prints 1).
dataloader = torch.utils.data.DataLoader(torch_features, shuffle = True, batch_size = 1046)
print(len(dataloader))

# Train a 64-dimensional autoencoder for 500 epochs; the W&B run is named "64".
simple_autoencoder = train(
    BaseAutoEncoder(64, SimpleEncoder, SimpleDecoder),
    dataloader,
    name = str(64),
    n_epochs = 500
)

torch.Size([1046, 19563])
1


100%|██████████| 500/500 [04:39<00:00,  1.79it/s]


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.24676258992805755, max=1.…

0,1
Learning rate,████████████████████████████████▁▁▁▁▁▁▁▁
Train batch loss,█▆▅▄▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train loss,█▆▅▄▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Learning rate,1e-05
Train batch loss,64.62622
Train loss,64.62622


### Test: latent-size sweep

In [12]:
# Latent-size sweep: for every n_dim, train an autoencoder, encode all samples,
# run SDA on the embeddings, and keep the best SDA result under each selection key.
stats = [ ]
for n_dim in [ 8, 16, 32, 48, 64 ]:
    set_random_seed() # Same seed for every latent size
    simple_autoencoder = train(
        BaseAutoEncoder(n_dim, SimpleEncoder, SimpleDecoder),
        dataloader, # Required positional argument of train(); omitting it raises TypeError
        name = str(n_dim),
        n_epochs = 75
    )

    # Encode the whole dataset in eval mode without tracking gradients.
    with torch.no_grad():
        features_reduced = (
            simple_autoencoder.encoder.eval().to(device)(torch_features.to(device))
            .cpu().detach().numpy()
        )
    result, df_st_edges = SDA.SDA(scale = False).apply(features_reduced)

    # Score every candidate in 'St_edges' against the reference edges and append
    # the scores as extra columns of the SDA result table.
    metrics = [
        SDA.clustquality.cluster_metrics_ground(edges_true, row)
        for row in result['St_edges']
    ]
    result = pandas.concat([ result, pandas.DataFrame(metrics) ], axis = 1)

    # One best row per selection key, tagged with the latent size and the key used.
    for key in [ 'Avg-Silh', 'FMI' ]:
        best_result = SDA.analytics.best_result(result, key = key, n_stages = 9)
        best_result["n_dim"] = str(n_dim)
        best_result["Key"] = key
        stats.append(best_result)

df = pandas.DataFrame(stats)[["n_dim", "Key", "Ward_dist", "Cen_dist", "Avg-Silh", "Avg-Cal-Har", "Avg-Dav-Bold", "AMI", "ARI", "FMI"]]
df

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01127777777777131, max=1.0)…

100%|██████████| 75/75 [02:40<00:00,  2.14s/it]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Learning rate,██████████████████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train batch loss,▇█▅▆▇▅▆▄▅▄▃▅▅▅▃▃▂▄▄▄▅▃▃▂▂▂▃▂▂▂▂▃▂▁▂▃▂▂▂▁
Train loss,█▆▅▅▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Learning rate,1e-05
Train batch loss,10494.28835
Train loss,10515.6774


Applying to 1046 samples with 8 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

100%|██████████| 75/75 [02:37<00:00,  2.10s/it]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Learning rate,██████████████████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train batch loss,██▇▅▅▄▄▄▃▄▃▃▃▃▂▃▃▃▂▂▂▃▃▂▂▂▂▂▂▁▂▁▁▁▁▁▂▁▁▁
Train loss,█▆▅▅▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Learning rate,1e-05
Train batch loss,6741.79261
Train loss,6583.05903


Applying to 1046 samples with 16 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

100%|██████████| 75/75 [02:38<00:00,  2.12s/it]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Learning rate,██████████████████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train batch loss,█▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
Train loss,█▆▅▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Learning rate,1e-05
Train batch loss,3748.69638
Train loss,3282.05579


Applying to 1046 samples with 32 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

100%|██████████| 75/75 [02:36<00:00,  2.09s/it]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Learning rate,██████████████████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train batch loss,█▆▅▅▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train loss,█▇▅▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Learning rate,1e-05
Train batch loss,2021.4201
Train loss,1805.75021


Applying to 1046 samples with 48 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888884685, max=1.0…

100%|██████████| 75/75 [02:36<00:00,  2.08s/it]


VBox(children=(Label(value='0.001 MB of 0.048 MB uploaded\r'), FloatProgress(value=0.027302939244990148, max=1…

0,1
Learning rate,██████████████████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train batch loss,██▇▆▅▄▄▃▃▃▂▂▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train loss,█▇▆▅▄▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Learning rate,1e-05
Train batch loss,1634.60085
Train loss,1276.50856


Applying to 1046 samples with 64 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

KeyError: "['Algorithm'] not in index"

In [16]:
# Rebuild the summary table from the collected `stats`, keeping only the columns of interest.
df = pandas.DataFrame(stats)[["n_dim", "Key", "Ward_dist", "Cen_dist", "Avg-Silh", "Avg-Cal-Har", "Avg-Dav-Bold", "AMI", "ARI", "FMI"]]
# Display with a fresh 0..n-1 index; the previous index is kept as an `index` column.
df.reset_index()

Unnamed: 0,index,n_dim,Key,Ward_dist,Cen_dist,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold,AMI,ARI,FMI
0,0,8,Avg-Silh,63204.979657,39.912867,0.159279,23.303689,2.545155,0.733872,0.550863,0.627984
1,1,8,FMI,55208.198538,30.01467,0.086165,18.930499,3.25148,0.842444,0.715677,0.755843
2,2,16,Avg-Silh,40448.21475,33.997199,0.079565,12.108401,3.542746,0.797541,0.691501,0.739387
3,3,16,FMI,37935.596402,25.406092,0.033006,11.470425,4.185454,0.846492,0.749003,0.78443
4,4,32,Avg-Silh,34175.746229,26.042359,0.048314,9.345685,4.790176,0.748241,0.572692,0.633779
5,5,32,FMI,35596.988934,26.827121,0.040357,9.741752,4.41448,0.833275,0.741485,0.778264
6,6,48,Avg-Silh,32697.717407,23.863017,0.036317,9.397524,5.131591,0.819781,0.666313,0.71319
7,7,48,FMI,31293.873774,22.811787,0.02959,9.000188,5.17551,0.838254,0.705386,0.74693
8,8,64,Avg-Silh,31228.548982,22.546235,0.030894,8.765175,5.424188,0.809763,0.631904,0.683191
9,9,64,FMI,31442.310111,22.765921,0.026223,8.815357,5.305457,0.852114,0.742823,0.779325
