In [67]:
from self_supervised_halos.utils.utils import data_preprocess_path, check_cuda
from scripts.classification_3d import ClassificationModel, report_classification_performance

from self_supervised_halos.utils.dataloader import HaloDataset, img3d_transform, subhalos_df, DataLoader

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

# Select the compute device used by every later cell (loss weights, model passes,
# feature extraction). The recorded output below shows check_cuda() printing the
# CUDA availability and the chosen device.
device = check_cuda()

CUDA is not available.
Device: cpu


# Data loaders

In [None]:
# Build the halo dataset with only the 3D inputs switched on; the 2D and mass
# inputs are disabled. DEBUG_LIMIT_FILES=None is presumably "no file limit" -
# TODO confirm against HaloDataset.
dataset = HaloDataset(
    root_dir=data_preprocess_path,
    subhalos_df=subhalos_df,
    load_2d=False,
    load_3d=True,
    load_mass=False,
    choose_two_2d=False,
    DEBUG_LIMIT_FILES=None,
)
                  

In [70]:
# Use a larger batch when the data is going to a GPU.
if device=='cpu':
    batch_size = 128
else:
    print('dataloader on gpu')
    batch_size = 512

print(f'Batch size: {batch_size}')

# Target split fractions; the test split takes whatever remains after rounding.
n_data = len(dataset)
f_train = 0.6
f_val = 0.2
f_test = 1 - f_train - f_val

train_size = int(f_train * n_data)
val_size = int(f_val * n_data)
test_size = n_data - train_size - val_size

# Seed the split so train/val/test membership is identical on every run.
# Without this, a checkpoint restored with model.load() could be evaluated on
# samples it was trained on in an earlier session.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)

# A single three-way split avoids re-deriving the train size from a float ratio,
# which could shift it by one sample through truncation.
train_ds, val_ds, test_ds = torch.utils.data.random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# Only the training loader needs shuffling; a fixed order for validation/test
# keeps per-sample evaluation outputs reproducible.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)


Batch size: 128


# Training loop

In [72]:
# Optimisation hyper-parameters.
lr = 5e-3
n_epochs = 5

# Cross-entropy weighted per class with the weights provided by the dataset.
class_weights = dataset.mass_bins_weights.to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights).to(device)

optimizer_params = {'lr': lr}
scheduler_params = {'step_size': 15, 'gamma': 0.5}

model = ClassificationModel(
    optimizer_class=torch.optim.Adam,
    optimizer_params=optimizer_params,
    scheduler_class=torch.optim.lr_scheduler.StepLR,
    scheduler_params=scheduler_params,
    criterion=criterion,
    history=None,
    transform=None,  # TODO add 3d transform
)

# Try to restore a previous checkpoint. In the recorded run the file was missing:
# load() printed a "not found" message and the cell carried on.
model.load('Classification_3d.pth')

# Run the whole training loader through the model once before training
# (limit_to_first_batch=False); the trailing ';' hides the return value.
model.trial_forward_pass(train_loader, device, limit_to_first_batch=False);

Model Classification_3d not found at /Users/sdbykov/work/self_supervised_halos//results/models/Classification_3d.pth


Trial Forward Pass: 100%|██████████| 78/78 [00:44<00:00,  1.77it/s]

Trial forward pass elapsed time: 44.16 s (limit_to_first_batch=False)





In [None]:
# Train for n_epochs on the training loader, passing the validation loader as well.
training_kwargs = dict(
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=n_epochs,
    device=device,
)
model.training_loop(**training_kwargs)

In [None]:
%matplotlib inline
plt.plot(model.history['train_loss'], label='train')
plt.plot(model.history['val_loss'], label='val')
plt.legend()
plt.show()

In [None]:
# Persist the trained model. No path is passed, so the destination is chosen by
# ClassificationModel - presumably the same results/models location that load()
# searched above; TODO confirm.
model.save()

# Measure performance

In [None]:
# Visual check of the model's input transforms on the training loader.
# NOTE(review): the model above was constructed with transform=None, so confirm
# what show_transforms displays in that case. The trailing ';' hides the return value.
model.show_transforms(train_loader, device);

In [None]:
# Predictions on the validation split, kept for the table below.
result_df = report_classification_performance(model, val_loader, device=device)

# Second call with viz_one=True; its return value is not used here.
report_classification_performance(model, val_loader, device=device, viz_one=True)

# True class (rows) against predicted class (columns), with row/column totals.
true_class = result_df['true_class']
pred_class = result_df['pred_class']
pd.crosstab(true_class, pred_class, margins=True)

# Embeddings

In [None]:
from openTSNE import TSNE #!conda install --channel conda-forge opentsne -y
from tqdm import tqdm 

def gather_cnn_features(model, loader, device, transform = None):
    """Run every batch of ``loader`` through the model's CNN and collect the outputs.

    Parameters
    ----------
    model : object exposing ``model.model`` (put into eval mode here) with a
        ``cnn`` sub-module that is called on the images.
    loader : iterable of ``(data, label)`` batches. The images are taken from
        ``data[0]`` and the class labels from ``label[1]``.
    device : device the images are moved to before the forward pass.
    transform : optional callable applied to the image batch after it has been
        moved to ``device``.

    Returns
    -------
    (features, labels) : two numpy arrays, concatenated over all batches along
        the first axis.

    Raises
    ------
    ValueError
        Raised by ``np.concatenate`` if ``loader`` yields no batches.
    """
    model.model.eval()
    cnn = model.model.cnn
    features = []
    labels = []
    with torch.no_grad():
        for data, label in tqdm(loader, total=len(loader)):
            img = data[0].to(device)
            if transform is not None:
                img = transform(img)

            features.append(cnn(img).cpu().numpy())
            # Labels are only converted to numpy, so they never need to visit `device`.
            labels.append(label[1].cpu().numpy())
    return np.concatenate(features), np.concatenate(labels)

# `img2d_transform` is never imported in this notebook (only `img3d_transform` is),
# so the previous call raised NameError. The classifier above was constructed with
# transform=None, so features are extracted without a transform to match it.
# Switch both places to `img3d_transform` once the model uses it.
features, labels = gather_cnn_features(model, train_loader, device, transform=None)

In [None]:
# openTSNE settings gathered in one place so they are easy to adjust.
tsne_settings = dict(
    n_components=2,
    perplexity=30,
    metric="euclidean",
    n_jobs=8,
    random_state=42,
    verbose=True,
)
tsne = TSNE(**tsne_settings)

# Timing notes recorded by the author:
#   ~3 min 30 s for the first ~1000 examples of the 78 train batches (~1000 of ~9000 halos)
#   ~3 min 30 s for all examples of the 78 train batches (~9000 halos)
features_embedding = tsne.fit(features)

In [None]:
# Number of embedded points to draw (all of them by default).
limit_labels = len(features_embedding)

points = np.asarray(features_embedding)[:limit_labels]
shown_labels = labels[:limit_labels]
unique_labels = np.unique(shown_labels)

# One panel with every point plus one panel per class, laid out on 4 rows.
n_rows = 4
n_panels = len(unique_labels) + 1
n_cols = -(-n_panels // n_rows)  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize = (15,15))
axs = axs.flatten()

axs[0].scatter(points[:, 0], points[:, 1], c = 'k', s = 1)
axs[0].set_title('all')

for i, label in enumerate(unique_labels):
    mask = shown_labels == label
    axs[i+1].scatter(points[mask, 0], points[mask, 1], s = 30, alpha = 0.4)
    axs[i+1].set_title(label)

# Label every used panel explicitly instead of relying on the loop index leaking
# out of the loop, and hide the grid cells that hold no data.
for ax in axs[:n_panels]:
    ax.set_xlabel('tsne1')
    ax.set_ylabel('tsne2')
for ax in axs[n_panels:]:
    ax.set_visible(False)

plt.show()



In [None]:
# All classes in one scatter plot, limited to the first `max_points` embedded points.
# The class mask must be built from the same slice it is applied to: a mask over
# labels[:1000] cannot index the full embedding (IndexError when it has more rows).
# The rows come from the shuffled training loader, so the leading points are a
# random subset rather than a biased one.
max_points = 1000
n_shown = min(max_points, len(features_embedding))
points_shown = np.asarray(features_embedding)[:n_shown]
labels_shown = labels[:n_shown]

plt.figure(figsize=(10,10))
for label in np.unique(labels_shown):
    idx = labels_shown == label
    plt.scatter(points_shown[idx, 0], points_shown[idx, 1], label=str(label), s=30, alpha = 0.5)

plt.xlabel('tsne1')
plt.ylabel('tsne2')
plt.legend()
plt.show()

In [None]:
# Debug inspection: `idx` is the boolean class mask left over from the last
# iteration of the plotting loop in the previous cell.
idx

In [None]:
# Debug inspection: first t-SNE coordinate of every embedded point.
features_embedding[:,0]