In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import sys
import os
import copy
import pathlib

import numpy as np
import matplotlib.pyplot as plt
import scipy

import torch
import torchvision
import torchvision.transforms as transforms

# for creating validation set
from sklearn.model_selection import train_test_split

%matplotlib notebook
%matplotlib notebook
# %matplotlib inline
# %matplotlib qt

In [3]:
import sys

sys.path.append('/Users/Josh/Documents/github_repos/')

%load_ext autoreload
%autoreload 2
from basic_neural_processing_modules import torch_helpers, decomposition
from GCaMP_ROI_classifier.old_stuff import util, models, training_simCLR, augmentation

## Import training data

In [4]:
import pickle

with open('/Users/Josh/Documents/Harvard/label_data/all_stat_files_20211022.pkl', 'rb') as file:
    statFiles_scraped = pickle.load(file)

In [5]:
keys_toDelete = [
    '\\\\research.files.med.harvard.edu\\Neurobio\\MICROSCOPE\\Rich\\data\\res2p\\scanimage data\\for Loic\\16x_analysis20200514\\Suite2p and MC for denoised movie\\suite2p\\plane0\\stat.npy',
    '\\\\research.files.med.harvard.edu\\Neurobio\\MICROSCOPE\\Rich\\data\\res2p\\scanimage data\\round 4 experiments\\mouse 6.28\\20201102\\suite2p_accidentally_run_on_everything\\plane0\\stat.npy',
    '\\\\research.files.med.harvard.edu\\Neurobio\\MICROSCOPE\\Rich\\data\\res2p\\scanimage data\\for Loic\\20x\\mouse 2.5\\20200306\\suite2p attemp 2 _tau 1_5\\plane0\\stat.npy',
    '\\\\research.files.med.harvard.edu\\Neurobio\\MICROSCOPE\\Rich\\data\\res2p\\scanimage data\\for Loic\\20x\\mouse 2.5\\20200306\\suite2p attempt 1 _tau 2\\plane0\\stat.npy',
    '\\\\research.files.med.harvard.edu\\Neurobio\\MICROSCOPE\\Rich\\data\\res2p\\scanimage data\\for Loic\\20x\\mouse 2.5\\20200306\\suite2p attempt 3 _tau 1\\plane0\\stat.npy',
    '\\\\research.files.med.harvard.edu\\Neurobio\\MICROSCOPE\\Rich\\data\\res2p\\scanimage data\\for Loic\\AAV\\oldBMIrampExp_20191112_mouse1013A\\ROI extraction\\suite2p\\plane0\\stat.npy',
]
for key in keys_toDelete:
    del statFiles_scraped[key]

In [6]:
sf_list = util.convert_multiple_stat_files(statFiles_dict=statFiles_scraped, out_height_width=[36,36], max_footprint_width=901, plot_pref=False)

images_all = np.concatenate(sf_list, axis=0)

images_all.shape

(217802, 36, 36)

In [7]:
images_all_norm = (images_all / np.sum(images_all, axis=(1,2), keepdims=True)) * 255

In [8]:
# create validation set
# X_train, X_val, y_train, y_val = train_test_split(images[:], labels[:], test_size = 0.15)
X_train, X_val, y_train, y_val = train_test_split(images_all_norm, np.arange(images_all.shape[0]), test_size = 0.15)
(X_train.shape, y_train.shape), (X_val.shape, y_val.shape)

(((185131, 36, 36), (185131,)), ((32671, 36, 36), (32671,)))

In [23]:
# dir_folders = r'/media/rich/bigSSD/for_Josh/GCAMP_ROI_Classifier_data/label_data'
dir_folders = r'/users/Josh/Documents/Harvard/label_data'
folders = [r'mouse 6_28 _ day 20200903/',
             r'mouse6_28 _ day20200815/']
fileNames_statFiles = [r'stat.npy']*len(folders)
paths_statFiles = [pathlib.Path(dir_folders) / folders[ii] / fileNames_statFiles[ii] for ii in range(len(folders))]

statFile_import_kwargs = {'out_height_width': [36,36],
                          'max_footprint_width': 241,
                          'plot_pref':True}

sf_all = util.import_multiple_stat_files(   paths_statFiles=paths_statFiles,
                                        fileNames_statFiles=fileNames_statFiles,
                                        out_height_width=[36,36],
                                        max_footprint_width=241,
                                        plot_pref=True)
images_labeled = np.concatenate(sf_all, axis=0) *255
print(f'concatenated images shape: {images_labeled.shape}')

fileNames_labelFiles = ['labels_posthoc_filledIn_allCells.npy',
             'labels_posthoc_all.npy']
paths_labelFiles = [pathlib.Path(dir_folders) / folders[ii] / fileNames_labelFiles[ii] for ii in range(len(folders))]

labels_all = util.import_multiple_label_files(paths_labelFiles=paths_labelFiles,
                                       plot_pref=True)
labels = np.concatenate(labels_all)

assert np.alltrue([sf_all[ii].shape[0] == labels_all[ii].shape[0] for ii in range(len(sf_all))]) , 'num images in stat files does not correspond to num labels'

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

concatenated images shape: (9715, 36, 36)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Balance classes of inputs

In [25]:
duplicates = 1

images_dup = np.tile(images_labeled , (duplicates , 1 , 1))
labels_dup = np.tile(labels , (duplicates))

print(images_dup.shape)
print(labels_dup.shape)

numToGetTo = np.sum(labels_dup==0)
print(numToGetTo)
for ii in np.array([1,4,5]):
#     idxToDelete = np.cumsum(labels_dup==ii) <= (np.sum(labels_dup==ii) - numToGetTo)
    if ii==5:
        numToGetTo = np.sum(labels_dup==0)/2
    else:
        numToGetTo = np.sum(labels_dup==0)

    idxToDelete = (np.cumsum(labels_dup==ii) * (labels_dup==ii)) > numToGetTo
    images_dup = images_dup[idxToDelete==0,:,:]
    labels_dup = labels_dup[idxToDelete==0]


print(np.sum(labels_dup==0))
print(np.sum(labels_dup==1))
print(np.sum(labels_dup==4))
print(np.sum(labels_dup==5))
print(images_dup.shape)
print(labels_dup.shape)

(9715, 36, 36)
(9715,)
532
532
532
532
266
(1863, 36, 36)
(1863,)


In [27]:
# create validation set
# X_train, X_val, y_train, y_val = train_test_split(images[:], labels[:], test_size = 0.15)
X_train, X_val, y_train, y_val = train_test_split(images_dup[:], labels_dup[:], test_size = 0.15)
(X_train.shape, y_train.shape), (X_val.shape, y_val.shape)

(((1583, 36, 36), (1583,)), ((280, 36, 36), (280,)))

In [28]:
plt.figure()
plt.hist(labels_dup, 20);

<IPython.core.display.Javascript object>

## Define augmentation pipeline

In [10]:
import torchvision.transforms    

transforms = torch.nn.Sequential(
#     torchvision.transforms.RandomHorizontalFlip(p=0.5),
    
#     torchvision.transforms.GaussianBlur(kernel_size=5,
#                                         sigma=(0.0001, 0.5)),
    
#     torchvision.transforms.RandomPerspective(distortion_scale=0.4, 
#                                              p=0.5, 
#                                              interpolation=torchvision.transforms.InterpolationMode.BILINEAR, 
#                                              fill=0),
#     torchvision.transforms.RandomAffine(
#                                         degrees=(-180,180),
#                                         translate=(0.3, 0.3),
#                                         scale=(0.7, 1.7), 
#                                         shear=(-30, 30, -30, 30), 
#                                         interpolation=torchvision.transforms.InterpolationMode.BILINEAR, 
#                                         fill=0, 
#                                         fillcolor=None, 
#                                         resample=None),
    
#     augmentation.AddPoissonNoise(   level_bounds=(0,0.7),
#                                     prob=1),

#     augmentation.AddGaussianNoise(  mean=0, 
#                                     std=0.4,
#                                     prob=1),
)
# scripted_transforms = torch.jit.script(transforms)
scripted_transforms = transforms


dataset_train = util.dataset_simCLR(torch.tensor(X_train), 
                                    y_train, 
                                    n_transforms=2, 
                                    transform=scripted_transforms,
                                    DEVICE='cpu',
                                    dtype_X=torch.float32,
                                    dtype_y=torch.int64 )

dataloader_train = torch.utils.data.DataLoader( dataset_train,
                                                batch_size=1024,
            #                                     sampler=sampler,
                                                shuffle=True,
                                                drop_last=True,
                                                pin_memory=False,
#                                                 num_workers=32,
                                                num_workers=16,
                                                prefetch_factor=3,
                                                persistent_workers=True,
                                                )

# dataset_val = util.dataset_simCLR(torch.tensor(X_val), 
#                                     y_val, 
#                                     n_transforms=2, 
#                                     transform=scripted_transforms,
#                                     DEVICE='cpu',
#                                     dtype_X=torch.float32,
#                                     dtype_y=torch.int64 )

# dataloader_val = torch.utils.data.DataLoader(   dataset_val,
#                                                 batch_size=128,
#             #                                     sampler=sampler,
#                                                 shuffle=True,
#                                                 drop_last=True,
#                                                 pin_memory=True,
#                                                 num_workers=0,
#                                                 )         

## Train

In [11]:
DEVICE = torch_helpers.set_device(use_GPU=True)

no GPU available. Using CPU.


In [12]:
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

model = models.LeNet1(dropout_prob=0.3, momentum_val=0, n_output_features=128)
model.train()
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-3, weight_decay=0.05)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                   gamma=1-0.0005,
#                                                    gamma=1,
                                                  )
val_obj = training_simCLR.validation_Obj(   X_val,
                                            y_val,
                                            model,
                                            criterion,
                                            DEVICE=DEVICE,
                                            dtype_X=torch.float32,
                                            dtype_y=torch.int64)

model.to(DEVICE)
criterion = criterion.to(DEVICE)

losses_train, losses_val, val_accs = [], [np.nan], []

In [13]:
# n_epochs=300000
# for epoch in range(n_epochs):
#     print(f'epoch: {epoch}')
#     training_simCLR.epoch_step( dataloader_train, 
#                                 model, 
#                                 optimizer, 
#                                 criterion, 
#                                 scheduler=scheduler, 
#                                 temperature=0.5,
#                                 loss_rolling_train=losses_train, 
#                                 device=DEVICE, 
#                                 do_validation=False,
#                                 validation_Object=val_obj,
#                                 loss_rolling_val=losses_val,
#                                 verbose=2,
#                                 verbose_update_period=10,
#                                )

In [14]:
load_mdl = torch.load('/Users/josh/Documents/Harvard/pretrained/Net_trainedOnAug_20211025_trainingSet_mouse628_20200903and20200815_simCLR.pth',
                 map_location=torch.device('cpu'))
model.load_state_dict(load_mdl)
# model.load_state_dict('/Users/josh/Documents/Harvard/pretrained/Net_trainedOnAug_20211025_trainingSet_mouse628_20200903and20200815_simCLR.pth')

<All keys matched successfully>

In [15]:
plt.figure()
plt.plot(losses_train)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fa8d0c18d00>]

In [64]:
# for v in (dataloader_train):
#     print(v)
#     print(len(v))
#     model(v)
#     break

# model(torch.tensor(X_train[:,None,:,:]).float())
tm = training_classHead.get_simCLR_interim(model, X_train[:, None, :, :])
# tm = training_classHead.get_simCLR_output(model, X_train[:, None, :, :])

In [65]:
tm.shape

(1583, 64)

In [73]:
sys.path.append('/Users/Josh/Documents/github_repos/GCaMP_ROI_classifier/old_stuff/')

import training_classHead


In [114]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

PredictionClass = SVC

# logreg = training_classHead.train_Logistic(model(torch.tensor(X_train[:,None,:,:]).float()).detach().numpy(), y_train)
# logreg = training_classHead.train_logistic()
logreg = training_classHead.train_head(PredictionClass, model, X_train[:, None, :, :], y_train)

In [115]:
logreg.score(training_classHead.get_simCLR_interim(model, X_val[:,None,:,:]), y_val)

0.6392857142857142

In [117]:
prediction_proba = training_classHead.predict_head(model, logreg, X_val[:,None,...])
# prediction = np.argmax(predict_proba_vals, axis=1)
prediction

array([0, 3, 0, 0, 1, 3, 4, 1, 0, 4, 3, 3, 1, 0, 0, 1, 1, 1, 0, 3, 1, 1,
       4, 0, 0, 0, 4, 0, 4, 3, 1, 0, 0, 3, 0, 3, 3, 1, 0, 0, 3, 0, 0, 1,
       3, 3, 1, 0, 3, 0, 3, 0, 0, 0, 3, 0, 3, 3, 0, 0, 0, 1, 3, 0, 3, 0,
       3, 0, 0, 4, 4, 0, 0, 0, 0, 3, 3, 1, 1, 1, 0, 1, 4, 1, 3, 3, 3, 3,
       3, 4, 4, 3, 1, 0, 0, 3, 3, 1, 0, 4, 3, 3, 3, 3, 3, 1, 3, 0, 3, 4,
       1, 0, 0, 0, 1, 1, 3, 0, 3, 3, 4, 3, 0, 3, 1, 0, 0, 0, 3, 0, 1, 3,
       3, 4, 3, 1, 0, 3, 3, 3, 1, 3, 4, 3, 3, 0, 1, 0, 0, 0, 1, 0, 0, 3,
       0, 3, 3, 4, 0, 0, 0, 0, 1, 0, 0, 4, 3, 0, 0, 0, 0, 3, 0, 3, 3, 3,
       0, 0, 3, 0, 1, 0, 3, 3, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 3, 0, 3, 0,
       3, 3, 3, 1, 1, 0, 0, 0, 3, 0, 0, 3, 0, 1, 4, 3, 0, 3, 0, 4, 3, 3,
       3, 0, 0, 3, 0, 1, 0, 0, 1, 3, 4, 3, 0, 3, 4, 4, 4, 0, 0, 0, 3, 3,
       0, 1, 3, 1, 1, 0, 4, 0, 0, 3, 3, 0, 3, 3, 4, 4, 0, 3, 3, 0, 3, 3,
       0, 0, 0, 0, 4, 3, 1, 0, 0, 0, 3, 3, 0, 0, 3, 0])

In [118]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, prediction)

array([[64, 12,  2,  0,  0],
       [41, 22, 16,  0,  0],
       [ 0,  0,  0,  0,  0],
       [11,  8, 58,  1,  0],
       [ 0,  0, 18, 27,  0]])

In [119]:
import pandas as pd
with pd.option_context('max_rows', 3000):
    display(pd.DataFrame(np.concatenate([y_val.reshape(-1,1), prediction.reshape(-1,1)], axis=1)))

Unnamed: 0,0,1
0,0,0
1,4,3
2,1,0
3,4,0
4,1,1
5,4,3
6,5,4
7,0,1
8,0,0
9,5,4


In [103]:
# training_classHead.get_simCLR_interim(model, X_val[:,None,:,:])

## Check embeddings

In [None]:
torch.cuda.empty_cache()

In [None]:
features = model(torch.tensor(images_labeled, device=DEVICE, dtype=torch.float32)[:,None,...]).detach()
# features = model(torch.tensor(X_val, device=DEVICE, dtype=torch.float32)[:,None,...]).detach()
# features = model(torch.tensor(X_train, device=DEVICE, dtype=torch.float32)[:,None,...]).detach()

In [None]:
_, features_embedded, _, _ = decomposition.torch_pca(features, device=DEVICE, return_cpu=True)

In [None]:
from sklearn import manifold

tsne = manifold.TSNE(n_components=2, 
                     perplexity=90.0, 
                     early_exaggeration=12.0, 
                     learning_rate=200, 
                     n_iter=1000, 
                     n_iter_without_progress=300, 
                     min_grad_norm=1e-07, 
                     metric='euclidean', 
                     init='pca', 
                     verbose=0, 
                     random_state=None, 
                     method='barnes_hut', 
                     angle=0.5, 
                     n_jobs=-1, 
#                      square_distances='legacy'
                    )
features_embedded = tsne.fit_transform(features.cpu())

In [None]:
features_embedded

In [None]:
import matplotlib as mpl

mpl.rcParams['image.cmap'] = 'Set1'
%matplotlib notebook
plt.figure()
plt.scatter(features_embedded[:,0], features_embedded[:,1], c=labels)
# plt.scatter(features_embedded[:,0], features_embedded[:,1], c=y_val)
# plt.scatter(features_embedded[:,4], features_embedded[:,5], c=y_train)
# plt.scatter(features_embedded[:,11], features[:,43].cpu(), c=y_train)
mpl.rcParams['image.cmap'] = 'viridis'


## Check filters

In [None]:
list(model.state_dict().keys())

In [None]:
layer_1 = model.state_dict()['cnn_layers.0.weight'].cpu()
layer_2 = model.state_dict()['cnn_layers.2.weight'].cpu()
layer_3 = model.state_dict()['cnn_layers.5.weight'].cpu()
layer_4 = model.state_dict()['cnn_layers.8.weight'].cpu()

plt.figure()
for ii in range(layer_1.shape[1]):
    for jj in range(layer_1.shape[0]):
        plt.subplot2grid((layer_1.shape[1],layer_1.shape[0]),(ii,jj))
        fig = plt.imshow(layer_1[jj,ii,:,:] , clim=(-0.2,0.2))
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)

plt.figure()
for ii in range(layer_2.shape[1]):
    for jj in range(layer_2.shape[0]):
        plt.subplot2grid((layer_2.shape[1],layer_2.shape[0]),(ii,jj))
        fig = plt.imshow(layer_2[jj,ii,:,:], clim=(-.05,.05))
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)

plt.figure()
for ii in range(16):
    for jj in range(16):
        plt.subplot2grid((16,16),(ii,jj))
        fig = plt.imshow(layer_3[jj, ii,:,:])
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)
        
plt.figure()
for ii in range(16):
    for jj in range(16):
        plt.subplot2grid((16,16),(ii,jj))
        fig = plt.imshow(layer_4[jj, ii,:,:])
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)

In [None]:
# torch.save(model.state_dict(), '/media/rich/bigSSD/Net_trainedOnAug_20211025_trainingSet_mouse628_20200903and20200815_simCLR.pth')

In [None]:
# model = Net()
# model.load_state_dict(torch.load('test_save.pth'))
# model.eval()