In [2]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset

class MouseDataset(Dataset):
    def __init__(self, root_dir):
        self.videos_dir = os.path.join(root_dir, 'videos')
        self.pupil_dir = os.path.join(root_dir, 'pupil_center')
        self.behavior_dir = os.path.join(root_dir, 'behavior')
        self.labels_dir = os.path.join(root_dir, 'labels')

        self.file_ids = sorted(
            [f.replace('.npy', '') for f in os.listdir(self.videos_dir) if f.endswith('.npy')],
            key=lambda x: int(x)
        )

    def __len__(self):
        return len(self.file_ids)

    def __getitem__(self, i):
        file_id = self.file_ids[i]

        video = np.load(os.path.join(self.videos_dir, f'{file_id}.npy'))
        pupil = np.load(os.path.join(self.pupil_dir, f'{file_id}.npy'))
        behavior = np.load(os.path.join(self.behavior_dir, f'{file_id}.npy'))
        label = np.load(os.path.join(self.labels_dir, f'{file_id}.npy'))

        video = video[np.newaxis, ...]
        
        return {
            'video': torch.from_numpy(video).float(),
            'pupil_center': torch.from_numpy(pupil).float().transpose(1,0),
            'behavior': torch.from_numpy(behavior).float().transpose(1,0),
            'labels': torch.from_numpy(label).float()
        }

In [3]:
train_set = MouseDataset('/kaggle/input/mousedatasetct/train/data')
validation_set = MouseDataset('/kaggle/input/mousedatasetct/validation/data')
test_set = MouseDataset('/kaggle/input/mousedatasetct/test/data')

In [4]:
train_set[0]['labels'].shape

torch.Size([227, 140])

# BASELINE

**N.B:** TRAIN\_SET

* 0–41: indices associated with clips from *natural\_movie\_one* → one trial every 6
* 42–216: indices associated with clips from *natural\_movie\_three* → one trial every 25



In [5]:
import numpy as np
import statistics

def retrieve_dff_mean_among_trials_for_a_clip(movie_type: str , train_set: MouseDataset, clip: int):
    """
    movie_type:
        - "one"
        - "three"
    
    clip:
        - "one": from 1 to 6 (files from 0.npy to 5.npy), return with clip=2: [1.npy, 7.npy, 13.npy, ... , 37.npy]
        - "three": from 1 to 25 (files from 42.npy to 66.npy), return with clip=2: [43.npy, 68.npy, ... , 212.npy]
    """
    
    offset = 6 if movie_type=="one" else 25

    # Check that parameters passed are consistent with the movie_type
    if movie_type == "one":
        # indices from 0 to 41
        assert 1 <= clip <= 6, f"Per movie_type='one', clip deve essere tra 1 e 6, ricevuto: {clip}"
    elif movie_type == "three":
        # indices from 42 to 216 
        assert 1 <= clip <= 25, f"Per movie_type='three', clip deve essere tra 1 e 25, ricevuto: {clip}"
    else:
        raise ValueError(f"movie_type deve essere 'one' o 'three', ricevuto: {movie_type}")
    
    trials = []
    start_idx = clip-1 if movie_type == "one" else clip+41
    for i in range(7):
        trials.append(train_set[start_idx]['labels'].numpy())
        start_idx += offset
    
    # Mean across trials: (7, 227, 140) → (227, 140)
    mean_across_trials = np.mean(trials, axis=0)
    return mean_across_trials

In [6]:
print(len(test_set)) # 62 → 12 for "one" (first 6 for trial 8) and 50 for "three" (first 25 for trial 8)

62


In [7]:
# here we retrieve trials 8 and 9 of the test for clip 1 of movie one
# N.B: once we compare with trial 8 and another time with trial 9

clip_1_type_one_test_trial8 = test_set[0]['labels'] 
clip_1_type_one_test_trial9 = test_set[6]['labels'] 

In [8]:
# we retrieve trials 8 and 9 of the test for clip 1 of movie one
# N.B: once we compare with trial 8 and another time with trial 9

clip_1_type_three_test_trial8 = test_set[13]['labels'] 
clip_1_type_three_test_trial9 = test_set[38]['labels'] 

In [9]:
# First cell: calculation of the Pearson correlation between clip_1_type_one_train and test trials

import numpy as np

# Then neuron-by-neuron correlation
def compute_neuron_correlations(mean_training, test_trial):
    """Calculates neuron-by-neuron correlation"""
    correlations = []

    print(mean_training.shape)
    for neuron in range(mean_training.shape[0]):
        # Each neuron has 140 time points
        train_neuron = mean_training[neuron, :]  # shape (140,)
        test_neuron = test_trial[neuron, :]      # shape (140,)
        
        # Check variance
        if np.var(train_neuron) > 1e-10 and np.var(test_neuron) > 1e-10:
            corr = np.corrcoef(train_neuron, test_neuron)[0, 1]
            if not np.isnan(corr) and np.isfinite(corr):
                correlations.append(corr)
    
    return {
        'mean_correlation': np.mean(correlations) if correlations else 0.0,
        'valid_neurons': len(correlations),
        'total_neurons': mean_training.shape[0]
    }

In [None]:
def pearson_correlation(predictions, labels):
             
    # Temporal alignment for evaluation
    min_frames = 66
    labels_aligned = labels[..., -min_frames:]
    predictions_aligned = predictions[..., -min_frames:]
    
    y_true = labels_aligned.transpose(1, 0) # (time, neurons)
    y_pred = predictions_aligned.transpose(1,0) # (time, neurons)
        
    correlations = []
        
    for neuron in range(y_true.shape[1]): 
        # for each neuron, retrieve prediction and label
        true_vals = y_true[:, neuron]
        pred_vals = y_pred[:, neuron]
        
        true_var = np.var(true_vals)
        pred_var = np.var(pred_vals)
        
        """
        we filter out neurons with practically constant activity (variance near zero) before calculating the correlation.
        This avoids division by zero in the Pearson correlation formula.
        "Dead" or inactive neurons, common in neural datasets, would have mathematically undefined correlations,
        so they are excluded from metric calculations. Only neurons with significant variability
        both in real data and predictions contribute to the final average correlation metric,
        ensuring statistically valid results.
        """

        if true_var > 1e-10 and pred_var > 1e-10:
            corr = np.corrcoef(true_vals, pred_vals)[0, 1]
            if not np.isnan(corr) and np.isfinite(corr):
                correlations.append(corr)
    
    mean_correlation = np.mean(correlations) if correlations else 0.0
        
    return {
        'eval_single_trial_correlation': mean_correlation,
    }

## SAMPLE-BASED APPROACH


For the baseline calculation, we proceeded as follows:
Our train\_set includes 7 trials (from 0 to 7) for both **natural\_movie\_one** and **natural\_movie\_three**. Based on the processing we performed on the dataset (segmentation), we found that the video of type **one** consists of 6 clips, while the video of type **three** consists of 25 clips. Each clip has a shape of (227, 140).


In [11]:
# calculation of the Pearson correlation between clip_1_type_one_train and clip_1_type_one_test_trial8
clip_1_type_one_train = retrieve_dff_mean_among_trials_for_a_clip("one", train_set, 1)
result = pearson_correlation(clip_1_type_one_train, clip_1_type_one_test_trial8.numpy())
print(result) 

{'eval_single_trial_correlation': 0.1503593076233057}


In [12]:
# calculation of the Pearson correlation between clip_1_type_one_train and clip_1_type_one_test_trial9
result = pearson_correlation(clip_1_type_one_train, clip_1_type_one_test_trial9.numpy())
print(result) 

{'eval_single_trial_correlation': 0.15572641561251682}


In [13]:
# calculation of the Pearson correlation between clip_1_type_one_train and clip_1_type_one_test_trial9
clip_1_type_three_train = retrieve_dff_mean_among_trials_for_a_clip("three", train_set, 1)
result = pearson_correlation(clip_1_type_three_train, clip_1_type_three_test_trial8.numpy())
print(result) # 0.01626231821295263

{'eval_single_trial_correlation': 0.014238362417651177}


In [14]:
# calculation of the Pearson correlation between clip_1_type_three_train and clip_1_type_one_test_trial9
result = pearson_correlation(clip_1_type_three_train, clip_1_type_three_test_trial9.numpy())
print(result) # 0.025573463105490166

{'eval_single_trial_correlation': 0.017345757218436755}


So, we retrieved the trials of the same clip in the test\_set (specifically, trials 8 and 9) and finally calculated the Pearson correlation between the previously obtained NumPy array (the average value, for each neuron, of its intensity across the 7 trials, frame by frame, relative to a selected clip) and the clip from trial 8 first, then with the clip from trial 9.


## APPROACH WITH THE AVERAGE OF SINGLE TRIAL CORRELATIONS


After careful consideration, we concluded that calculating the baseline over all clips in the dataset is a more accurate approach. This is because it provides more meaningful information compared to the sample-based approach and because the model training follows this metric (in fact, the `compute_metrics` method called by the HuggingFace Trainer was implemented specifically to return the average of the single trial correlations across all samples in the test\_set).
Consequently, when performing the model's `evaluate()` on the test\_set, the resulting value can be directly compared to this baseline. Specifically, we first calculated the correlation between the train\_set responses and the test\_set clips related to trial 8, and then calculated the correlation between the train\_set responses and the test\_set clips related to trial 9.


In [15]:
list_array_numpy = [] # will have length = 31
for movie_type in ["one","three"]:
    _range = 6 if movie_type == "one" else 25
    for i in range(_range):
        list_array_numpy.append(retrieve_dff_mean_among_trials_for_a_clip(movie_type, train_set, i+1))

print(list_array_numpy[0].shape)
# Convert: we obtain a 3D array from the list of 2D arrays
array_3d_train = np.stack(list_array_numpy, axis=0)
print(array_3d_train.shape)  # (31, 227, 140)

(227, 140)
(31, 227, 140)


In [16]:
# we split the test set into 2 (trial 8 and 9):
# - 31 clips in total: the first 6 for "movie one" and the other 25 (following) for "movie three" <-> trial 8
# ...
trial_8_list = []
trial_9_list = []
for i in range(len(test_set)):
    if i < 31:
        trial_8_list.append(test_set[i]['labels'].numpy())
    else:
        trial_9_list.append(test_set[i]['labels'].numpy())

array_3d_trial8_test = np.stack(trial_8_list, axis=0)
array_3d_trial9_test = np.stack(trial_9_list, axis=0)
print(array_3d_trial8_test.shape)  # (31, 227, 140)
print(array_3d_trial9_test.shape)  # (31, 227, 140)

(31, 227, 140)
(31, 227, 140)


In [None]:
def compute_metrics(predictions, labels):  
    min_frames = 66
    labels_aligned = labels[..., -min_frames:]
    predictions_aligned = predictions[..., -min_frames:]
    
    example_correlations = [] 
    
   
    for example_idx in range(labels_aligned.shape[0]): # labels_aligned.shape[0] = batch size
       
        y_true_example = labels_aligned[example_idx].T  # (time, neurons)
        y_pred_example = predictions_aligned[example_idx].T  # (time, neurons)
        
        correlations = [] 
        

        for neuron in range(y_true_example.shape[1]):
            true_vals = y_true_example[:, neuron]
            pred_vals = y_pred_example[:, neuron]
            
           
            true_var = np.var(true_vals)
            pred_var = np.var(pred_vals)
            
            if true_var > 1e-10 and pred_var > 1e-10:
                corr = np.corrcoef(true_vals, pred_vals)[0, 1]
                if not np.isnan(corr) and np.isfinite(corr):
                    correlations.append(corr)
        
  
        mean_corr_example = np.mean(correlations) if correlations else 0.0
        example_correlations.append(mean_corr_example) 
    
   
    overall_mean = np.mean(example_correlations) if example_correlations else 0.0
    
    return {
        'eval_average_single_trial_correlation': overall_mean,
        'eval_single_trial_std': np.std(example_correlations) if example_correlations else 0.0,
        'eval_num_examples': len(example_correlations),
    }

In [18]:
import statistics

results = [
    compute_metrics(array_3d_train, array_3d_trial8_test)["eval_average_single_trial_correlation"],
    compute_metrics(array_3d_train, array_3d_trial9_test)["eval_average_single_trial_correlation"]
]
print(results)

[0.007475139404624604, 0.039593989755850795]


In [19]:
print(f"Baseline among all data: {statistics.mean(results)} ")

Baseline among all data: 0.0235345645802377 
