Use cross-validated ridge regression (`RidgeCV`) to fit the model with the design matrix, and compare the result to the existing `"betas"` values stored in the data file.

In [1]:
import itertools
import re
from pathlib import Path

import numpy as np
import pandas as pd
import scipy
import scipy.io
from matplotlib import pyplot as plt
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

Splitting function from regbench repo:

In [2]:
def _fold_sizes(n_items, n_folds):
    '''Sizes of n_folds consecutive folds covering n_items; the first n_items % n_folds folds get one extra item.'''
    sizes = np.full(n_folds, n_items // n_folds, dtype=int)
    sizes[: n_items % n_folds] += 1
    return sizes


def _contiguous_folds(ordered_idx, sizes):
    '''Cut ordered_idx into consecutive test blocks of the given sizes; each train set is the remainder, order kept.'''
    folds = []
    start = 0
    for size in sizes:
        stop = start + size
        test_idx = ordered_idx[start:stop]
        train_idx = np.concatenate((ordered_idx[:start], ordered_idx[stop:]))
        folds.append((train_idx, test_idx))
        start = stop
    return folds


def split_by_trials(data, opts):
    '''
    Build outer and inner cross-validation indices from trial-shuffled frames.

    Frames are grouped into consecutive trials of opts['frames_per_trial']
    frames, the order of the trials is shuffled with a seeded generator, and the
    resulting frame sequence is cut into contiguous folds.
    Based on Max Melin's ridge-model-matlab-baseline package.

    Parameters
    ----------
    data : sized object
        Only len(data) is used; it is taken as the total number of frames.
    opts : dict
        'frames_per_trial' : number of frames in one trial.
        'out_folds'        : number of outer (test) folds.
        'in_folds'         : number of inner (train) folds.
        'seed'             : optional seed for the trial shuffle (default 4).

    Returns
    -------
    out_split : list of (train_idx, test_idx)
        One pair per outer fold. Both arrays hold frame indices into data.
    in_split : list of (train_idx, test_idx)
        One pair per inner fold. Both arrays hold positions
        0 .. len(data) - out_sizes[0] - 1, i.e. positions inside the training
        set of the first outer fold, not frame indices into data.

    Raises
    ------
    ValueError
        If len(data) is not a whole number of trials.

    Notes
    -----
    Fold boundaries are computed in frames. When a fold size is not a multiple
    of frames_per_trial, the trial at the boundary is divided between
    neighbouring folds.
    The inner split is sized from the first outer fold only, which is the
    largest one when len(data) is not divisible by out_folds.
    '''
    n_frames = len(data)
    frames_per_trial = opts['frames_per_trial']
    if n_frames % frames_per_trial != 0:
        # Without this check the reshape below fails with a message that does not name the cause.
        raise ValueError(
            f"len(data)={n_frames} is not a multiple of "
            f"frames_per_trial={frames_per_trial}"
        )

    # Shuffle whole trials, then expand each trial back into its frames.
    rng = np.random.default_rng(seed=opts.get('seed', 4))
    trial_order = rng.permutation(n_frames // frames_per_trial)
    frame_idx = np.arange(n_frames).reshape(-1, frames_per_trial)[trial_order, :].flatten()

    out_sizes = _fold_sizes(n_frames, opts['out_folds'])
    out_split = _contiguous_folds(frame_idx, out_sizes)

    # Inner folds index into an already shuffled training set, so plain positions are enough.
    n_train = n_frames - out_sizes[0]
    in_split = _contiguous_folds(np.arange(n_train), _fold_sizes(n_train, opts['in_folds']))

    return out_split, in_split

In [3]:
data_path = Path('../data/mSM63_09-Aug-2018.mat')
# loadmat lives in the scipy.io submodule, which needs `import scipy.io`;
# a bare `import scipy` does not guarantee the submodule is loaded.
mat = scipy.io.loadmat(data_path)

# Fail here, with a readable message, if the file lacks a variable the later cells index into.
missing_keys = {'R', 'zeromeanVc', 'regLabels', 'regIdx'}.difference(mat)
if missing_keys:
    raise KeyError(f'{data_path} is missing expected variables: {sorted(missing_keys)}')
mat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Vm', 'zeromeanVc', 'U', 'R', 'betas', 'lambdas', 'cMap', 'cMovie', 'regLabels', 'regIdx', 'rejIdx', 'regZeroFrames'])

prepare design matrix

In [4]:
# Each entry of mat['regLabels'][0] wraps one label string; unwrap it.
reg_labels_raw = [label[0] for label in mat['regLabels'][0]]
# Convert camelCase labels to snake_case.
reg_labels = [re.sub(r'(?<!^)(?=[A-Z])', '_', label).lower()
              for label in reg_labels_raw]
reg_idx = np.array(mat['regIdx'])-1 # -1 to convert to Python indexing
# Name every column of R by looking up its own regressor id. Repeating each
# label by its count is only correct when regIdx is sorted by regressor;
# the direct lookup gives the same names in that case and stays aligned otherwise.
design_df_columns = [reg_labels[reg_id] for reg_id in reg_idx.ravel().astype(int)]
# Columns that belong to the same regressor intentionally share one name.
design_df = pd.DataFrame(data=mat["R"], columns=design_df_columns)
design_df

Unnamed: 0,time,time.1,time.2,time.3,time.4,time.5,time.6,time.7,time.8,time.9,...,bhv_video,bhv_video.1,bhv_video.2,bhv_video.3,bhv_video.4,bhv_video.5,bhv_video.6,bhv_video.7,bhv_video.8,bhv_video.9
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001844,-0.005543,-0.008064,0.009798,0.002812,0.009011,-0.010559,0.019880,0.006008,0.000872
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004605,-0.006546,-0.007706,0.012942,0.007214,0.012459,-0.013270,0.016783,0.005209,0.001186
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005599,-0.008697,-0.003434,0.011454,0.009401,0.011981,-0.011696,0.011270,0.006183,-0.000172
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004297,-0.011377,0.003280,0.008093,0.009040,0.007144,-0.009413,0.005309,0.007684,-0.001411
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002602,-0.013337,0.007815,0.005257,0.009352,0.005346,-0.007959,0.000080,0.006899,-0.003180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001023,0.005589,0.008768,-0.005193,0.004900,-0.002654,0.006261,0.005281,-0.000314,-0.004324
28196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000107,0.002162,0.009085,-0.001206,0.001714,-0.001693,0.005055,0.003489,0.000316,-0.005172
28197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001409,0.000819,0.006901,0.003610,0.000480,-0.000849,0.005850,0.001271,0.002670,-0.005505
28198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001392,0.002767,0.002445,0.005871,-0.000863,-0.000019,0.005080,-0.000186,0.006281,-0.004943


prepare data and parameters

In [5]:
# Transposed so that len(data) and the first index axis are the ones the
# splitting and fitting cells use for row selection.
data = mat['zeromeanVc'].T

# Cross-validation settings consumed by split_by_trials.
opts = {
    'frames_per_trial': 75,  # frames in each trial
    'out_folds': 10,         # folds of the outer (test) cross-validation
    'in_folds': 5,           # folds of the inner (train) cross-validation
}
opts

{'frames_per_trial': 75, 'out_folds': 10, 'in_folds': 5}

Split data for training

In [6]:
out_split, in_split = split_by_trials(data, opts) # Split data into outer and inner folds by trials
# Use the first outer fold. The held-out indices get a real name rather than `_`,
# because `_` is the variable IPython uses for the last cell output.
train_idx, test_idx = out_split[0]

fit model

In [None]:
# Candidate regularisation strengths. The wider grid is kept below for reference.
#alphas = np.concatenate([np.arange(1, 2000, 500), np.arange(3000, 13000,2000), np.arange(15000, 30000, 5000)])
alphas = [2000, 10000]
# NOTE(review): no `cv` argument is given, so RidgeCV uses its default scheme and
# the trial-based `in_split` from split_by_trials is not used — confirm this is intended.
mdl = RidgeCV(alphas=alphas, scoring='r2', alpha_per_target=False)
pipeline = make_pipeline(StandardScaler(), mdl)
# train_idx holds row positions (it is built from np.arange over the frames),
# so select rows positionally instead of by index label.
pipeline.fit(design_df.iloc[train_idx], data[train_idx, :])
mdl.alpha_
