In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from context_analysis.load import load_spike_proba
from context_analysis.reshape import split_by_group
from context_analysis.resample import downsample
from context_analysis.onep.preprocessing import block_from_time, remove_nan_ys, remove_mixed_ys
from context_analysis.onep.reshape import pivot
from sklearn.base import BaseEstimator, clone
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score

In [6]:
def get_X_y(df, resample_interval, session_name, time_col="time", value_col="value", cell_col="cell_id", dropna=True):
    """Build the feature matrix ``X`` and label vector ``y`` for one session.

    The traces are pivoted with ``pivot``, resampled with ``downsample``,
    labelled with ``block_from_time`` and finally filtered through
    ``remove_nan_ys`` and ``remove_mixed_ys``.

    Parameters
    ----------
    df : traces passed straight to ``pivot``.
    resample_interval : forwarded to ``downsample`` as ``new_interval`` (e.g. ``"500ms"``).
    session_name : forwarded to ``block_from_time``.
    time_col : name of the time column; used for resampling and as the index of ``X``.
    value_col : name of the column whose values are pivoted.
    cell_col : name of the column identifying each cell.
    dropna : when True (the default), rows containing NaN are dropped after resampling.

    Returns
    -------
    tuple
        ``(X, y)`` as returned by ``remove_mixed_ys``.
    """
    # Honour the caller's column names instead of the previously hard-coded "value"/"time".
    X = pivot(df, cell_col=cell_col, value_col=value_col)
    X = downsample(X.reset_index(), time_col=time_col, new_interval=resample_interval)
    if dropna:
        X = X.dropna()
    # NOTE(review): block_from_time is not told which column holds the time;
    # confirm it copes with a non-default `time_col`.
    y = block_from_time(df=X, session_name=session_name)
    X = X.set_index(time_col)
    X, y = remove_nan_ys(X, y)
    X, y = remove_mixed_ys(X, y)
    return X, y

def fit_model(X, y, pipe, cv=None):
    """Cross-validate ``pipe`` on ``(X, y)`` and return the per-fold macro-F1 scores.

    Parameters
    ----------
    X, y : features and labels, passed unchanged to ``cross_val_score``.
    pipe : estimator/pipeline to evaluate; ``cross_val_score`` fits clones of it.
    cv : optional cross-validation splitter (or any value accepted by
        ``cross_val_score``'s ``cv`` argument). ``None`` keeps scikit-learn's
        default splitting, which is what this function has always used.
        Pass e.g. ``TimeSeriesSplit(n_splits=5)`` for time-ordered folds.

    Returns
    -------
    numpy.ndarray
        One ``f1_macro`` score per fold.
    """
    # The earlier version built a TimeSeriesSplit (with an invalid float
    # test_size) that was never handed to cross_val_score; the splitter is now
    # an explicit, optional argument instead of dead code.
    return cross_val_score(pipe, X, y, scoring='f1_macro', cv=cv)


#### Test1

In [12]:
session_name = "day4-test1"
exp = "pfc"

In [13]:
df = load_spike_proba(experiment=exp, session_names=[session_name])
df = df.dropna()
exp, one, no = split_by_group(df)

In [15]:
resample_interval = "500ms"

# saga is a stochastic solver: pin the seed so the cross-validation scores
# are reproducible across kernel restarts.
clf = LogisticRegression(C=0.01, penalty="elasticnet", solver='saga', l1_ratio=0.3, random_state=0)
pipe = Pipeline([
    ("clf", clf)
])

In [23]:
# Score the `exp` group frame; reuse the notebook-level resample_interval
# instead of repeating the literal so the interval is changed in one place.
X, y = get_X_y(exp, resample_interval=resample_interval, session_name=session_name)
fit_model(X, y, clone(pipe))

array([0.98257501, 0.97560976, 0.9686274 , 0.96515299, 0.96848485])

In [24]:
# Score the `no` group frame; reuse the notebook-level resample_interval
# instead of repeating the literal so the interval is changed in one place.
X, y = get_X_y(no, resample_interval=resample_interval, session_name=session_name)
fit_model(X, y, clone(pipe))

array([0.97908773, 0.95464877, 0.96167061, 0.94416886, 0.72590618])

In [25]:
# Score the `one` group frame; reuse the notebook-level resample_interval
# instead of repeating the literal so the interval is changed in one place.
X, y = get_X_y(one, resample_interval=resample_interval, session_name=session_name)
fit_model(X, y, clone(pipe))

array([0.33410673, 0.25846702, 0.33333333, 0.3547387 , 0.44434481])

#### Test2

In [29]:
# Load the day5-test2 traces, drop rows with missing values, and split them
# into the three group frames returned by split_by_group.
session_name = "day5-test2"
df = load_spike_proba(session_names=[session_name], experiment="pfc").dropna()
exp, one, no = split_by_group(df)

In [62]:
# saga is a stochastic solver: pin the seed so the cross-validation scores
# are reproducible across kernel restarts.
clf = LogisticRegression(C=0.8, penalty="elasticnet", solver='saga', l1_ratio=0.5, random_state=0)
pipe = Pipeline([
    ("clf", clf)
])

In [63]:
# Score the `exp` group frame; reuse the notebook-level resample_interval
# instead of repeating the literal so the interval is changed in one place.
X, y = get_X_y(exp, resample_interval=resample_interval, session_name=session_name)
fit_model(X, y, clone(pipe))

array([0.75432045, 0.70347275, 0.76461876, 0.74102527, 0.6492519 ])

In [64]:
# Score the `no` group frame; reuse the notebook-level resample_interval
# instead of repeating the literal so the interval is changed in one place.
X, y = get_X_y(no, resample_interval=resample_interval, session_name=session_name)
fit_model(X, y, clone(pipe))

array([0.97210884, 0.9721251 , 0.958147  , 0.94764366, 0.94744386])

In [65]:
# Score the `one` group frame; reuse the notebook-level resample_interval
# instead of repeating the literal so the interval is changed in one place.
X, y = get_X_y(one, resample_interval=resample_interval, session_name=session_name)
fit_model(X, y, clone(pipe))



array([0.46982759, 0.59618686, 0.54690081, 0.65721005, 0.65731579])

# One model per Mouse

In [7]:
from sklearn.model_selection import TimeSeriesSplit

def get_X_y(df, resample_interval, session_name, time_col="time", value_col="value", cell_col="cell_id", dropna=True):
    """Build the feature matrix ``X`` and label vector ``y`` for one session.

    The traces are pivoted with ``pivot``, resampled with ``downsample``,
    labelled with ``block_from_time`` and finally filtered through
    ``remove_nan_ys`` and ``remove_mixed_ys``.

    Parameters
    ----------
    df : traces passed straight to ``pivot``.
    resample_interval : forwarded to ``downsample`` as ``new_interval`` (e.g. ``"500ms"``).
    session_name : forwarded to ``block_from_time``.
    time_col : name of the time column; used for resampling and as the index of ``X``.
    value_col : name of the column whose values are pivoted.
    cell_col : name of the column identifying each cell.
    dropna : when True (the default), rows containing NaN are dropped after resampling.

    Returns
    -------
    tuple
        ``(X, y)`` as returned by ``remove_mixed_ys``.
    """
    # Honour the caller's column names instead of the previously hard-coded "value"/"time".
    X = pivot(df, cell_col=cell_col, value_col=value_col)
    X = downsample(X.reset_index(), time_col=time_col, new_interval=resample_interval)
    if dropna:
        X = X.dropna()
    # NOTE(review): block_from_time is not told which column holds the time;
    # confirm it copes with a non-default `time_col`.
    y = block_from_time(df=X, session_name=session_name)
    X = X.set_index(time_col)
    X, y = remove_nan_ys(X, y)
    X, y = remove_mixed_ys(X, y)
    return X, y

def fit_model(X, y, pipe, cv=None):
    """Cross-validate ``pipe`` on ``(X, y)`` and return its mean score and the fitted model.

    Parameters
    ----------
    X, y : features and labels, passed unchanged to ``cross_val_score``.
    pipe : estimator/pipeline to evaluate. It is fitted in place on all of
        ``(X, y)`` after cross-validation.
    cv : optional cross-validation splitter (or any value accepted by
        ``cross_val_score``'s ``cv`` argument). ``None`` keeps scikit-learn's
        default splitting, which is what this function has always used.
        Pass e.g. ``TimeSeriesSplit(n_splits=5)`` for time-ordered folds.

    Returns
    -------
    tuple
        ``(mean f1_macro score across folds, pipe fitted on the full data)``.

    Raises
    ------
    ValueError
        Propagated from scikit-learn, e.g. when ``X`` is empty.
    """
    # The earlier version built a TimeSeriesSplit (with an invalid float
    # test_size) that was never handed to cross_val_score; the splitter is now
    # an explicit, optional argument instead of dead code.
    scores = cross_val_score(pipe, X, y, scoring='f1_macro', cv=cv)
    # cross_val_score only fits clones, so `pipe` itself is still unfitted here.
    # Fit it on the full data so the returned model is actually usable.
    pipe.fit(X, y)
    return scores.mean(), pipe


def fit_model_per_mouse(
    pipe, 
    traces_dict, 
    session_name, 
    resample_interval="500ms", 
    time_col='time',
    value_col='value',
    cell_col='cell_id',
):
    """Evaluate a fresh clone of ``pipe`` on every mouse and tabulate the results.

    ``traces_dict`` is walked as ``{group: {mouse: traces}}``. For each mouse
    the traces are turned into ``(X, y)`` with ``get_X_y`` and scored with
    ``fit_model``. A ``ValueError`` raised by ``fit_model`` is printed and
    recorded as NaN for both the score and the model, so one failing mouse
    does not abort the whole run.

    Returns
    -------
    pandas.DataFrame
        One row per mouse with the columns ``mouse``, ``group``, ``score``,
        ``num_cells`` (``X.shape[1]``), ``num_datapoints`` (``len(X)``) and
        ``model``.
    """
    # One list per output column; insertion order fixes the column order.
    results = {
        "mouse": [],
        "group": [],
        "score": [],
        "num_cells": [],
        "num_datapoints": [],
        "model": [],
    }
    for group, traces_by_mouse in traces_dict.items():
        for mouse, mouse_traces in traces_by_mouse.items():
            X, y = get_X_y(
                mouse_traces,
                resample_interval=resample_interval,
                session_name=session_name,
                time_col=time_col,
                cell_col=cell_col,
                value_col=value_col,
            )
            try:
                score, model = fit_model(X, y, clone(pipe))
            except ValueError as err:
                print(str(err))
                score = model = np.nan
            results["mouse"].append(mouse)
            results["group"].append(group)
            results["score"].append(score)
            results["num_cells"].append(X.shape[1])
            results["num_datapoints"].append(len(X))
            results["model"].append(model)
    return pd.DataFrame(results)
            


In [8]:
from context_analysis.reshape import split_by_group, split_by_mouse
import warnings
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [9]:
# Load the day4-test1 traces and split them with split_by_mouse
# (consumed by fit_model_per_mouse as traces_dict).
session_name = "day4-test1"
traces_test1 = load_spike_proba(session_names=[session_name], experiment="pfc")
traces_by_mouse_test1 = split_by_mouse(df=traces_test1, experiment="pfc")

In [10]:
# Alternatives tried for this step: elastic-net LogisticRegression (saga, C=0.01,
# l1_ratio=0.3), DecisionTreeClassifier(max_depth=8) and SVC(kernel="linear").
# random_state is pinned so the per-mouse scores are reproducible between runs.
clf = RandomForestClassifier(n_estimators=250, n_jobs=-1, random_state=0)
pipe = Pipeline([
    ("clf", clf)
])

In [11]:
# Fit one model per mouse on the day4-test1 traces.
# All warnings raised during fitting are silenced for this call only.
resample_interval = "500ms"
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    res_test1 = fit_model_per_mouse(
        pipe=clone(pipe),
        traces_dict=traces_by_mouse_test1,
        session_name=session_name,
        resample_interval=resample_interval,
    )

# df = pd.concat([res_test1.assign(session="test1"), res_test2.assign(session="test2")])

Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.


In [12]:
# Per-mouse cross-validation results for the day4-test1 session.
res_test1

Unnamed: 0,mouse,group,score,num_cells,num_datapoints,model
0,B51628,Experimental,0.635699,152,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
1,B51621,Experimental,0.608793,41,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
2,B51620,Experimental,0.613907,8,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
3,B51618,Experimental,0.974195,188,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
4,B51619,Experimental,,0,0,
5,B51622,No Shock,0.637558,38,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
6,B58216,No Shock,0.874248,55,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
7,B58217,No Shock,0.984656,37,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
8,B58215,No Shock,0.822272,439,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
9,B51629,One Context,0.581684,68,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."


In [13]:
# Fit one model per mouse on the day5-test2 traces.
# The inputs get their own *_test2 names so they neither masquerade as test1
# data nor overwrite the day4-test1 frames loaded earlier.
session_name = "day5-test2"
traces_test2 = load_spike_proba(experiment="pfc", session_names=[session_name])
traces_by_mouse_test2 = split_by_mouse(experiment="pfc", df=traces_test2)

# All warnings raised during fitting are silenced for this call only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    res_test2 = fit_model_per_mouse(
        pipe=clone(pipe), 
        traces_dict=traces_by_mouse_test2, 
        resample_interval=resample_interval,
        session_name=session_name
    )

Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.


In [14]:
# Per-mouse cross-validation results for the day5-test2 session.
res_test2

Unnamed: 0,mouse,group,score,num_cells,num_datapoints,model
0,B51628,Experimental,0.710134,171,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
1,B51621,Experimental,0.666643,40,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
2,B51620,Experimental,0.667499,15,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
3,B51618,Experimental,,0,0,
4,B51619,Experimental,0.766496,17,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
5,B51622,No Shock,0.61024,37,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
6,B58216,No Shock,0.91596,66,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
7,B58217,No Shock,0.994418,22,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
8,B58215,No Shock,0.874532,442,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."
9,B51629,One Context,0.549931,75,1434,"(RandomForestClassifier(n_estimators=250, n_jo..."


In [15]:
# Stack the per-mouse results of both sessions into one frame, tagging each
# row with the session it came from. Built here explicitly so the cell no
# longer depends on a `df` that is never defined in a fresh kernel.
res_all = pd.concat(
    [res_test1.assign(session="test1"), res_test2.assign(session="test2")],
    ignore_index=True,
)
res_all