# Notebook for construction of derivative dataframes
## - uses basic dataframes (see ia_dataframe_basic.ipynb)

In [24]:
%run ./rob_setup_notebook.ipynb

import utils.utils_funcs as uf
import utils.gsheets_importer as gsi
import utils.ia_funcs as ia 
from utils.paq2py import *

# Dataset variant processed by this notebook. The name is used as the prefix of
# every dataframe pickle read or written below. Other variants used so far:
#   'sensory_nodetrend'
#   'sensory_highactivity'
#   'sensory_topactivity'
#   'sensory_topcells'
#   'sensory_extremefilter'
#   'sensory_responsivecells'
#   'sensory_2sec_test'
#   'projection_nodetrend'
#   'projection_2sec_test'
#   'interneuron'
session_type = 'interneuron'

# Experiment-family flags derived from the variant name; later cells use them
# to decide which annotation / distance steps to run.
projection = 'projection' in session_type
interneuron = 'interneuron' in session_type

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Suite2p path: ['/home/rlees/anaconda3/envs/suite2p/lib/python3.7/site-packages/suite2p']
qnap_path: /home/rlees/mnt/qnap 
qnap_data_path /home/rlees/mnt/qnap/Data 
pkl_folder: /home/rlees/mnt/qnap/pkl_files 
master_path: /home/rlees/mnt/qnap/master_pkl/master_obj.pkl 
fig_save_path: /home/rlees/mnt/qnap/Analysis/Figures 
stam_save_path: /home/rlees/mnt/qnap/Analysis/STA_movies 
s2_borders_path: /home/rlees/mnt/qnap/Analysis/S2_borders


# Define dataframe paths

In [25]:
# All dataframe pickles share one folder and are prefixed with the session type.
_df_prefix = qnap_path + '/pkl_files/dataframes/' + session_type

session_df_path = _df_prefix + '_session_df.pkl'
experiment_df_path = _df_prefix + '_experiment_df.pkl'
trial_df_path = _df_prefix + '_trial_df.pkl'
cell_df_path = _df_prefix + '_cell_df.pkl'
cell_trial_df_path = _df_prefix + '_cell_trial_df.pkl'
cell_mean_timepoint_df_path = _df_prefix + '_cell_mean_timepoint_df.pkl'
cell_trial_single_timepoint_df_path = _df_prefix + '_cell_trial_single_timepoint_df.pkl'

# Load common dataframes

In [26]:
# Read the dataframes used throughout the notebook, in the same order as the
# paths are listed (session_df and trial_df are not loaded here).
(experiment_df,
 cell_trial_df,
 cell_df,
 cell_mean_timepoint_df,
 cell_trial_single_timepoint_df) = (
    pd.read_pickle(pkl_path)
    for pkl_path in (experiment_df_path,
                     cell_trial_df_path,
                     cell_df_path,
                     cell_mean_timepoint_df_path,
                     cell_trial_single_timepoint_df_path)
)

# Find extreme responses from cells

In [None]:
# Find and save extreme cells

# Per-cell maximum of every column over all rows of the single-timepoint frame.
max_df = cell_trial_single_timepoint_df.groupby('cell_id').max()
# Alternatives tried previously:
# max_df = cell_mean_timepoint_df.groupby('cell_id').max()
#
# baseline_df = cell_mean_timepoint_df.query('timepoint < 0')
# max_df = baseline_df.groupby('cell_id').max()

threshold = 10

# Projection / interneuron sessions are checked on three response columns;
# all other sessions additionally check the whisker response.
resp_columns = ['pr_resp', 'ps_resp', 'spont_resp']
if not (projection or interneuron):
    resp_columns.append('whisker_resp')

# A cell is "extreme" when |max| exceeds the threshold in any checked column.
# NOTE(review): this is the absolute value of the per-cell *maximum*, so a cell
# whose only extreme values are large negative ones is not flagged unless all
# of its values are below -threshold -- confirm this is intended.
is_extreme = (np.absolute(max_df[resp_columns]) > threshold).any(axis=1)
data = max_df[is_extreme]

extreme_cells = data.index.to_numpy()

# Persist the ids, then read them straight back from disk.
extreme_ids_path = qnap_path + '/pkl_files/dataframes/' + session_type + '_extreme_cell_ids.npy'
np.save(extreme_ids_path, extreme_cells)

extreme_cells = np.load(extreme_ids_path, allow_pickle=True)

# Plot extreme cells

# df_ids = cell_mean_timepoint_df['cell_id'].isin(extreme_cells)
# data = cell_mean_timepoint_df[df_ids]

# for trial in ['pr', 'ps', 'spont', 'whisker']:
#     sns.relplot(kind='line', y=trial + '_sta_resp', x='timepoint', col='cell_id', col_wrap=4, data=data, legend=False);

In [28]:
def removeExtremeCells(df, extreme_cells):
    """Return ``df`` without the rows that belong to any of ``extreme_cells``.

    The input dataframe is left untouched (the previous implementation reset
    the caller's index in place, so the dataframe passed in lost its index).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the cell identifier either in a ``cell_id`` column or in a
        single named index that is (or accompanies) ``cell_id``.
    extreme_cells : array-like
        Cell ids whose rows should be dropped.

    Returns
    -------
    pandas.DataFrame
        New dataframe with the matching rows removed; a named index on the
        input is restored on the output.

    Prints the number of rows removed.
    """
    index_name = df.index.name

    # reset_index() without inplace returns a new frame, so `df` keeps its index
    flat_df = df.reset_index(drop=False) if index_name else df

    kept_df = flat_df[~flat_df['cell_id'].isin(extreme_cells)]

    if index_name:
        kept_df = kept_df.set_index(index_name)

    print(len(df) - len(kept_df), 'rows removed')

    return kept_df

In [29]:
# Drop the extreme cells from every cell-level dataframe and write each result
# back to the path it was loaded from (the source pickles are overwritten).
for frame, frame_path in (
    (cell_df, cell_df_path),
    (cell_trial_df, cell_trial_df_path),
    (cell_mean_timepoint_df, cell_mean_timepoint_df_path),
    (cell_trial_single_timepoint_df, cell_trial_single_timepoint_df_path),
):
    removeExtremeCells(frame, extreme_cells).to_pickle(frame_path)

923 rows removed
92300 rows removed
167986 rows removed
16798600 rows removed


# Calculating influence of photostim for each cell (Chettih + Harvey style)

1. Subtracting the spont response probability from the photostim response probability is convoluted, because it depends on how responsivity is calculated on single trials (i.e. whether the 500 ms average of the post-stim response was >1 SD of the baseline). Why not just use the raw dFF values to calculate influence?
2. I was going to do photostim average response minus spont average response for each cell as an 'influence' value
3. Chettih and Harvey do single trial photostim minus average of all spont trials, then normalise the difference by the standard deviation of all differences for that cell
    - I think this accounts for the reliability of the influence for a single cell, i.e. cell with high standard deviation of differences will be penalised (lower influence) over one with small standard deviation
    - Does it matter if the influence is reliable?
    - Potentially normalise by the standard deviation of the baseline period to account for poor estimation of spiking activity, rather than the many bouts of delta Activity

## Photostim site influence

In [30]:
# Get the average spontaneous responses for all neurons from the cell_df

cell_df = pd.read_pickle(cell_df_path)
cell_trial_df = pd.read_pickle(cell_trial_df_path)

cell_id_trial_df = cell_trial_df['cell_id'].reset_index(drop=True).to_numpy()
cell_id_cell_df = cell_df.reset_index(drop=False)['cell_id'].to_numpy()

# For every row of cell_trial_df, find the position of its cell in cell_df by
# binary search over the sorted cell ids.
sorted_ids = np.argsort(cell_id_cell_df)
sorted_cell_ids = cell_id_cell_df[sorted_ids]
match_indices = np.searchsorted(sorted_cell_ids, cell_id_trial_df)

# searchsorted returns an insertion point even when the id is absent, which
# would silently pair a trial with the wrong cell (or index past the end).
# Fail loudly instead.
in_range = match_indices < len(sorted_cell_ids)
found = np.zeros(len(match_indices), dtype=bool)
found[in_range] = sorted_cell_ids[match_indices[in_range]] == cell_id_trial_df[in_range]
if not found.all():
    raise ValueError(
        f'{int((~found).sum())} rows of cell_trial_df have a cell_id that is missing from cell_df'
    )

indices = sorted_ids[match_indices]

# One value per cell_trial_df row: the spont STA amplitude of that row's cell
avg_spont_response = cell_df['spont_sta_amp_resp'].iloc[indices].values

In [31]:
# Append results to cell_df and calculate influence from them

trials = ['pr', 'ps']

# Same for every trial type, so it is written once before the loop
cell_trial_df['avg_spont_resp'] = avg_spont_response

for trial in trials:
    influence_col = trial + '_influence'
    std_col = trial + '_influence_std'
    norm_col = trial + '_norm_influence'

    # Single-trial response minus the cell's average spontaneous response
    cell_trial_df[influence_col] = cell_trial_df[trial + '_amp_resp'] - cell_trial_df['avg_spont_resp']

    # Per-cell std of the influence, broadcast back onto every row of that cell.
    # transform() aligns by group, so this does not depend on there being
    # exactly 100 trials per cell or on the rows being sorted by cell_id.
    cell_trial_df[std_col] = cell_trial_df.groupby('cell_id')[influence_col].transform('std')
    cell_trial_df[norm_col] = cell_trial_df[influence_col]/cell_trial_df[std_col]

    # Per-cell mean of the normalised influence (aligned to cell_df by index)
    cell_df[trial + '_avg_norm_influence'] = cell_trial_df.groupby('cell_id')[norm_col].mean()

## Control site influence

In [32]:
# Leave-one-out influence for control (spont) trials: for each of the 100
# trials, subtract the mean of the other 99 trials of the same cell from the
# response on the left-out trial, and log that difference as the influence on
# that trial.
#
# The result is collected in a 2D array with one row per trial and one column
# per cell; it is reordered afterwards so that values run trial 1-100 for each
# cell consecutively rather than cell 1-n for each trial.
# NOTE(review): the subtraction is positional, so it presumably relies on the
# left-out rows being in the same cell order as the groupby result -- confirm.

leftout_influence = np.full((100, len(cell_df)), np.nan)

for i in range(100):
    trial = i + 1  # trial numbers are queried as 1..100

    trial_leftout = cell_trial_df.query('trial_num == ' + str(trial))
    remaining_trials = cell_trial_df.query('trial_num != ' + str(trial))

    # Mean spont response of every cell over the remaining trials
    remaining_mean_resp = remaining_trials.groupby('cell_id')['spont_amp_resp'].mean().to_numpy()

    leftout_resp = trial_leftout['spont_amp_resp'].to_numpy()
    leftout_influence[i, :] = leftout_resp - remaining_mean_resp

    print('trial', i, end='\r')

trial 99

In [33]:
# Reorder the (trial, cell) leave-one-out array to (cell, trial) and flatten so
# that the values run over all trials of the first cell, then the second, ...
# NOTE(review): this positional assignment presumably requires cell_trial_df to
# be ordered by cell and then by trial number -- confirm.
cell_trial_df['spont_influence'] = np.swapaxes(leftout_influence, 0, 1).flatten()

# Per-cell std broadcast to every row of that cell; transform() aligns by group
# instead of assuming exactly 100 consecutive rows per cell.
cell_trial_df['spont_influence_std'] = cell_trial_df.groupby('cell_id')['spont_influence'].transform('std')
cell_trial_df['spont_norm_influence'] = cell_trial_df['spont_influence']/cell_trial_df['spont_influence_std']

cell_df['spont_avg_norm_influence'] = cell_trial_df.groupby('cell_id')['spont_norm_influence'].mean()

# Overwrites the pickles the dataframes were loaded from
cell_df.to_pickle(cell_df_path)
cell_trial_df.to_pickle(cell_trial_df_path)

## No normalisation

In [34]:
cell_df = pd.read_pickle(cell_df_path)
cell_trial_df = pd.read_pickle(cell_trial_df_path)

trials = ['pr', 'ps', 'spont']

# Per-cell mean of the raw (un-normalised) influence, computed once for all
# trial types and only over the columns that are needed, so non-numeric
# columns in cell_trial_df cannot break the aggregation.
influence_cols = [trial + '_influence' for trial in trials]
mean_influence = cell_trial_df.groupby('cell_id')[influence_cols].mean()

for trial in trials:
    # NOTE: this deliberately replaces the normalised values stored under
    # '<trial>_avg_norm_influence' with the un-normalised mean; the column
    # name is kept so downstream code keeps working.
    cell_df[trial + '_avg_norm_influence'] = mean_influence[trial + '_influence']

cell_df.to_pickle(cell_df_path)

# Find manually annotated CTB targets (only S1 target region was annotated)

In [35]:
if projection:

    # Experiments whose index label contains 'pr' supply the annotation folder
    pr_experiments = experiment_df.index.str.contains('pr')
    tiff_paths = experiment_df[pr_experiments].tiff_path
    session_ids = experiment_df[pr_experiments].session_id

    # One small frame per session, concatenated once after the loop
    session_frames = []

    for session_id, tiff_path in zip(session_ids, tiff_paths):

        ctb_annotation = ia.listdirFullpath(tiff_path, 'annotation')
        ctb_annotation_img = tf.imread(ctb_annotation)

        # Binarise the annotation image: every annotated pixel becomes 1
        ctb_annotation_img[ctb_annotation_img > 0] = 1

        cell_img = np.zeros_like(ctb_annotation_img, dtype='uint16')

        session_filter = cell_df.session_id.str.contains(session_id)
        session_cell_df = cell_df[session_filter]

        cell_id = session_cell_df.index
        cell_x = session_cell_df.cell_x.to_numpy()
        cell_y = session_cell_df.cell_y.to_numpy()

        # Stamp each cell's (y, x) pixel with its 1-based position in the session
        for i, coord in enumerate(zip(cell_y, cell_x)):
            cell_img[coord] = i+1

        # binary mask x cell image to get the cells that overlap with target areas
        ctb_cells = cell_img*ctb_annotation_img

        # Drop the background label explicitly (rather than assuming it is the
        # first unique value) and convert back to 0-based positions
        ctb_labels = np.unique(ctb_cells)
        ctb_cells = ctb_labels[ctb_labels > 0] - 1

        ctb_id = np.zeros_like(cell_id.to_numpy(), dtype='bool')
        ctb_id[ctb_cells] = True

        session_frames.append(pd.DataFrame(data={'ctb_targets' : ctb_id},
                                           index=cell_id))

    if session_frames:
        ctb_df = pd.concat(session_frames)
    else:
        # No 'pr' experiments: leave every cell unannotated
        ctb_df = pd.DataFrame(columns=['ctb_targets'])

    # Assign the column as a Series so it aligns to cell_df on the cell index
    if 'ctb_targets' in cell_df.columns:
        cell_df['ctb_targets'] = ctb_df['ctb_targets']
    else:
        cell_df.insert(9, 'ctb_targets', ctb_df['ctb_targets'], allow_duplicates=False)

    cell_df.to_pickle(cell_df_path)

# Finding manually annotated interneurons

In [36]:
if interneuron:

    # Experiments whose index label contains 'pr' supply the annotation folder
    pr_experiments = experiment_df.index.str.contains('pr')
    tiff_paths = experiment_df[pr_experiments].tiff_path
    session_ids = experiment_df[pr_experiments].session_id

    # One small frame per session, concatenated once after the loop
    session_frames = []

    for session_id, tiff_path in zip(session_ids, tiff_paths):

        int_annotation = ia.listdirFullpath(tiff_path, 'interneurons.tif')
        int_annotation_img = tf.imread(int_annotation)

        # Binarise the annotation image: every annotated pixel becomes 1
        int_annotation_img[int_annotation_img > 0] = 1

        cell_img = np.zeros_like(int_annotation_img, dtype='uint16')

        session_filter = cell_df.session_id.str.contains(session_id)
        session_cell_df = cell_df[session_filter]

        cell_id = session_cell_df.index
        cell_x = session_cell_df.cell_x.to_numpy()
        cell_y = session_cell_df.cell_y.to_numpy()

        # Stamp each cell's (y, x) pixel with its 1-based position in the session
        for i, coord in enumerate(zip(cell_y, cell_x)):
            cell_img[coord] = i+1

        # binary mask x cell image to get the cells that overlap with target areas
        int_cells = cell_img*int_annotation_img

        # Drop the background label explicitly (rather than assuming it is the
        # first unique value) and convert back to 0-based positions
        int_labels = np.unique(int_cells)
        int_cells = int_labels[int_labels > 0] - 1

        int_id = np.zeros_like(cell_id.to_numpy(), dtype='bool')
        int_id[int_cells] = True

        session_frames.append(pd.DataFrame(data={'int_cell' : int_id},
                                           index=cell_id))

    if session_frames:
        int_df = pd.concat(session_frames)
    else:
        # No 'pr' experiments: leave every cell unannotated
        int_df = pd.DataFrame(columns=['int_cell'])

    # Assign the column as a Series so it aligns to cell_df on the cell index
    if 'int_cell' in cell_df.columns:
        cell_df['int_cell'] = int_df['int_cell']
    else:
        cell_df.insert(9, 'int_cell', int_df['int_cell'], allow_duplicates=False)

    cell_df.to_pickle(cell_df_path)

# Calculate nearest neighbour distance to responsive target cell (or any target cell) for each cell

In [37]:
# DISTANCE TO RESPONSIVE TARGETS ONLY (STA_SIG)
if interneuron:
    # Float column, NaN for targets and for sessions without responsive targets
    dist_to_targ = pd.DataFrame(index=cell_df.index, columns=['dist_to_resp_targ'], dtype=float)

    targets = ['pr']
    trials = ['pr']

    for target, trial in zip(targets, trials):
        for name, group in cell_df.groupby('session_id'):
            # Responsive targets: flagged as target AND significant STA response
            target_meds = group.query(target + '_target & sta_sig_' + trial)['cell_med'].to_numpy()
            is_target = group[target + '_target'].to_numpy(dtype=bool)

            # Nothing to measure against, or nothing to measure: leave NaN
            # (np.vstack would raise on an empty sequence)
            if len(target_meds) == 0 or is_target.all():
                continue

            target_coords = np.vstack(target_meds)

            non_targets = group[~is_target]
            cell_coords = np.vstack(non_targets['cell_med'].to_numpy())

            # All non-target x target distances at once; keep the nearest per cell
            min_dists = spatial.distance.cdist(cell_coords, target_coords).min(axis=1)
            dist_to_targ.loc[non_targets.index, 'dist_to_resp_targ'] = min_dists

    # Column assignment (rather than concat) keeps the cell re-runnable without
    # creating a duplicate 'dist_to_resp_targ' column
    cell_df['dist_to_resp_targ'] = dist_to_targ['dist_to_resp_targ']

    cell_df.to_pickle(cell_df_path)

In [38]:
# ANY TARGET (RESPONSIVE OR NOT)
if interneuron:
    # Float column, NaN for targets and for sessions without any target
    dist_to_targ = pd.DataFrame(index=cell_df.index, columns=['dist_to_targ'], dtype=float)

    targets = ['pr', 'pr']
    trials = ['pr', 'spont']

    # The trial type plays no part in this distance, so each distinct target
    # type is processed once instead of repeating the identical computation.
    for target in dict.fromkeys(targets):
        for name, group in cell_df.groupby('session_id'):
            target_meds = group.query(target + '_target')['cell_med'].to_numpy()
            is_target = group[target + '_target'].to_numpy(dtype=bool)

            # Nothing to measure against, or nothing to measure: leave NaN
            # (np.vstack would raise on an empty sequence)
            if len(target_meds) == 0 or is_target.all():
                continue

            target_coords = np.vstack(target_meds)

            non_targets = group[~is_target]
            cell_coords = np.vstack(non_targets['cell_med'].to_numpy())

            # All non-target x target distances at once; keep the nearest per cell
            min_dists = spatial.distance.cdist(cell_coords, target_coords).min(axis=1)
            dist_to_targ.loc[non_targets.index, 'dist_to_targ'] = min_dists

    # Column assignment (rather than concat) keeps the cell re-runnable without
    # creating a duplicate 'dist_to_targ' column
    cell_df['dist_to_targ'] = dist_to_targ['dist_to_targ']

    cell_df.to_pickle(cell_df_path)

# Constructing high target activity dataframes 

In [4]:
# Get trial indices where trial activity is low (for filtering out to get high trials only)

# Start from the pickled versions of the cell-level and cell-trial-level frames
cell_df, cell_trial_df = (
    pd.read_pickle(pkl_path) for pkl_path in (cell_df_path, cell_trial_df_path)
)

def filter_trial_df(df1, df2, target_string, resp_string):
    """Sum a response column over the flagged cells, per session and trial.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Cell-level frame indexed by cell id, with a boolean column named
        ``target_string``.
    df2 : pandas.DataFrame
        Cell-trial-level frame with ``cell_id``, ``session_id``, ``trial_num``
        and ``resp_string`` columns.
    target_string : str
        Name of the boolean column in ``df1`` that selects the cells to keep.
    resp_string : str
        Name of the column in ``df2`` to sum.

    Returns
    -------
    pandas.Series
        Summed ``resp_string`` indexed by (``session_id``, ``trial_num``).
    """
    # Ids of the cells flagged True in the target column (NaN counts as False)
    cell_ids = df1[df1[target_string].eq(True)].index

    target_filter = df2['cell_id'].isin(cell_ids)
    filtered_df = df2[target_filter]

    # Select the column before summing so only the requested response is aggregated
    return filtered_df.groupby(['session_id', 'trial_num'])[resp_string].sum()
    
# Summed target response per (session, trial) for each trial type; the sham
# series uses the pr targets with the spontaneous response column.
pr_target_trial_amp_resp = filter_trial_df(cell_df, cell_trial_df, 'pr_target', 'pr_amp_resp')
ps_target_trial_amp_resp = filter_trial_df(cell_df, cell_trial_df, 'ps_target', 'ps_amp_resp')
sham_target_trial_amp_resp = filter_trial_df(cell_df, cell_trial_df, 'pr_target', 'spont_amp_resp')

# KEEP 25 LARGEST (out of 100): the 75 trials with the smallest summed target
# response in each session are the ones flagged here for filtering out.
# (An earlier fixed-threshold selection, summed response < 15, was computed
# first and then immediately overwritten by this one; it has been removed.)
n_low_trials = 75


def _lowest_trials(trial_amp_resp, n_trials):
    """Return the sorted (session_id, trial_num) index of the n smallest trials per session.

    The result is the (index, indexer) tuple produced by ``sortlevel()``.
    """
    lowest = trial_amp_resp.groupby('session_id').nsmallest(n_trials, keep='first')
    # groupby prepends a second session_id level; drop it again before sorting
    return lowest.index.droplevel(level=0).sortlevel()


sham_indices = _lowest_trials(sham_target_trial_amp_resp, n_low_trials)
pr_indices = _lowest_trials(pr_target_trial_amp_resp, n_low_trials)
ps_indices = _lowest_trials(ps_target_trial_amp_resp, n_low_trials)

In [6]:
# CELL TRIAL SINGLE TIMEPOINT DF modifications

# Reload, index by (session, trial) so whole trials can be addressed at once,
# and round the timepoints to two decimals.
cell_trial_single_timepoint_df = (
    pd.read_pickle(cell_trial_single_timepoint_df_path)
    .set_index(['session_id', 'trial_num'])
    .round({'timepoint' : 2})
)

In [None]:
# CELL MEAN TIMEPOINT DF creation

# Blank the responses on the low-activity trials so they drop out of the means.
# Each trial type is masked independently and in one step, so index lists of
# different lengths cannot be silently truncated against each other.
for low_trials, resp_col in ((sham_indices[0], 'spont_resp'),
                             (pr_indices[0], 'pr_resp'),
                             (ps_indices[0], 'ps_resp')):
    low_rows = cell_trial_single_timepoint_df.index.isin(low_trials)
    cell_trial_single_timepoint_df.loc[low_rows, resp_col] = np.nan

# Mean over trials for every cell and timepoint (NaN trials are skipped)
cell_mean_timepoint_df = (
    cell_trial_single_timepoint_df
    .groupby(['cell_id', 'timepoint'])
    .mean(numeric_only=True)
    .reset_index(drop=False)
    .rename(columns={'pr_resp' : 'pr_sta_resp',
                     'ps_resp' : 'ps_sta_resp',
                     'spont_resp' : 'spont_sta_resp',
                     'whisker_resp' : 'whisker_sta_resp'
                    })
)

# 0-based frame counter within each cell; rows are ordered by cell and then
# timepoint after the groupby, so no fixed number of frames per cell is assumed.
cell_mean_timepoint_df['frame'] = cell_mean_timepoint_df.groupby('cell_id').cumcount()

# The session id is taken from the first 16 characters of the cell id
cell_mean_timepoint_df['session_id'] = cell_mean_timepoint_df['cell_id'].str[:16]

In [16]:
# CELL DF

trial_types = ['pr', 'ps', 'spont', 'whisker']
sta_cols = [trial + '_sta_resp' for trial in trial_types]

# Per-cell mean STA response inside the post-stimulus window. The response
# columns are selected by name before averaging, so the result no longer
# depends on column position or on non-numeric columns (e.g. session_id).
cell_df_copy = (
    cell_mean_timepoint_df
    .query('(timepoint > 0.35) & (timepoint < 0.85)')
    .groupby('cell_id')[sta_cols]
    .mean()
    .rename(columns={trial + '_sta_resp' : trial + '_sta_amp_resp' for trial in trial_types})
)

# Amplitudes first, then signs, to keep the original column order in cell_df
for trial in trial_types:
    cell_df[trial + '_sta_amp_resp'] = cell_df_copy[trial + '_sta_amp_resp']

for trial in trial_types:
    # True only for strictly positive amplitudes (NaN and 0 give False)
    cell_df[trial + '_resp_sign'] = np.sign(cell_df[trial + '_sta_amp_resp']) == 1

In [33]:
# CELL TRIAL DF

resp_cols = ['pr_resp', 'ps_resp', 'spont_resp', 'whisker_resp']
cell_trial_keys = ['cell_id', 'trial_num']

# Per cell and trial: mean response in the pre-stimulus window (-0.5 to 0) and
# in the post-stimulus window (0.35 to 0.85)
baseline = (cell_trial_single_timepoint_df
            .query('(timepoint < 0) & (timepoint > -0.5)')
            .groupby(cell_trial_keys)
            .mean())
post_stim = (cell_trial_single_timepoint_df
             .query('(timepoint > 0.35) & (timepoint < 0.85)')
             .groupby(cell_trial_keys)
             .mean())

# Response amplitude = post-stimulus mean minus baseline mean
amp_resp = (post_stim.loc[:, resp_cols] - baseline.loc[:, resp_cols]).rename(
    columns={'pr_resp' : 'pr_amp_resp',
             'ps_resp' : 'ps_amp_resp',
             'spont_resp' : 'spont_amp_resp',
             'whisker_resp' : 'whisker_amp_resp'
            })

# Index cell_trial_df like amp_resp so the new columns align by (cell, trial)
cell_trial_df = cell_trial_df.reset_index(drop=False).set_index(cell_trial_keys)

for amp_col in amp_resp.columns:
    cell_trial_df[amp_col] = amp_resp[amp_col]

# Back to the cell_trial_id index, keeping only the first 8 columns
cell_trial_df = (cell_trial_df
                 .reset_index(drop=False)
                 .set_index('cell_trial_id')
                 .iloc[:, :8])

In [27]:
# SAVING DATAFRAMES (should reset index on some of them?)

# From here on the frames are written under a different session type
session_type = 'sensory_topactivity'

df_list = [cell_df, cell_trial_df, cell_mean_timepoint_df, cell_trial_single_timepoint_df]
df_names = ['_cell_df', '_cell_trial_df', '_cell_mean_timepoint_df', '_cell_trial_single_timepoint_df']

for name, df in zip(df_names, df_list):
    # Pickle the object output to save it for analysis
    df_name = session_type + name
    pkl_path = os.path.join(pkl_folder, 'dataframes', df_name + '.pkl')
    df.to_pickle(pkl_path)

# Constructing 'responsive cells' dataframes (based on whisker response)

In [18]:
def filter_df(df, indices):
    """Return the rows of ``df`` whose ``cell_id`` column value is in ``indices``."""
    keep_rows = df['cell_id'].isin(indices)
    return df.loc[keep_rows]

In [33]:
# Cells kept: significant whisker STA response, or a pr / ps target
w_resp_cells = cell_df.query('sta_sig_whisker | pr_target | ps_target').index

# Frames that carry the cell id in a column
cell_mean_timepoint_df = filter_df(cell_mean_timepoint_df, w_resp_cells)
cell_trial_single_timepoint_df = filter_df(cell_trial_single_timepoint_df, w_resp_cells)
cell_trial_df = filter_df(cell_trial_df, w_resp_cells)

# cell_df carries the cell id in its index
df_filter = cell_df.index.isin(w_resp_cells)
cell_df = cell_df[df_filter]

session_type = 'sensory_responsivecells'

df_list = [cell_df, cell_trial_df, cell_mean_timepoint_df, cell_trial_single_timepoint_df]
df_names = ['_cell_df', '_cell_trial_df', '_cell_mean_timepoint_df', '_cell_trial_single_timepoint_df']

for name, df in zip(df_names, df_list):
    # Pickle the object output to save it for analysis
    df_name = session_type + name
    pkl_path = os.path.join(pkl_folder, 'dataframes', df_name + '.pkl')
    df.to_pickle(pkl_path)

# Constructing top cell dataframes

In [None]:
# For every session keep the ids of the first half of its rows in cell_df
# (half the session length, rounded with Python's round()).
cell_ids = [
    cell_id
    for _, session_cells in cell_df.groupby('session_id')
    for cell_id in session_cells[:round(len(session_cells)/2)].index.to_numpy()
]

In [None]:
session_type = 'sensory_topcells'

df_list = [cell_df, cell_trial_df, cell_mean_timepoint_df, cell_trial_single_timepoint_df]
df_names = ['_cell_df', '_cell_trial_df', '_cell_mean_timepoint_df', '_cell_trial_single_timepoint_df']

for name, df in zip(df_names, df_list):
    # cell_df holds the cell id in its index, the other frames in a column
    id_values = df.index if name == '_cell_df' else df.cell_id
    df_filter = id_values.isin(cell_ids)

    new_df = df[df_filter]

    # Pickle the object output to save it for analysis
    df_name = session_type + name
    pkl_path = os.path.join(pkl_folder, 'dataframes', df_name + '.pkl')
    new_df.to_pickle(pkl_path)