# Dewan Lab Neuron Pseudopopulation Analysis
## Import Dependencies

In [None]:
import itertools
import os

import numpy.random

# Set before the dewan_calcium imports below.
# NOTE(review): presumably read by dewan_calcium at import time to switch off Inscopix (ISX) support - confirm
os.environ['ISX'] = '0'

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

from dewan_calcium import classifiers, plotting
from dewan_calcium import stats as dewan_stats
from dewan_calcium.helpers import IO, trace_tools
from dewan_calcium.helpers.project_folder import ProjectFolder

# Put the pandas copy-on-write option into its "warn" mode
# (see the pandas `mode.copy_on_write` option documentation for the exact semantics)
pd.options.mode.copy_on_write = "warn"

print('Finished importing required libraries!')

## Load Data from Project Folder

In [None]:
# Build the ProjectFolder object that gathers and holds all of the file paths used below
project_folder = ProjectFolder(
    'ODOR',
    project_dir='/mnt/r2d2/2_Inscopix/1_DTT/5_Combined/ID/VGLUT_Comb/',
    combined=True,
)

In [None]:
# Only needed the first time the project folder has been created:
# move the files into the appropriate directories first, then run this cell.
# On every later run, skip this cell.
project_folder.get_data()

## Configs


In [None]:
ANALYSIS_VARIABLE = 'odor' # Either 'odor', 'class' or 'block'

CELL_CLASS = 'vglut' # 'vglut', 'vgat' or 'oxtr' (validated against CLASSES in the Constants cell)

# Upper limit on the number of cells kept when the data is loaded; surplus cells are dropped at random.
# Values <= 0 skip the drop entirely. The loading cell raises a ValueError when the dataset does not
# contain more than MAX_NUM_CELLS cells.
MAX_NUM_CELLS = 300
# Entropy passed to np.random.SeedSequence for the random drop. Leave as None for a fresh draw, or set it to
# a previously printed entropy value to drop the same cells again.
ENTROPY = None

WINDOW = 2 # Window size for the moving-window SVM decoder; set to None for no window and to consider all data at once
NUM_SVM_SPLITS = 20  # Number of random test-train splits to run and average per SVM run

# Values used in the combine-data.py standalone script to define the sizes of the different data periods
BASELINE_FRAMES = 20
ODOR_FRAMES = 20
POST_FRAMES = 20

SHOW_FIGURES = False # True displays each figure with plt.show() after it is produced

## Constants

In [None]:
## ============================ CONSTANTS ============================ ##
# Accepted values for ANALYSIS_VARIABLE and CELL_CLASS
VARS = ['odor', 'class', 'block']
CLASSES = ['vglut', 'vgat', 'oxtr']

# Odor class suffixes, in the order they are listed everywhere else
ODOR_BASE = ['ATE', 'AL', 'AMINE', 'OL', 'ONE']

# Maps each odor label (numeric prefix 4-7 followed by a class suffix) to its class suffix.
# Entries are grouped by class in ODOR_BASE order, prefixes ascending within each class.
ID_ORDER = {
    f'{prefix}{odor_class}': odor_class
    for odor_class in ODOR_BASE
    for prefix in (4, 5, 6, 7)
}

# Column levels that get DROPPED (via droplevel) for each analysis variable.
# NOTE(review): presumably level 1 = odor, level 2 = block, level 3 = class - confirm against classifiers.add_odor_class
LEVEL_MAP = {
    'class': (1, 2),  # keep class labels; drops odor and block labels
    'odor': (2, 3),   # keep odor labels; drops block and class labels
    'block': (1, 3),  # keep block labels; drops odor and class labels
}

# Named (start, stop) ranges used when averaging the confusion matrices
CM_WINDOWS = dict(
    Baseline=(0, 20),
    Odor_Period=(20, 40),
    Latent_Period=(40, 60),
    Odor_and_Latent=(20, 60),
)

## VALIDATE INPUTS

IO.verify_input('ANALYSIS_VARIABLE', ANALYSIS_VARIABLE, [str], allowed_values=VARS)
IO.verify_input('CELL_CLASS', CELL_CLASS, [str], allowed_values=CLASSES)
# The Configs cell documents WINDOW = None as "no window", so the int/range check is only applied
# when a window size was actually given; an int-only check would reject the documented None value.
if WINDOW is not None:
    IO.verify_input('WINDOW', WINDOW, [int], allowed_range=(1, 20))
IO.verify_input('NUM_SVM_SPLITS', NUM_SVM_SPLITS, [int], allowed_range=(1, 100), inclusive=True)

## Select Labels

# (experiment type, class labels) for each analysis variable
_label_choices = {
    'odor': ('ID', list(ID_ORDER.keys())),
    'class': ('ID-Class', ODOR_BASE),
    'block': ('ID-Blocks', [1, 2, 3]),
}

# Anything that is not 'odor' or 'class' is treated as a block analysis
_exp_type, _labels = _label_choices.get(ANALYSIS_VARIABLE, _label_choices['block'])

# Odor -> class lookup handed to classifiers.add_odor_class
_classes = ID_ORDER

## Check that the data file exists
def _find_class_file(candidate_paths, cell_class, description):
    """Return the first entry of candidate_paths whose text contains cell_class (case-insensitive).

    candidate_paths: sequence of paths collected by the project folder (may be empty or None).
    cell_class: cell class name to look for, e.g. CELL_CLASS.
    description: human-readable file kind used in the error message.

    Raises FileExistsError (the exception type this cell has always raised) when there are no
    candidates OR when none of them mentions cell_class. Previously the second case fell through
    silently and left the result as an empty list, which only failed later when the file was read.
    """
    wanted = cell_class.lower()
    for candidate in candidate_paths or []:
        if wanted in str(candidate).lower():
            return candidate
    raise FileExistsError(f'No {description} with class {cell_class} exists!')


data_file = _find_class_file(project_folder.raw_data_dir.combined_data_path, CELL_CLASS, 'data file')
sig_table = _find_class_file(project_folder.raw_data_dir.combined_sig_table_path, CELL_CLASS, 'significance table')

## Load and Normalize Data

In [None]:
# Optionally trim this dataset so that it matches another dataset in number of cells.
# MAX_NUM_CELLS is the upper limit: (num_cells - MAX_NUM_CELLS) randomly chosen cells are dropped.
# The entropy used to build the rng is printed; store it in ENTROPY to drop the exact same cells on each run.
# NOTE: pickle files can execute arbitrary code when loaded; only read files from a trusted source.

combined_data = pd.read_pickle(data_file, compression={'method': 'xz'})

if MAX_NUM_CELLS > 0:
    cells = combined_data.columns.get_level_values(0).unique()
    num_cells = len(cells)
    num_cells_to_drop = num_cells - MAX_NUM_CELLS

    # Guard: there has to be at least one surplus cell to drop
    if num_cells_to_drop <= 0:
        raise ValueError(f'{MAX_NUM_CELLS} is too large! You cannot drop all cells, or more cells than exist in the dataset')

    seed_sequence = np.random.SeedSequence(entropy=ENTROPY)
    print(f'Last Entropy Value was: {seed_sequence.entropy}')
    rng = np.random.default_rng(seed_sequence)
    random_cells = rng.choice(cells, num_cells_to_drop, replace=False)

    # Cells are on the columns: transpose, drop the chosen cells, then transpose back
    combined_data = combined_data.T.drop(random_cells).T

In [None]:
# NOTE(review): presumably converts the raw traces to dF/F using the first BASELINE_FRAMES frames as baseline - confirm in trace_tools.dff
dff_combined = trace_tools.dff(combined_data, BASELINE_FRAMES)

## Add the class for each odor to the column
# Unique names from level 0 of the column index; len(cells) is reused by later cells as the cell count
cells = np.unique(combined_data.columns.get_level_values(0).values)
# original_columns is kept so the column labels can be restored after levels are dropped for decoding
dff_combined, original_columns = classifiers.add_odor_class(dff_combined, _classes)

In [None]:
# Run this cell to reset the dataframe to its original configuration
# dff_combined.columns = original_columns

## SVM Classifier


### Sliding Window Decoding Directories

In [None]:
## Load/Create output directories

# Data and figure trees share one layout: SVM/[Window-<WINDOW>/]<ANALYSIS_VARIABLE>/CM
svm_output_dir = project_folder.analysis_dir.output_dir.subdir('SVM')
svm_fig_dir = project_folder.analysis_dir.figures_dir.subdir('SVM')

_nested_folders = [f'Window-{WINDOW}', ANALYSIS_VARIABLE] if WINDOW else [ANALYSIS_VARIABLE]
for _folder in _nested_folders:
    svm_output_dir = svm_output_dir.joinpath(_folder)
    svm_fig_dir = svm_fig_dir.joinpath(_folder)

cm_data_save_dir = svm_output_dir.joinpath('CM')
cm_figure_save_dir = svm_fig_dir.joinpath('CM')

# Create every directory (and any missing parents); existing directories are left alone
for _directory in (svm_output_dir, svm_fig_dir, cm_data_save_dir, cm_figure_save_dir):
    _directory.mkdir(parents=True, exist_ok=True)

## Sliding Window Decoding

In [None]:
# Keep only the column levels needed for the selected analysis variable
_levels_to_drop = LEVEL_MAP[ANALYSIS_VARIABLE]
dff_combined.columns = dff_combined.columns.droplevel(_levels_to_drop)

# Run the sliding-window SVM decoder
_decoding_results = classifiers.sliding_window_ensemble_decoding(
    dff_combined,
    window_size=WINDOW,
    num_splits=NUM_SVM_SPLITS,
    class_labels=_labels,
)
mean_svm_scores, splits_v_repeat_df, all_confusion_mats, (true_labels, pred_labels) = _decoding_results

# Postprocess the mean scores
mean_score_df = classifiers.postprocess(mean_svm_scores, len(cells), WINDOW)

# Write everything to the SVM output directory
IO.save_SVM_output(
    svm_output_dir,
    mean_score_df,
    mean_svm_scores,
    splits_v_repeat_df,
    all_confusion_mats,
    true_labels,
    pred_labels,
)
### Shuffled Sliding Window Decoding

# Shuffled copy of the data, decoded with the same settings as the real data
shuffled_data = classifiers.shuffle_data(dff_combined)

# Run the sliding-window SVM decoder on the shuffled data
_shuffled_decoding_results = classifiers.sliding_window_ensemble_decoding(
    shuffled_data,
    window_size=WINDOW,
    num_splits=NUM_SVM_SPLITS,
    class_labels=_labels,
)
(
    shuffled_mean_svm_scores,
    shuffled_splits_v_repeat_df,
    shuffled_all_confusion_mats,
    (shuffled_true_labels, shuffled_pred_labels),
) = _shuffled_decoding_results

# Postprocess the shuffled mean scores
shuffled_mean_score_df = classifiers.postprocess(shuffled_mean_svm_scores, len(cells), WINDOW)

# Write the shuffled results to the same output directory, flagged with shuffle=True
IO.save_SVM_output(
    svm_output_dir,
    shuffled_mean_score_df,
    shuffled_mean_svm_scores,
    shuffled_splits_v_repeat_df,
    shuffled_all_confusion_mats,
    shuffled_true_labels,
    shuffled_pred_labels,
    shuffle=True,
)

## Checkpoint: Load SVM Output

In [None]:
# Folder levels below SVM/: an optional Window-<WINDOW> level, then the analysis variable
_nested_folders = [f'Window-{WINDOW}', ANALYSIS_VARIABLE] if WINDOW else [ANALYSIS_VARIABLE]

input_dir = project_folder.analysis_dir.output_dir.subdir('SVM')
for _folder in _nested_folders:
    input_dir = input_dir.joinpath(_folder)

SVM_data, shuffled_SVM_data = IO.load_SVM_data(input_dir, ANALYSIS_VARIABLE, WINDOW)

# Unpack the real and shuffled results into the names used by the cells below
(
    mean_svm_scores,
    splits_v_repeat_df,
    all_confusion_mats,
    true_labels,
    pred_labels,
) = SVM_data

(
    shuffled_mean_svm_scores,
    shuffled_splits_v_repeat_df,
    shuffled_all_confusion_mats,
    shuffled_true_labels,
    shuffled_pred_labels,
) = shuffled_SVM_data

## Load/Create output directories
# Rebuild the directory paths so this checkpoint cell can be run without the decoding cells
svm_output_dir = project_folder.analysis_dir.output_dir.subdir('SVM')
svm_fig_dir = project_folder.analysis_dir.figures_dir.subdir('SVM')
for _folder in _nested_folders:
    svm_output_dir = svm_output_dir.joinpath(_folder)
    svm_fig_dir = svm_fig_dir.joinpath(_folder)

cm_data_save_dir = svm_output_dir.joinpath('CM')
cm_figure_save_dir = svm_fig_dir.joinpath('CM')

## Save and Plot SVM Performance

In [None]:
# Summarise the real and shuffled SVM results (mean performance plus confidence-interval bounds)
mean_performance, CI_min, CI_max = classifiers.preprocess_for_plotting(
    mean_svm_scores,
    splits_v_repeat_df,
)
shuffle_mean_performance, shuffle_CI_min, shuffle_CI_max = classifiers.preprocess_for_plotting(
    shuffled_mean_svm_scores,
    shuffled_splits_v_repeat_df,
)

# Bundle each pair of bounds into a (min, max) tuple
CI = (CI_min, CI_max)
shuffle_CI = (shuffle_CI_min, shuffle_CI_max)

# Save SVM Performance Data
# The first two entries of every score key are converted to plain ints to form the index
_index = [(int(key[0]), int(key[1])) for key in mean_svm_scores.keys()]
classifiers.save_svm_data(
    mean_performance,
    shuffle_mean_performance,
    _index,
    CI,
    shuffle_CI,
    svm_output_dir,
)

In [None]:
# Plot SVM Performance
descriptors = (_exp_type, CELL_CLASS, ANALYSIS_VARIABLE, len(cells))

svm_fig = plotting.plot_svm_performance(
    mean_performance,
    shuffle_mean_performance,
    CI,
    shuffle_CI,
    descriptors,
    svm_fig_dir,
)

# Either display the figure or close it, depending on SHOW_FIGURES
_finish_figure = plt.show if SHOW_FIGURES else plt.close
_finish_figure()

In [None]:
# Plot Average CM

windows = list(all_confusion_mats.keys())
window_averaged_cms = classifiers.average_CM(all_confusion_mats, windows)

# Plot and save the average confusion matrix for every named window in CM_WINDOWS
for window_name, cm_window in CM_WINDOWS.items():
    _fig, _ax = classifiers.save_and_plot_CM(
        window_averaged_cms,
        cm_window,
        window_name,
        windows,
        _labels,
        cm_data_save_dir,
        cm_figure_save_dir,
    )

    if SHOW_FIGURES:
        plt.show()
    else:
        # plt.close must be *called*; a bare `plt.close` is a no-op expression that leaves the figure open
        plt.close()


## Population and Lifetime Sparseness


### Calculate Cell-Odor Means

In [None]:
if ANALYSIS_VARIABLE != 'odor':
    raise ValueError('Population and Lifetime sparseness can only be run if analyzing odorants!')

# Population sparseness is computed per odor, lifetime sparseness per cell.
# Both start from one baseline-subtracted response value per (cell, odor) pair, built here.

# Restore the full column labels returned earlier by classifiers.add_odor_class
dff_combined.columns = original_columns

transposed_data = dff_combined.T

# Frames belonging to the odor period
_odor_window = slice(BASELINE_FRAMES, BASELINE_FRAMES + ODOR_FRAMES)

cell_means = {}
cell_medians = {}
nonzero_cell_medians = {}  # medians BEFORE negative values are replaced with zero
cell_odor_orders = []

cells = transposed_data.groupby('Cells')

for cell_name, cell_df in cells:
    means, medians, nonzero_medians, odor_order = [], [], [], []

    for name, odor_data in cell_df.groupby('Odor'):
        # Per-trial averages of the baseline frames and of the odor frames
        baseline_mean = odor_data.iloc[:, :BASELINE_FRAMES].mean(axis=1)
        odor_trial_means = odor_data.iloc[:, _odor_window].mean(axis=1)

        # Odor response relative to baseline, one value per trial
        diff = odor_trial_means.subtract(baseline_mean, axis=0)
        _mean = diff.mean()
        _median = diff.median()

        nonzero_medians.append(_median)

        # Negative responses are replaced with zero AFTER averaging across trials.
        # Open question from the original author: what happens if the values are zeroed BEFORE taking the mean?
        means.append(0 if _mean < 0 else _mean)
        medians.append(0 if _median < 0 else _median)
        odor_order.append(name)

    cell_means[cell_name] = means
    cell_medians[cell_name] = medians
    nonzero_cell_medians[cell_name] = nonzero_medians
    cell_odor_orders.append(odor_order)

# The odor order of the first cell labels the rows for every cell
odors = cell_odor_orders[0]

# Rows are odors, columns are cells
cell_means = pd.DataFrame(cell_means, index=odors)
cell_medians = pd.DataFrame(cell_medians, index=odors)
nonzero_cell_medians = pd.DataFrame(nonzero_cell_medians, index=odors)

## Population Sparseness

In [None]:
if ANALYSIS_VARIABLE != 'odor':
    raise ValueError('Population and Lifetime sparseness can only be run if analyzing odorants!')

# One sparseness value per odor: each row of cell_means / cell_medians holds that odor's value for every cell
pop_sparseness_values_means = {
    odor_name: dewan_stats.sparseness(odor_data.shape[0], odor_data.values)
    for odor_name, odor_data in cell_means.iterrows()
}

pop_sparseness_values_medians = {
    odor_name: dewan_stats.sparseness(odor_data.shape[0], odor_data.values)
    for odor_name, odor_data in cell_medians.iterrows()
}

# Single-row tables with one column per odor
pop_sparseness_values_means = pd.DataFrame(pop_sparseness_values_means, index=[0])
pop_sparseness_values_medians = pd.DataFrame(pop_sparseness_values_medians, index=[0])

## Lifetime Sparseness

In [None]:
if ANALYSIS_VARIABLE != 'odor':
    raise ValueError('Population and Lifetime sparseness can only be run if analyzing odorants!')

# One sparseness value per cell: each column of cell_means / cell_medians holds that cell's value for every odor
lifetime_sparseness_values_means = {
    cell_name: dewan_stats.sparseness(cell_data.shape[0], cell_data.values)
    for cell_name, cell_data in cell_means.items()
}

lifetime_sparseness_values_medians = {
    cell_name: dewan_stats.sparseness(cell_data.shape[0], cell_data.values)
    for cell_name, cell_data in cell_medians.items()
}

# Single-row tables with one column per cell
lifetime_sparseness_values_means = pd.DataFrame(lifetime_sparseness_values_means, index=[0])
lifetime_sparseness_values_medians = pd.DataFrame(lifetime_sparseness_values_medians, index=[0])

## Write to File

In [None]:
if ANALYSIS_VARIABLE != 'odor':
    raise ValueError('Population and Lifetime sparseness can only be run if analyzing odorants!')

sparseness_path = project_folder.analysis_dir.output_dir.path.joinpath('sparseness.xlsx')

# One sheet per table, written in this order.
# `index` on DataFrame.to_excel is a boolean flag (default True = write the row index); the list [0] that
# used to be passed only worked because it is truthy, so the default is used instead. Output is unchanged.
with pd.ExcelWriter(sparseness_path, engine='xlsxwriter') as writer:
    pop_sparseness_values_medians.to_excel(writer, sheet_name='Population Sparseness (Medians)')
    pop_sparseness_values_means.to_excel(writer, sheet_name='Population Sparseness (Means)')
    lifetime_sparseness_values_medians.to_excel(writer, sheet_name='Lifetime Sparseness (Medians)')
    lifetime_sparseness_values_means.to_excel(writer, sheet_name='Lifetime Sparseness (Means)')
    cell_means.to_excel(writer, sheet_name='Cell Means (Zeroed)')
    cell_medians.to_excel(writer, sheet_name='Cell Medians (Zeroed)')

## Correlations

In [None]:
if ANALYSIS_VARIABLE != 'odor':
    raise ValueError('Correlations can only be run if analyzing odorants!')


# Odor names come from level 1 of the transposed data's index, sorted alphabetically
odors = transposed_data.index.get_level_values(1).unique()
odors = np.sort(odors)
perms = list(itertools.permutations(odors, 2))

correlations = pd.DataFrame(dtype=float, index=odors, columns=odors)  # Explicitly set dtype to float

# Pearson correlation between the across-cell mean responses of every ordered pair of distinct odors
for odor1, odor2 in perms:
    odor1_means = cell_means.loc[odor1]
    odor2_means = cell_means.loc[odor2]

    pearson_result = stats.pearsonr(odor1_means, odor2_means)

    correlations.loc[odor1, odor2] = pearson_result.statistic

# Every odor is perfectly correlated with itself. Only the diagonal is set: a blanket fillna(1.0) would
# also turn undefined correlations (NaN, e.g. when an odor's responses are constant across cells) into
# "perfectly correlated", silently reporting a distance of 0 for pairs that have no valid correlation.
for odor in odors:
    correlations.loc[odor, odor] = 1.0

# Convert correlation to correlation distance (0 = identical response patterns); NaN stays NaN
correlations = 1 - correlations

# Reorder rows and columns to follow the label order used for decoding
sorted_correlations = correlations.loc[_labels, _labels]

correlation_path = project_folder.analysis_dir.output_dir.path.joinpath('correlations.xlsx')
sorted_correlations.to_excel(correlation_path)

## Reorganize Cell Significance Matrix

## Sorting Rules
1) Non-responsive - zeros only
2) Excitatory on responses - 2s only (sort by the most 2s)
3) Excitatory off responses - 4s only (sort by the most 4s)
4) Excitatory combo responses - 2s and 4s (sort by the most 2s+4s)
5) Inhibitory on responses - 1s only (sort by the most 1s)
6) Inhibitory off responses - 3s only (sort by the most 3s)
7) Inhibitory combo responses - 1s and 3s (sort by the most 1s+3s)
8) Combo responses - any combination of 1/3 and 2/4 (sort by the most responses)
9) Buzzer - any cell that responds to the buzzer (sort by the total number of responses of any type)
10) MO - any cell that responds to MO (sort by the total number of responses of any type)


In [None]:
def sort_by_response_number(df: pd.DataFrame):
    """Return the rows of df ordered from the most to the fewest non-zero entries."""
    nonzero_per_row = df.ne(0).sum(axis=1)
    row_order = nonzero_per_row.sort_values(ascending=False).index
    return df.loc[row_order]

def get_and_sort_cells(odor_responsive_cells, IDs):
    """Select the rows whose values ALL belong to IDs, ordered by number of non-zero entries.

    odor_responsive_cells: table with one row per cell.
    IDs: the only values a row may contain to be selected (e.g. [0, 2]).
    """
    only_allowed_values = odor_responsive_cells.isin(IDs).all(axis=1)
    matching_cells = odor_responsive_cells.loc[only_allowed_values]
    return sort_by_response_number(matching_cells)

combined_sig_table = pd.read_excel(sig_table, index_col=0)

# Rule 1: rows that sum to zero are non-responsive; every other row is responsive
nonresponsive_cells_mask = combined_sig_table.sum(axis=1).eq(0)
nonresponsive_cells = combined_sig_table.loc[nonresponsive_cells_mask]

responsive_cells_mask = ~nonresponsive_cells_mask
responsive_cells = combined_sig_table.loc[responsive_cells_mask]

# Rule 9: responsive cells with any non-zero Buzzer entry
buzzer_mask = responsive_cells['Buzzer'].ne(0)
buzzer_cells = sort_by_response_number(responsive_cells.loc[buzzer_mask])

non_buzzer_mask = ~buzzer_mask
non_buzzer_cells = responsive_cells.loc[non_buzzer_mask]

# Rule 10: of the remaining cells, those with any non-zero MO entry
MO_mask = non_buzzer_cells['MO'].ne(0)
MO_cells = sort_by_response_number(non_buzzer_cells.loc[MO_mask])

non_MO_mask = ~MO_mask
odor_responsive_cells = non_buzzer_cells.loc[non_MO_mask]

# Rules 2-7: peel off one group at a time. Each group is removed from the pool before the next one is
# selected, so the order matters (e.g. "0, 2 and 4" only catches cells not already taken by "0 and 2"
# or "0 and 4").
excitatory_on_cells = get_and_sort_cells(odor_responsive_cells, [0, 2])  # 0 and 2 ONLY
odor_responsive_cells = odor_responsive_cells.drop(excitatory_on_cells.index)

excitatory_off_cells = get_and_sort_cells(odor_responsive_cells, [0, 4])  # 0 and 4 ONLY
odor_responsive_cells = odor_responsive_cells.drop(excitatory_off_cells.index)

excitatory_combo_cells = get_and_sort_cells(odor_responsive_cells, [0, 2, 4])  # 0, 2 AND 4 ONLY
odor_responsive_cells = odor_responsive_cells.drop(excitatory_combo_cells.index)

inhibitory_on_cells = get_and_sort_cells(odor_responsive_cells, [0, 1])  # 0 and 1 ONLY
odor_responsive_cells = odor_responsive_cells.drop(inhibitory_on_cells.index)

inhibitory_off_cells = get_and_sort_cells(odor_responsive_cells, [0, 3])  # 0 and 3 ONLY
odor_responsive_cells = odor_responsive_cells.drop(inhibitory_off_cells.index)

inhibitory_combo_cells = get_and_sort_cells(odor_responsive_cells, [0, 1, 3])  # 0, 1 AND 3 ONLY
odor_responsive_cells = odor_responsive_cells.drop(inhibitory_combo_cells.index)

# Rule 8: whatever is left mixes excitatory and inhibitory responses
any_combo_cells = sort_by_response_number(odor_responsive_cells)

# Stack the groups in the order given by the sorting rules
_groups_in_rule_order = [
    nonresponsive_cells,
    excitatory_on_cells,
    excitatory_off_cells,
    excitatory_combo_cells,
    inhibitory_on_cells,
    inhibitory_off_cells,
    inhibitory_combo_cells,
    any_combo_cells,
    buzzer_cells,
    MO_cells,
]
sorted_dataframe = pd.concat(_groups_in_rule_order)

# Column order: the decoding labels followed by Buzzer and MO.
# NOTE(review): _labels only holds odor names when ANALYSIS_VARIABLE == 'odor'; confirm this cell is not
# expected to run for 'class' or 'block'.
_order = np.hstack([_labels, ['Buzzer', 'MO']])
sorted_by_odor = sorted_dataframe[_order]

sorted_sig_table_path = project_folder.analysis_dir.output_dir.path.joinpath('sorted_significance_table.xlsx')
sorted_by_odor.to_excel(sorted_sig_table_path)