# Dewan Lab EPM Analysis

## STEP 1: Always Execute! Load Libraries and User Settings
### STEP 1A: Import Libraries

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
os.environ['ISX'] = '0'  # Set to zero so we don't try to load the isx module

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm, trange

from dewan_calcium import plotting, deconv
from dewan_calcium.helpers import IO, parse_json, HFvFM
from dewan_calcium.helpers.project_folder import ProjectFolder

print("Importing required packages complete!")

### STEP 1B: User Configurables

In [None]:
# Identifiers for this recording session. They are joined into `file_header`,
# the prefix passed to every IO save/load call and used in output file names.
animal = 'ANIMAL_GOES_HERE'
date = 'DATE_GOES_HERE'

# Passed to HFvFM.get_trial_labels() when the trial labels are generated.
# NOTE(review): presumably True means the HF block was recorded before the FM block - confirm.
HF_first = True

# NOTE(review): the two values below are only referenced by the commented-out
# pseudotrial calls in STEP 6; presumably seconds and frames per second - confirm.
PSEUDOTRIAL_LEN_S = 2
ENDOSCOPE_FRAMERATE = 10
DECAY_TIME_S = 0.4  # Time in seconds for the decay of 10 action potentials (0.4 for GCaMP6f)
RISE_TIME_S = 0.08  # Time in seconds for the rise to peak of 10 action potentials (0.08 for GCaMP6f)

INTER_SPIKE_INTERVAL_S = 0.1 # Time in seconds that must elapse before another "spike"
PEAK_MIN_DUR_S = 0.4  # Time in seconds that must elapse for a "peak" to be considered a "spike"

### STEP 1C: Load Project Folder

In [None]:
# Build the ProjectFolder object that gathers and holds all of the file paths.
# Alternate location of the test data set:
# test_data = "D:\\Test_Data\\HFvFM"
test_data = "C:\\Projects\\Test_Data\\HFvFM"
project_folder = ProjectFolder(
    'HFvFM',
    project_dir=test_data,
)

# Prefix of the form "<animal>-<date>-" shared by the files saved and loaded below
file_header = f'{animal}-{date}-'

In [None]:
# If this is the first time the project folder has been created,
# move the files to the appropriate directories and then run this cell; otherwise skip this cell.
project_folder.get_data()

In [None]:
# Read the imaging-session settings from the session JSON file and echo them for the user
session_json_path = project_folder.raw_data_dir.session_json_path
session_settings = parse_json.get_session_settings(session_json_path)
gain, LED_power, focal_planes = session_settings

print(f'Recording Gain: {gain}')
print(f'LED Power: {LED_power}')
print(f'Focal Plane(s): {focal_planes}')

## STEP 2A: Import and pre-process the raw data

In [None]:
# STEP 2A.2: LOAD INSCOPIX DATA
inscopix_dir = project_folder.inscopix_dir

# The three tables are CSV files parsed with the pyarrow engine
cell_trace_data = pd.read_csv(inscopix_dir.cell_trace_path, engine='pyarrow')
GPIO_data = pd.read_csv(inscopix_dir.GPIO_path, header=0, engine='pyarrow')
all_cell_props = pd.read_csv(inscopix_dir.props_path, header=0, engine='pyarrow')

# The cell outline coordinates come from the contours file
cell_outlines = parse_json.get_outline_coordinates(inscopix_dir.contours_path)

In [None]:
# STEP 2A.3: PREPROCESSING

# STEP 2A.3.1: The row labelled 0 only holds the Inscopix default 'undecided' labels, so discard it.
cell_trace_data = cell_trace_data.drop([0])

# STEP 2A.3.2: Coerce every entry to a number (anything that cannot be parsed becomes NaN)
cell_trace_data = cell_trace_data.apply(pd.to_numeric, errors='coerce')

# The first column holds the times: round them to 2 decimal places and make them
# the index so that the table body contains only dF/F values
time_column = cell_trace_data.columns[0]
cell_trace_data[time_column] = cell_trace_data[time_column].round(2)
cell_trace_data = cell_trace_data.set_index(time_column)

# STEP 2A.3.3: Strip spaces from the column names and from the GPIO channel names
cell_trace_data.columns = cell_trace_data.columns.str.replace(" ", "")
GPIO_data.columns = GPIO_data.columns.str.replace(" ", "")
GPIO_data['ChannelName'] = GPIO_data['ChannelName'].str.replace(" ", "")

# STEP 2A.3.4: Keep only the cells that consist of exactly one component
single_component = all_cell_props['NumComponents'] == 1
all_cell_props = all_cell_props[single_component]
all_cell_props = all_cell_props.drop(columns='Status').reset_index(drop=True)
cell_names = all_cell_props['Name'].values

# STEP 2A.3.5: PARSE GPIO DATA (one table per channel)
channel_names = GPIO_data['ChannelName']
sniff_data = GPIO_data[channel_names == "GPIO-1"].reset_index(drop=True)
FV_data = GPIO_data[channel_names == "GPIO-2"].reset_index(drop=True)

# OPTIONAL UNUSED DATA
# running_data = GPIO_data[GPIO_data['ChannelName'] == "GPIO-3"]  # Running Wheel Data
# lick_data = GPIO_data[GPIO_data['ChannelName'] == "GPIO-4"]  # Lick Data

## STEP 2B: Manual Curation

In [None]:
from dewan_manual_curation import dewan_manual_curation

# Hand the loaded traces, properties and contours to the manual-curation GUI
curation_inputs = dict(
    project_folder_override=project_folder,
    cell_trace_data_override=cell_trace_data,
    cell_props_override=all_cell_props,
    cell_contours_override=cell_outlines,
)
curated_cells = dewan_manual_curation.launch_gui(**curation_inputs)

# A None result is only reported here; the notebook does not stop on it
if curated_cells is None:
    print('Error, no good cells selected!')

### STEP 2C: Apply Manual Curation Results and Additional Preprocessing

In [None]:
# STEP 2C.1: Restrict the properties table and the traces to the cells kept during manual curation

is_curated = all_cell_props['Name'].isin(curated_cells)
curated_cell_props = all_cell_props[is_curated].reset_index(drop=True)
curated_trace_data = cell_trace_data[curated_cells]
cell_names = curated_cell_props['Name']

### STEP 2D: Pickle and Save all preprocessed data

In [None]:
# Save the preprocessed tables in case they are needed later:
# curated cell traces, GPIO data, FV data, curated cell properties and sniff data.
# Once saved, this cell does not need to be re-run unless the data itself changes.

folder = project_folder.analysis_dir.preprocess_dir.path

# Maps the name each table is saved under to the table itself
# (the sniff data is stored under the name 'sniff_table')
preprocessed_outputs = {
    'curated_trace_data': curated_trace_data,
    'GPIO_data': GPIO_data,
    'FV_data': FV_data,
    'curated_cell_props': curated_cell_props,
    'sniff_table': sniff_data,
}
for output_name, output_data in preprocessed_outputs.items():
    IO.save_data_to_disk(output_data, output_name, file_header, folder)

### Checkpoint 1

In [None]:
# Reload the files written by STEP 2D. If those files already exist,
# the notebook can be re-run starting from this cell.

folder = project_folder.analysis_dir.preprocess_dir.path

# The loads run in the order the names are listed
curated_trace_data, GPIO_data, FV_data, curated_cell_props = (
    IO.load_data_from_disk(saved_name, file_header, folder)
    for saved_name in ('curated_trace_data', 'GPIO_data', 'FV_data', 'curated_cell_props')
)

cell_names = curated_cell_props['Name']  # List of cells, referenced periodically

### STEP 4: Isolate dF/F Data for Experiment

In [None]:
# STEP 4A: Locate the rising and falling edges of the final-valve (FV) TTL signal from the Arduino.
# There is no final valve in this experiment; the same sync signal used in the odor
# experiments is reused here to mark when the LED is triggered.

FV_values = FV_data['Value'].astype(float).values  # Get FV Values
num_values = len(FV_values)
value_steps = np.diff(FV_values)  # value_steps[i] == FV_values[i + 1] - FV_values[i]

FV_on_indexes = []
FV_off_indexes = []
valve_status = 0  # 0 = closed (starting state), 1 = open
for i in trange((num_values - 1), desc="Processing: "):
    step = value_steps[i]

    if valve_status == 1:
        # While open, only a very large negative step counts: the valve closed
        if step < -10000:
            FV_off_indexes.append(i)
            valve_status = 0
    elif step > 10000:
        # While closed, only a very large positive step counts: the valve opened
        FV_on_indexes.append(i + 1)
        valve_status = 1

# zip() pairs each 'On' with its 'Off'; an opening with no matching closing is dropped
FV_indexes = pd.DataFrame(zip(FV_on_indexes, FV_off_indexes), columns=['On', 'Off'])

In [None]:
# STEP 3B.1: Convert the On/Off sample positions into trial start and end times
time_points = FV_data['Time(s)']

# Positional lookup of the time stamp at every 'On' and every 'Off' sample
FV_on_times, FV_off_times = (
    time_points.iloc[FV_indexes[edge]] for edge in ('On', 'Off')
)

trial_times = pd.DataFrame(zip(FV_on_times, FV_off_times), columns=['Start', 'End'])

In [None]:
# STEP 4B: Find the dF/F frame positions that bracket each trial's FV On and Off times

time_points = curated_trace_data.index.values

# Last frame at or before the start time: we would rather start 1 frame early than late
cell_trace_start_indices = [
    np.flatnonzero(time_points <= trial_start)[-1]
    for trial_start in trial_times['Start']
]
# First frame at or after the end time: we would rather stop 1 frame late than early
cell_trace_stop_indices = [
    np.flatnonzero(time_points >= trial_end)[0]
    for trial_end in trial_times['End']
]

cell_trace_indices = pd.DataFrame(zip(cell_trace_start_indices, cell_trace_stop_indices), columns = ['Start', 'Stop'])

In [None]:
# Generate one label per trial and use the labels as the row index of every per-trial table
num_trials = trial_times.shape[0]
trial_labels = HFvFM.get_trial_labels(num_trials, HF_first)

for per_trial_table in (FV_indexes, trial_times, cell_trace_indices):
    per_trial_table.index = trial_labels

In [None]:
# STEP 4A: COMBINE ALL OF THE CELL TRACE DATA INTO ONE TABLE (PER CELL: ROWS ARE FRAMES, COLUMNS ARE TRIALS)
combined_data = []
num_cells = len(cell_names)
trial_windows = cell_trace_indices[['Start', 'Stop']].values  # One (start, stop) pair per trial

for cell in tqdm(cell_names, desc="Cell: "):  # Loop through each cell
    cell_trace = curated_trace_data[cell]

    # One trace per trial; .iloc[start:stop] does not include the frame at `stop`
    trial_traces = [
        cell_trace.iloc[start_index:stop_index].reset_index(drop=True)
        for start_index, stop_index in trial_windows
    ]

    # Rows are trials after construction; transpose so columns are trials and rows are frames
    cell_data = pd.DataFrame(trial_traces, index=trial_labels).T
    cell_data = cell_data.reset_index(drop=True)
    combined_data.append(cell_data)

# STEP 4B: JOIN THE PER-CELL TABLES SIDE BY SIDE UNDER A (CELL, TRIAL) COLUMN INDEX
# The second column level is named 'Frames' although it holds the trial labels.
combined_data = pd.concat(combined_data, axis=1, keys=cell_names, names=['Cells', 'Frames'])
# Cropping to the shortest trial is currently disabled, so shorter trials keep trailing NaN rows:
# combined_data = combined_data.dropna(axis=0)

In [None]:
# STEP 4C: BASELINE SHIFT THE DATA SO THERE ARE NO NEGATIVE NUMBERS
# The first .min() gives the minimum of each column; the second reduces those to the overall minimum
overall_minimum = combined_data.min().min()
min_value = abs(overall_minimum)
combined_data_shift = combined_data + min_value

## Step 4D: Save trace data

In [None]:
## Save the combined trace data
# Note: the table saved here is `combined_data`, not the baseline-shifted `combined_data_shift`.

folder = project_folder.analysis_dir.preprocess_dir.path
IO.save_data_to_disk(combined_data, 'combined_data', file_header, folder)

In [None]:
# STEP 5A: CREATE TABLE OF CONTENTS FOR CELL DESCRIPTORS
# One row per curated cell, indexed by the cell name
column_names = ['Name', 'CentroidX', 'CentroidY', 'NumComponents', 'Size']
toc = curated_cell_props[column_names]
toc = toc.set_index('Name', drop=True)

# STEP 5B: SET FILE PATH AND CREATE EXCEL-SHEET WRITER
file_name = f'{file_header}CombinedData.xlsx'
path = project_folder.analysis_dir.combined_dir.path.joinpath(file_name)

# The writer is used as a context manager so the workbook is always closed,
# even if one of the sheets fails to write part-way through.
with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
    # STEP 5C: WRITE TABLE OF CONTENTS
    toc.to_excel(writer, sheet_name='TOC')

    # STEP 5D: WRITE ALL CELL TRACE DATA (one sheet per cell, using the baseline-shifted data)
    for cell in tqdm(cell_names, desc="Writing Cell: "):
        _data = combined_data_shift[cell]
        _data.to_excel(writer, sheet_name=f'Cell {cell}')

## CHECKPOINT 2

In [None]:
# Reload the combined trace data and the curated cell properties from the preprocess folder.
# trial_labels is not restored here; it is only created in STEP 4.
folder = project_folder.analysis_dir.preprocess_dir.path
combined_data, curated_cell_props = (
    IO.load_data_from_disk(saved_name, file_header, folder)
    for saved_name in ('combined_data', 'curated_cell_props')
)
cell_names = curated_cell_props['Name']

## STEP 6: Create 'PSEUDOTRIALS'

In [None]:
# Build the smoothing kernel, then run the pooled deconvolution on the combined traces.
# NOTE(review): calc_smoothing_params() is called with no arguments, so DECAY_TIME_S,
# RISE_TIME_S and ENDOSCOPE_FRAMERATE from STEP 1B are not passed in here - confirm
# that its defaults match those settings.
smoothing_kernel = deconv.calc_smoothing_params()

# NOTE(review): smoothed_trace_data is not referenced by any later cell in this notebook.
smoothed_trace_data = deconv.pooled_deconvolution(combined_data, smoothing_kernel)


In [None]:
# pseudotrials_dff = HFvFM.get_dff_for_pseudotrials(combined_data, cell_names, trial_labels, PSEUDOTRIAL_LEN_S, ENDOSCOPE_FRAMERATE)
# pseudotrials_avg_dff = HFvFM.average_pseudotrials(pseudotrials_dff, cell_names, trial_labels)

## Step 6E: Save PSEUDOTRIALS

In [None]:
# Save the pseudotrial tables into the 'pseudotrials' output sub-folder.
# NOTE(review): pseudotrials_dff and pseudotrials_avg_dff are only assigned by the
# commented-out HFvFM calls in STEP 6, so this cell raises NameError until those
# calls are restored (or the variables are defined some other way).
folder = project_folder.analysis_dir.output_dir.subdir('pseudotrials')

IO.save_data_to_disk(pseudotrials_dff, 'pseudotrials_dff', file_header, folder)
IO.save_data_to_disk(pseudotrials_avg_dff, 'pseudotrials_avg_dff', file_header, folder)

### Checkpoint 3

In [None]:
# Reload the pseudotrial outputs and the curated cell properties.
# NOTE(review): none of the six names loaded from the 'pseudotrials' folder below are
# saved anywhere in this notebook (Step 6E saves 'pseudotrials_dff' and
# 'pseudotrials_avg_dff'); they look like leftovers from the EPM notebook - confirm
# which files this checkpoint should load.
folder = project_folder.analysis_dir.output_dir.subdir('pseudotrials')

pseudotrials = IO.load_data_from_disk('pseudotrials', file_header, folder)
trial_stats = IO.load_data_from_disk('trial_stats', file_header, folder)
transitions = IO.load_data_from_disk('transitions', file_header, folder)
arm_indexes = IO.load_data_from_disk('arm_indexes', file_header, folder)
pseudotrial_traces = IO.load_data_from_disk('pseudotrial_traces', file_header, folder)
# pseudotrial_means is the input to the auROC analysis in Step 7A
pseudotrial_means = IO.load_data_from_disk('pseudotrial_means', file_header, folder)

folder = project_folder.analysis_dir.preprocess_dir.path
curated_cell_props = IO.load_data_from_disk('curated_cell_props', file_header, folder)
cell_names = curated_cell_props['Name']

## Step 7A: auROC Analysis

In [None]:
from dewan_calcium import AUROC

# The two sets of trial labels that are compared against each other
hf_trials = ['HF-1', 'HF-2']
fm_trials = ['FM-1', 'FM-2']
groups = (hf_trials, fm_trials)

AUROC_results = AUROC.pooled_EPM_auroc(
    pseudotrial_means,
    groups,
    num_workers=20,
)

## Step 7B: Save auROC output

In [None]:
# Save the auROC results into the 'AUROC' output sub-folder so Checkpoint 4 can reload them
folder = project_folder.analysis_dir.output_dir.subdir('AUROC')
IO.save_data_to_disk(AUROC_results, 'AUROC_results', file_header, folder)

### Checkpoint 4

In [None]:
# Reload the auROC results and the inputs needed for the output and plotting cells
folder = project_folder.analysis_dir.output_dir.subdir('AUROC')
AUROC_results = IO.load_data_from_disk('AUROC_results', file_header, folder)

folder = project_folder.analysis_dir.preprocess_dir.path
curated_cell_props = IO.load_data_from_disk('curated_cell_props', file_header, folder)
cell_names = curated_cell_props['Name']
# NOTE(review): 'trimmed_trace_data' and 'background_image' are not saved by any cell
# in this notebook; they look like leftovers from the EPM notebook - confirm that
# these files exist for this project.
trimmed_trace_data = IO.load_data_from_disk('trimmed_trace_data', file_header, folder)
folder = project_folder.analysis_dir.preprocess_dir.subdir('EPM_ROI')
background_image = IO.load_data_from_disk('background_image', file_header, folder)


## Step 8: Output and Graph Results

In [None]:
## STEP 8A: Output auROC results

# One row per auROC result: [auROC, direction index, (lower bound, upper bound), significance].
# The direction index is 2 * (auROC - 0.5); both it and the auROC are rounded to 2 decimal places.
auroc_rows = [
    [
        round(result['auroc'], 2),
        round(2 * (result['auroc'] - 0.5), 2),
        (result['lb'], result['ub']),
        result['significance'],
    ]
    for result in AUROC_results
]

output_columns = ['auROC', 'direction_index', 'bounds', 'significant']
auroc_output = pd.DataFrame(auroc_rows, index=cell_names, columns=output_columns)

# Write the table to an Excel file in the output folder
folder = project_folder.analysis_dir.output_dir.path
file_name = f'{file_header}EPM_data_output.xlsx'
file_path = folder.joinpath(file_name)
auroc_output.to_excel(file_path)

In [None]:
## STEP 8B: Graph shuffle histograms and auROC histograms

# NOTE(review): `EPM` is not imported by any visible cell (STEP 1A imports only IO,
# parse_json and HFvFM from dewan_calcium.helpers), so the generate_position_lines
# call raises NameError as written - confirm the intended import.
coordinates = trimmed_trace_data['Coordinates'].values
line_coordinates = EPM.generate_position_lines(coordinates)
plotting.plot_EPM_auroc_histograms(AUROC_results, project_folder)
plotting.plot_epm_shuffles(AUROC_results, project_folder)
plotting.plot_animal_track(line_coordinates, background_image, project_folder) 