# Data Grepping Notebook
### Jeremy Delahanty June 2021

Intended to grep different files/projects/datasets based on user input and retain them for later use in analysis/display. The lack of a unified file-naming structure between projects will break the code... A convention of XXX### for animal names, or something similar, should be adopted for all animals in the lab.

In [161]:
# Import packages
from pathlib import Path
import pathlib
import glob
import re
import json
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd


In [4]:
# Base path of the lab server as mapped on this machine ("Y:/" is also the
# default base path used by grep_teams()).
lab_basepath = "Y:/"
# project_dict = {"specialk": ["learned_helplessness", "chronic_mild_stress"]}

In [619]:
def grep_teams(team_selection=None, lab_basepath="Y:/"):
    """
    Grabs team list from server based on user's input.

    User can define which teams they want to use for their analyses and
    the function will glob the paths for their selection.

    Parameters
    ----------
    team_selection: list of str, str, or None
        Names of the teams of interest. A single name may be given as a
        plain string. None or an empty selection means "every team".
        Default is None.
    lab_basepath: string
        Basepath for server location on machine
        Default is Y:/ for mapped Windows drive

    Returns
    -------
    1. list
        List of team Paths grabbed from server successfully, sorted by path
    2. list
        List of requested team names that were not found, in the order they
        were requested (duplicates removed)
    """

    # Normalise the request: a bare string is one team name, and any empty
    # value (None, [], ()) means that no specific team was asked for.
    if isinstance(team_selection, str):
        requested = [team_selection]
    elif team_selection:
        requested = list(team_selection)
    else:
        requested = []

    # Every directory directly under the basepath is a candidate team.
    # Sorting makes the result independent of the filesystem's listing order.
    candidate_dirs = sorted(
        entry for entry in Path(lab_basepath).glob("*") if entry.is_dir()
    )

    if not requested:
        # No team was specifically asked for, so return all of them
        print("Gathering all teams...")
        team_list = candidate_dirs
    else:
        wanted = set(requested)
        team_list = [team for team in candidate_dirs if team.name in wanted]

    # Work out which requested teams were not found, keeping request order
    found_names = {team.name for team in team_list}
    missing_teams = [
        name for name in dict.fromkeys(requested) if name not in found_names
    ]

    if not missing_teams:
        print("Found All Selected Teams")
    else:
        print("Failed to find team(s):", missing_teams)

    # Show user which teams were returned
    print("Teams Returned:")
    for team in team_list:
        print("{} ".format(team.name))

    return team_list, missing_teams

# Example run: request only the "specialk" team from the default base path.
team_list, missing_teams = grep_teams(["specialk"])

Found All Selected Teams
Teams Returned:
specialk 


In [620]:
def choose_projects(team_list, project_selection=None):
    """
    Generates project paths list based on user's selection.

    User can define which project they want to use for their analyses and
    this function generates the paths for their selection.

    Parameters
    ----------
    team_list: list
        List of team Paths of interest from grep_teams()
    project_selection: dict or None
        Maps a team's directory name to the project name(s) wanted for
        that team, e.g. {"specialk": ["learned_helplessness"]}. A single
        project may be given as a plain string. Teams without an entry
        contribute no paths. Default is None (no projects selected).

    Returns
    -------
    1. list
        List of team/project Paths to grep in next steps
    """

    selection = project_selection or {}

    project_dir_list = []

    # dict.fromkeys drops repeated teams while keeping their original order
    for team in dict.fromkeys(team_list):
        projects = selection.get(team.name)

        # A team with nothing selected is reported and skipped, not an error
        if projects is None:
            print("No projects selected for team:", team.name)
            continue

        # A bare string is one project name, not a sequence of characters
        if isinstance(projects, str):
            projects = [projects]

        project_dir_list.extend(team / project for project in projects)

    print("Returned Directories: ")

    for directory in project_dir_list:
        print(directory)

    return project_dir_list
    
# Example run: select the "learned_helplessness" project for team "specialk".
project_list = choose_projects(team_list, project_selection={"specialk": ["learned_helplessness"]})

Returned Directories: 
Y:\specialk\learned_helplessness


In [623]:
def choose_animals(project_list, animal_group="all"):
    """
    Generates animal paths list based on user's selection.

    User can define which cohort of animals they want to use
    for their analyses. This function generates the paths for
    their selection that meet specified conditions.

    Parameters
    ----------
    project_list: list
        List of project Paths of interest from choose_projects()
    animal_group: str
        Which animals to gather. "all" (the default) keeps every
        directory found in each project. Any other value is inserted
        as-is into the pattern ``[A-Z]{2}<animal_group>\\d{3}`` and only
        directories whose whole name matches it are kept (for example
        "E" keeps LHE011 but not LHC001).

    Returns
    -------
    1. list
        List of team/project/animal Paths to grep in next steps. Only
        directories that exist on disk are returned, sorted by path
        within each project.
    """

    if animal_group == "all":
        print("Grabbing all animals...")
        name_pattern = None
    else:
        print("Grabbing only {} animals...".format(animal_group))

        # Raw strings keep \d a regex escape rather than a string escape
        name_pattern = re.compile(r"[A-Z]{2}" + animal_group + r"\d{3}")

    animal_list = []

    for project_dir in project_list:

        # Sorted so the result does not depend on filesystem listing order
        for animal in sorted(project_dir.glob("*")):

            # Animals are directories; skip stray files in the project folder
            if not animal.is_dir():
                continue

            # The whole name must match, and the real directory is appended,
            # so every returned path exists and none is repeated.
            if name_pattern is not None and name_pattern.fullmatch(animal.name) is None:
                continue

            print(project_dir.name, animal.name)
            animal_list.append(animal)

    return animal_list

# Example run: keep only the animals of group "E" in the selected project(s).
animal_list = choose_animals(project_list, animal_group="E")

Grabbing only E animals...
learned_helplessness LHE011
learned_helplessness LHE012
learned_helplessness LHE013
learned_helplessness LHE014
learned_helplessness LHE015
learned_helplessness LHE016


In [624]:
def _print_data_request(animal_list, data_group):
    """Print which data types, projects, teams and animals were requested."""
    print("Grabbing...")
    for data_type in data_group:
        print(data_type, "data")

    # dict.fromkeys removes repeats while keeping first-seen order
    print("\nFrom Projects(s)...")
    for project in dict.fromkeys(animal.parent.name for animal in animal_list):
        print(project)

    print("\nIn Team(s)...")
    for team in dict.fromkeys(animal.parent.parent.name for animal in animal_list):
        print(team)

    print("\nFor Animals...")
    for animal in animal_list:
        print(animal.name)


def choose_data(animal_list, data_group=None, verbose=True):
    """
    Generates animal's data paths list based on user's selection.

    User can define which dataset to use for the animals they
    want to use for their analyses. This function generates the
    paths for their selection that meet specified conditions.

    Parameters
    ----------
    animal_list: list
        List of Paths for animals of interest from choose_animals()
    data_group: list of str, str, or None
        Names of the data folders to gather for every animal. A single
        name may be given as a plain string. None or an empty list (the
        default) gathers every folder found inside each animal folder.
    verbose: bool
        When data_group is given, also print the full request and a
        Found / Not Found line per animal and data type. Default is True.

    Returns
    -------
    1. list
        List of team/project/animal/dataset Paths to grep in
        next steps
    """

    # A bare string is one data type, not a sequence of characters
    if isinstance(data_group, str):
        data_group = [data_group]

    data_list = []

    if not data_group:
        print("Grabbing all data folders...")

        for animal in animal_list:

            # Sorted so the result does not depend on filesystem listing order
            for data_dir in sorted(animal.glob("*")):

                # Only folders count as datasets, as in the explicit branch
                if not data_dir.is_dir():
                    continue

                print("Grabbing", animal.name, data_dir.name)
                data_list.append(data_dir)

    else:
        if verbose:
            _print_data_request(animal_list, data_group)
            print("\nChecking for directories...")
        else:
            print("Grabbing specified directories...\n")

        for animal in animal_list:
            for data_type in data_group:
                data_dir = animal / data_type

                if data_dir.is_dir():
                    if verbose:
                        print("Found", animal.name, data_type)
                    data_list.append(data_dir)
                elif verbose:
                    print("Not Found!", animal.name, data_type)
                else:
                    print(animal.name, data_type, "Not Found!")

    # Tell user which directories were returned
    print("\nReturning Directories:")
    for data_dir in data_list:
        print(data_dir)

    return data_list
                

# Example run: collect each selected animal's "twop" folder, without the verbose report.
data_list = choose_data(animal_list, data_group=["twop"], verbose=False)

Grabbing specified directories...


Returning Directories:
Y:\specialk\learned_helplessness\LHE011\twop
Y:\specialk\learned_helplessness\LHE012\twop
Y:\specialk\learned_helplessness\LHE013\twop
Y:\specialk\learned_helplessness\LHE014\twop
Y:\specialk\learned_helplessness\LHE015\twop
Y:\specialk\learned_helplessness\LHE016\twop


In [625]:
def grep_twop_behavior_raw(data_list, session_type=[]):
    """
    Collects the raw behavior CSV files found under each data folder.

    Parameters
    ----------
    data_list: list
        List of data folder Paths, e.g. from choose_data()
    session_type: list
        Not used by the current implementation.

    Returns
    -------
    1. list
        Paths matching <data folder>/*/*raw_behavior*/*.csv, grouped in
        the order of data_list
    """
    return [
        csv_path
        for data_dir in data_list
        for csv_path in data_dir.glob("*/*raw_behavior*/*.csv")
    ]

# Gather the raw behavior CSV files found under the selected data folders.
twop_raw_beh_list = grep_twop_behavior_raw(data_list)

In [626]:
def grep_twop_behavior_config(twop_raw_beh_list):
    """
    Collects the configuration JSON files that belong to raw behavior files.

    For every raw behavior file, the folder two levels above it is searched
    for JSON files. Files ending in "_aligned.json" are skipped: those are
    alignment outputs written into the same folder, not configurations.

    Parameters
    ----------
    twop_raw_beh_list: list
        List of raw behavior file Paths from grep_twop_behavior_raw()

    Returns
    -------
    1. list
        Configuration file Paths, grouped in the order of the input list
        and sorted by path within each folder. A folder referenced by
        several raw files contributes its files once per raw file.
    """

    twop_config_list = []

    for raw_file in twop_raw_beh_list:

        # Two levels up from <session>/<raw_behavior folder>/<file>.csv
        session_dir = raw_file.parent.parent

        twop_config_list.extend(
            candidate
            for candidate in sorted(session_dir.glob("*.json"))
            if not candidate.name.endswith("_aligned.json")
        )

    return twop_config_list
            
# Gather the JSON files located two folders above each raw behavior CSV.
twop_config_list = grep_twop_behavior_config(twop_raw_beh_list)

In [627]:
# Suffix of the files written by align_2p_behavior()
_ALIGNED_SUFFIX = "_aligned.json"

# <8-digit date>_<3 capital letters><3 digits>, e.g. 20210603_LHE011
_NAME_DATE_PATTERN = re.compile(r"\d{8}_[A-Z]{3}\d{3}")

# Imaging plane tag, e.g. _plane0
_PLANE_PATTERN = re.compile(r"_plane\d{1}")

# Columns of the raw behavior CSV that are turned into On/Off event lists
_BEHAVIOR_CHANNELS = ("Lick", "Airpuff", "Liquid", "Speaker")

# Samples at or below this voltage are treated as "no signal"
_SIGNAL_THRESHOLD_V = 3


def _aligned_path_for(raw_file):
    """Return the Path of the aligned JSON file that belongs to raw_file."""
    name_date = _NAME_DATE_PATTERN.search(raw_file.name)
    plane = _PLANE_PATTERN.search(raw_file.name)

    if name_date is None or plane is None:
        raise ValueError(
            "Cannot build aligned file name, expected "
            "'<YYYYMMDD>_<XXX###>' and '_plane<#>' in: {}".format(raw_file.name)
        )

    aligned_name = name_date.group(0) + plane.group(0) + _ALIGNED_SUFFIX

    # The aligned file lives two levels above the raw file
    return raw_file.parents[1] / aligned_name


def _find_config_file(session_dir):
    """Return the configuration JSON in session_dir, ignoring aligned outputs."""
    candidates = sorted(
        path
        for path in session_dir.glob("*.json")
        if not path.name.endswith(_ALIGNED_SUFFIX)
    )

    if not candidates:
        raise FileNotFoundError(
            "No configuration .json file found in: {}".format(session_dir)
        )

    return candidates[0]


def _extract_event_times(raw_file):
    """Return a dict of On/Off timestamp lists for each behavior channel."""
    raw_behavior_df = pd.read_csv(raw_file, index_col="Time(ms)")
    raw_behavior_df = raw_behavior_df.rename(columns=lambda col: col.strip())

    # Binarise: above threshold -> 1, everything else -> 0. Integers are
    # needed so diff() yields -1 where a signal stops.
    is_on = (raw_behavior_df > _SIGNAL_THRESHOLD_V).astype(int)

    # +1 marks the sample where a signal starts, -1 where it stops; the
    # first row has no predecessor (NaN) and is treated as "no change".
    edges = is_on.diff().fillna(0)

    events = {}
    for channel in _BEHAVIOR_CHANNELS:
        events[channel + "On"] = edges[edges[channel] == 1].index.tolist()
        events[channel + "Off"] = edges[edges[channel] == -1].index.tolist()

    return events


def align_2p_behavior(twop_raw_beh_list, twop_config_list, twop_microscopy_list=None):
    """
    Converts raw behavior recordings into aligned event-timestamp files.

    For every raw behavior CSV the matching
    <date>_<animal>_plane<#>_aligned.json file is looked up two folders
    above the CSV. If it exists it is reused; otherwise the CSV is
    thresholded, the start/stop times of the Lick, Airpuff, Liquid and
    Speaker signals are extracted, the "trialArray" of the configuration
    JSON in that same folder is stored as "trialTypes", and the result
    is written to the aligned file.

    Parameters
    ----------
    twop_raw_beh_list: list
        List of raw behavior CSV Paths from grep_twop_behavior_raw()
    twop_config_list: list
        Not used by the current implementation; the configuration file
        is looked up next to each session instead.
    twop_microscopy_list: list or None
        Not used by the current implementation. Default is None.

    Returns
    -------
    1. list
        Aligned JSON Paths: the ones that already existed first, then
        the ones written by this call

    Raises
    ------
    ValueError
        If a raw file name does not contain the date/animal and plane tags
    FileNotFoundError
        If a session that needs alignment has no configuration JSON
    """

    # TODO: Use Kyle's code to align these files with the relative timestamps of microscopy
    # TODO: Make force overwrite/recompile the different datasets
    # TODO: Make verbose version of this function
    needs_alignment = []

    aligned_list = []

    # First, check which raw files already have their own aligned file.
    # The check is per raw file, so a session with several planes is only
    # skipped for the planes that were really processed before.
    for raw_file in twop_raw_beh_list:
        aligned_filename = _aligned_path_for(raw_file)

        if aligned_filename.is_file():
            print("Session already aligned:", aligned_filename)
            aligned_list.append(aligned_filename)
        else:
            print("Session Needs Alignment:", raw_file.parents[3].name, raw_file.parents[1].name)
            needs_alignment.append((raw_file, aligned_filename))

    # Now, do the cleaning
    for raw_file, aligned_filename in needs_alignment:
        print("Aligning:", raw_file.name)

        aligned = _extract_event_times(raw_file)

        # Aligned files written earlier in this loop are excluded from the
        # lookup, so the real configuration is always the one that is read.
        config_file = _find_config_file(raw_file.parents[1])

        with open(config_file, "r") as in_file:
            aligned["trialTypes"] = json.load(in_file)["trialArray"]

        with open(aligned_filename, "w") as out_file:
            json.dump(aligned, out_file)

        # Tell user the file has been written
        print("Written", aligned_filename)
        aligned_list.append(aligned_filename)

    return aligned_list

# Run the alignment; sessions that already contain an *_aligned.json file are reported and reused.
aligned_list = align_2p_behavior(twop_raw_beh_list, twop_config_list)

Session already aligned: Y:\specialk\learned_helplessness\LHE011\twop\20210603\20210603_LHE011_plane0_aligned.json
Session already aligned: Y:\specialk\learned_helplessness\LHE011\twop\20210604\20210604_LHE011_plane0_aligned.json
Session already aligned: Y:\specialk\learned_helplessness\LHE011\twop\20210605\20210605_LHE011_plane0_aligned.json
Session already aligned: Y:\specialk\learned_helplessness\LHE011\twop\20210607\20210607_LHE011_plane0_aligned.json
Session already aligned: Y:\specialk\learned_helplessness\LHE011\twop\20210608\20210608_LHE011_plane0_aligned.json
Session already aligned: Y:\specialk\learned_helplessness\LHE011\twop\20210611\20210611_LHE011_plane0_aligned.json
Session already aligned: Y:\specialk\learned_helplessness\LHE011\twop\20210612\20210612_LHE011_plane0_aligned.json
Session already aligned: Y:\specialk\learned_helplessness\LHE012\twop\20210603\20210603_LHE012_plane0_aligned.json
Session already aligned: Y:\specialk\learned_helplessness\LHE012\twop\20210604\2

In [636]:
# Keys of the aligned file that do not get a column in the trial table
_TRIAL_TABLE_EXCLUDED_KEYS = ("LickOn", "LickOff", "LiquidOff", "AirpuffOff", "trialTypes")

# NOTE: work in progress - the session loop stops after this many files
_SESSION_PREVIEW_LIMIT = 3


def _build_trial_table(timestamps):
    """Build the per-trial DataFrame for one aligned session dictionary."""
    # Label each trial as the n-th sucrose or the n-th airpuff trial
    sucrose_count = 0
    airpuff_count = 0
    labels = []
    for trial in timestamps["trialTypes"]:
        if trial == 1:
            sucrose_count += 1
            labels.append("Sucrose_" + str(sucrose_count))
        else:
            airpuff_count += 1
            labels.append("Airpuff_" + str(airpuff_count))

    trial_names = ["Trial_" + str(number) for number in range(1, len(labels) + 1)]
    trial_df = pd.DataFrame(
        {"trial_type": labels},
        index=pd.Index(trial_names, name="trial"),
    )

    # One empty column per remaining key, in the order stored in the file
    for key in timestamps:
        if key not in _TRIAL_TABLE_EXCLUDED_KEYS:
            trial_df[key] = np.nan

    # Speaker events are matched to trials by position: the n-th event
    # belongs to Trial_n. Surplus events are ignored, missing ones stay NaN.
    for key in ("SpeakerOn", "SpeakerOff"):
        events = timestamps[key]
        trial_df[key] = np.array(
            [
                events[position] if position < len(events) else np.nan
                for position in range(len(labels))
            ],
            dtype=float,
        )

    # Liquid/airpuff onsets are matched by trial type: the n-th LiquidOn
    # belongs to Sucrose_n and the n-th AirpuffOn to Airpuff_n.
    for key, prefix in (("LiquidOn", "Sucrose_"), ("AirpuffOn", "Airpuff_")):
        onset_by_label = {
            prefix + str(number): onset
            for number, onset in enumerate(timestamps[key], start=1)
        }
        trial_df[key] = np.array(
            [onset_by_label.get(label, np.nan) for label in labels],
            dtype=float,
        )

    # Placeholder column, not filled in yet
    trial_df["Licks"] = np.nan

    return trial_df


def gen_session_lick_psth(aligned_list, alignment_positions=None, window=5):
    """
    Builds the per-trial table of event timestamps for aligned sessions.

    Each aligned JSON file is read and turned into a DataFrame indexed by
    trial ("Trial_1", "Trial_2", ...) with a "trial_type" column
    ("Sucrose_n" / "Airpuff_n"), one column per event key of the file that
    is not excluded (AirpuffOn, LiquidOn, SpeakerOn, SpeakerOff for files
    written by align_2p_behavior()) and an empty "Licks" column.

    NOTE: this function is work in progress. It processes the files in
    order, and at the third file it prints that file's table and lick
    onsets and stops. The table of the last processed file is returned.

    Parameters
    ----------
    aligned_list: list
        List of aligned JSON file Paths from align_2p_behavior()
    alignment_positions: list or None
        Not used by the current implementation. Default is None.
    window: int
        Not used by the current implementation. Default is 5.

    Returns
    -------
    1. pandas.DataFrame
        Trial table of the last processed file; an empty DataFrame if
        aligned_list is empty
    """

    # Returned unchanged when there is nothing to process
    trial_df = pd.DataFrame()

    # TODO: Each segment should be its own function call ie liquidon, speakers, airpuffon
    for session_number, aligned_file in enumerate(aligned_list, start=1):

        with open(aligned_file, "r") as in_file:
            timestamps = json.load(in_file)

        trial_df = _build_trial_table(timestamps)

        lick_timestamps = list(timestamps["LickOn"])

        if session_number == _SESSION_PREVIEW_LIMIT:
            print(trial_df)
            print(lick_timestamps)
            break

    return trial_df


# Build the trial table from the aligned files and keep the returned DataFrame.
df = gen_session_lick_psth(aligned_list)

          trial_type  AirpuffOn   LiquidOn  SpeakerOn  SpeakerOff  Licks
trial                                                                   
Trial_1    Sucrose_1        NaN    31267.0    28979.0     31267.0    NaN
Trial_2    Sucrose_2        NaN    62181.0    60437.0     62180.0    NaN
Trial_3    Sucrose_3        NaN    98013.0    95535.0     98013.0    NaN
Trial_4    Sucrose_4        NaN   131047.0   129676.0    131047.0    NaN
Trial_5    Sucrose_5        NaN   160455.0   158009.0    160455.0    NaN
Trial_6    Sucrose_6        NaN   189489.0   185199.0    189489.0    NaN
Trial_7    Sucrose_7        NaN   220157.0   215185.0    220156.0    NaN
Trial_8    Sucrose_8        NaN   250916.0   249565.0    250915.0    NaN
Trial_9    Sucrose_9        NaN   276457.0   275249.0    276457.0    NaN
Trial_10  Sucrose_10        NaN   304317.0   302795.0    304317.0    NaN
Trial_11  Sucrose_11        NaN   336235.0   335209.0    336234.0    NaN
Trial_12  Sucrose_12        NaN   371498.0   368739

In [601]:

# tdf = df.query('trial_type.str.contains("Sucrose")', engine="python") query_example

#         >>> df1.set_index('Code', inplace=True)
# >>> df1.update(df2.set_index('Code'))
# newdf = newdf.reset_index().set_index("trial_type")
# sdf= sdf.reset_index()
# newdf = newdf.reset_index()
# # display(newdf, sdf)
# a = newdf["LiquidOn"].values
# b = sdf["LiquidOn"].values

# # display(a)
# # display(b)
# newdf = newdf.replace(b, a)

# # Selecting old value
# a = df['first_set'][4]
 
# # Selecting new value
# b = df1['first_set'][1]
 
# # replace values of one DataFrame with
# # the value of another DataFrame
# df = df.replace(a,b)
 
# # Display the Output
# display(df)
# newdf["LiquidOn"] = sdf[sdf["trial_type"].isin(newdf["trial_type"])]["LiquidOn"].values
# df1['Marks'] = df2[df2['Name'].isin(df1['Name'])]['Marks'].values
# df['second_set'] = df1.replace(df['first_set'],df['second_set'])
# newdf["LiquidOn"] = newdf.replace(newdf["LiquidOn"], sdf["LiquidOn"]) # works but gives index instead of values
# pandas.concat([X[X.columns - Y.columns], Y], axis=1)
# df1[df1['col'].str.contains('foo', regex=False)]
# df.loc[df.query(query).index(),'ColZ']=Z