# Data Grepping Notebook
### Jeremy Delahanty June 2021

Intended to grep different files/projects/datasets based on user input and retain them for later use in analysis/display. The lack of a unified file-naming structure between projects will break the code. A convention of XXX### for animal names (for example, LHE011), or something similar, should be adopted for all animals in the lab.

In [1]:
# Import packages
from pathlib import Path
import pathlib
import glob
import re

In [2]:
# Root of the lab server share; "Y:/" is the same value grep_teams() uses as its default
lab_basepath = "Y:/"
# Example of a team-name -> project-folder-names mapping, the shape choose_projects() accepts
# project_dict = {"specialk": ["learned_helplessness", "chronic_mild_stress"]}

In [3]:
def grep_teams(team_selection=None, lab_basepath="Y:/"):
    """
    Grabs team list from server based on user's input.

    User can define which teams they want to use for their analyses and
    the function will glob the paths for their selection.

    Parameters
    ----------
    team_selection: list of str, str, or None
        Names of the team directories of interest. A single string is
        treated as a one-team selection.
        Default is None, which (like an empty list) gathers every team.
    lab_basepath: string
        Basepath for server location on machine
        Default is Y:/ for mapped Windows drive

    Returns
    -------
    1. list
        List of team Paths grabbed from server successfully
    2. list
        List of requested team names not found, in the order requested
    """

    # Normalize the selection into a list. A bare string must be wrapped,
    # otherwise membership tests become substring tests and the "missing"
    # computation would operate on individual characters.
    if team_selection is None:
        selected_teams = []
    elif isinstance(team_selection, str):
        selected_teams = [team_selection]
    else:
        selected_teams = list(team_selection)

    # Glob everything directly under the basepath and keep only directories
    team_dirs = [entry for entry in Path(lab_basepath).glob("*") if entry.is_dir()]

    # Check if no team was specifically asked for, tell user we're gathering all teams
    if not selected_teams:
        print("Gathering all teams...")
        team_list = team_dirs

    else:
        # Keep only the directories the user asked for
        wanted = set(selected_teams)
        team_list = [team for team in team_dirs if team.name in wanted]

    # Names that were actually found on the server
    found_names = {team.name for team in team_list}

    # Requested names that were not found. dict.fromkeys() drops duplicate
    # requests while keeping the order the user asked in, so the report is
    # deterministic (a plain set difference is not).
    missing_teams = [
        name for name in dict.fromkeys(selected_teams) if name not in found_names
    ]

    # If the missing_teams list is empty, the program found all requested teams
    if not missing_teams:
        print("Found All Selected Teams")

    # Else, some teams weren't found. Tell the user which teams weren't found.
    else:
        print("Failed to find team(s):", missing_teams)

    # Show user which teams were returned
    print("Teams Returned:")
    for team in team_list:
        print("{} ".format(team.name))

    # Return the list of teams gathered and the names that were not found
    return team_list, missing_teams

# Example run: look up the "specialk" and "Kyle" team directories under the default basepath
team_list, missing_teams = grep_teams(["specialk", "Kyle"])

Found All Selected Teams
Teams Returned:
Kyle 
specialk 


In [4]:
def choose_projects(team_list, project_selection=None):
    """
    Generates project paths list based on user's selection.

    User can define which project they want to use for their analyses and
    this function generates the paths for their selection.

    Parameters
    ----------
    team_list: list
        List of team Paths of interest from grep_teams()
    project_selection: dict or None
        Dictionary mapping a team's directory name to the project folder
        name(s) wanted for that team. The value may be a list of names or
        a single string. Teams without an entry are skipped with a notice.
        Default is None, which selects no projects.

    Returns
    -------
    1. list
        List of team/project Paths to grep in next steps
    """

    # Avoid a shared mutable default argument
    if project_selection is None:
        project_selection = {}

    # Make empty project list
    project_dir_list = []

    # dict.fromkeys() visits each team once, in first-seen order, even if
    # the same team Path appears more than once in team_list
    for team in dict.fromkeys(team_list):

        # A team with no entry in the selection is reported and skipped
        # instead of aborting the whole call with a KeyError
        if team.name not in project_selection:
            print("No projects selected for team:", team.name)
            continue

        projects = project_selection[team.name]

        # A bare string is one project name, not a sequence of characters
        if isinstance(projects, str):
            projects = [projects]

        # Build team/project for every requested project
        for project in projects:
            project_dir_list.append(team / project)

    print("Returned Directories: ")

    for directory in project_dir_list:
        print(directory)

    return project_dir_list
    
# Example run: map each team name to the project folder names to build paths for
project_list = choose_projects(team_list, project_selection={"specialk": ["learned_helplessness", "chronic_mild_stress"], "Kyle": ["test"]})

Returned Directories: 
Y:\Kyle\test
Y:\specialk\learned_helplessness
Y:\specialk\chronic_mild_stress


In [30]:
def choose_animals(project_list, animal_group="all"):
    """
    Generates animal paths list based on user's selection.

    User can define which cohort of animals they want to use
    for their analyses. This function generates the paths for
    their selection that meet specified conditions.

    Parameters
    ----------
    project_list: list
        List of project Paths of interest from choose_projects()
    animal_group: str
        Which animal paths to gather. "all" (the default) keeps every
        entry found in each project directory. Any other value is placed
        between "two capital letters" and "three digits" to form a regular
        expression that is searched for in each entry's name; the value is
        inserted as-is, so regex syntax such as "[EC]" is honored.

    Returns
    -------
    1. list
        List of team/project/animal Paths to grep in next steps
    """

    # Create empty animal list for path generation
    animal_list = []

    # If the animal group is left as default/specified as all, grab all animals
    if animal_group == "all":
        print("Grabbing all animals...")

        for project_dir in project_list:
            for animal in project_dir.glob("*"):
                print(project_dir.name, animal.name)
                animal_list.append(animal)

        return animal_list

    # Else, only select animals from the specified group
    print("Grabbing only {} animals...".format(animal_group))

    # Raw strings keep \d as a regex token rather than an invalid string
    # escape; compile once since the pattern is reused for every entry
    animal_pattern = re.compile(r"[A-Z]{2}" + animal_group + r"\d{3}")

    for project_dir in project_list:
        for animal in project_dir.glob("*"):

            # Skip entries whose name does not contain the requested pattern
            if animal_pattern.search(animal.name) is None:
                continue

            # Keep the path that was actually globbed. Rebuilding the path
            # from the matched text can point at a directory that does not
            # exist, or duplicate another animal, when the name carries
            # extra characters around the match.
            print(project_dir.name, animal.name)
            animal_list.append(animal)

    # Finally, return the list of animals
    return animal_list

# Example run: keep only animals whose names match two capital letters, "E", then three digits
animal_list = choose_animals(project_list, animal_group="E")

Grabbing only E animals...
learned_helplessness LHE011
learned_helplessness LHE012
learned_helplessness LHE013
learned_helplessness LHE014
learned_helplessness LHE015
learned_helplessness LHE016
chronic_mild_stress CSE001


In [13]:
def _print_selection_summary(animal_list, data_group):
    """Print the requested datasets plus the projects, teams, and animals involved."""
    print("Grabbing...")
    for data_type in data_group:
        print(data_type, "data")

    # Project name is the animal directory's parent; sorted for stable output
    print("\nFrom Projects(s)...")
    for project in sorted({animal.parent.name for animal in animal_list}):
        print(project)

    # Team name is two levels above the animal directory
    print("\nIn Team(s)...")
    for team in sorted({animal.parent.parent.name for animal in animal_list}):
        print(team)
    print("\nFor Animals...")
    for animal in animal_list:
        print(animal.name)


def choose_data(animal_list, data_group=None, verbose=True):
    """
    Generates animal's data paths list based on user's selection.

    User can define which dataset to use for the animals they
    want to use for their analyses. This function generates the
    paths for their selection that meet specified conditions.

    Parameters
    ----------
    animal_list: list
        List of Paths for animals of interest from choose_animals()
    data_group: list of str, str, or None
        Names of the dataset folders to gather for each animal. A single
        string is treated as one dataset name.
        Default is None, which (like an empty list) grabs every entry
        found inside each animal's directory.
    verbose: bool
        When specific datasets are requested, print a summary of the
        request and a Found / Not Found line per animal and dataset.
        When False only the datasets that were not found are reported.
        Default is True.

    Returns
    -------
    1. list
        List of team/project/animal/dataset Paths to grep in
        next steps
    """

    # A bare string is one dataset name, not a sequence of characters
    if isinstance(data_group, str):
        data_group = [data_group]

    # Create empty data list for path generation
    data_list = []

    # If data_group is left as default or specified as empty,
    # grab all folders
    if not data_group:
        print("Grabbing all data folders...")

        for animal in animal_list:
            for data_dir in animal.glob("*"):
                print("Grabbing", animal.name, data_dir.name)
                data_list.append(data_dir)

    else:
        if verbose:
            _print_selection_summary(animal_list, data_group)
            print("\nChecking for directories...")
        else:
            print("Grabbing specified directories...\n")

        # Keep animal/dataset only when that directory exists on disk
        for animal in animal_list:
            for data_type in data_group:
                data_dir = animal / data_type

                if data_dir.is_dir():
                    if verbose:
                        print("Found", animal.name, data_type)
                    data_list.append(data_dir)
                elif verbose:
                    print("Not Found!", animal.name, data_type)
                else:
                    print(animal.name, data_type, "Not Found!")

    # Tell user which directories were returned
    print("\nReturning Directories:")
    for data_dir in data_list:
        print(data_dir)

    return data_list
                

# Example run: collect the "twop" and "3chamber" folders per animal, without the verbose summary
data_list = choose_data(animal_list, data_group=["twop", "3chamber"], verbose=False)

Grabbing specified directories...

LHC001 twop Not Found!
LHC001 3chamber Not Found!
LHE012 3chamber Not Found!
LHE013 3chamber Not Found!
LHE014 3chamber Not Found!
LHE015 3chamber Not Found!
LHE016 3chamber Not Found!
CSC001 twop Not Found!
CSC001 3chamber Not Found!

Returning Directories:
Y:\specialk\learned_helplessness\LHE011\twop
Y:\specialk\learned_helplessness\LHE011\3chamber
Y:\specialk\learned_helplessness\LHE012\twop
Y:\specialk\learned_helplessness\LHE013\twop
Y:\specialk\learned_helplessness\LHE014\twop
Y:\specialk\learned_helplessness\LHE015\twop
Y:\specialk\learned_helplessness\LHE016\twop
