# This code organizes a large number of unorganized data files by subject ID and data type.

# User Input: 
Takes the working directory that contains all the data files, including any subdirectories that hold more data files. 
Takes the study title. For example, "LL001_asdfihooiu.eeg" has the study title LL, which stands for "latent learning", and 001 is the subject ID.  

# Function Output: 
Organizes the files by creating a folder for each subject id. Within each subject id folder, there will be subfolders for each data type (eeg, behavioral, eye-tracking, GSR, etc) 
Relocates the files into corresponding subject folders. 


In [603]:
# Example invocation used to test the functions.
# User inputs:
#   mypath      - directory that holds all the files (subdirectories are searched too)
#   studytitle  - abbreviation that identifies the study in each file name
#   datType_dic - dictionary that maps folder names to file-name markers
# Each value in the dictionary is a string that appears in a file name and identifies a data type.
# For example, "LL01_Eyes.csv" contains "Eyes", which marks eye-tracking data.
# Each key is the name of the folder that files of that data type are moved into.
# For example, to collect every file labeled "Eyes" in a folder named "eye", use the key "eye" for the value "Eyes".
# NOTE(review): in this export the call below appears before the function definitions;
# the cells that define create_dataFolders(), create_idFolder() and subID_return() must be run first.
mypath = "/Users/vividlife/Desktop/Spring 18/PSC290 Python/Data for Testing/Run 1"
studytitle = "LL"
datType_dic={'eeg':'LTR',"behavioral":"Sample","eye":"Eyes"}
# Call the main function, which sorts the files by subject ID and then by data type.
create_dataFolders(mypath,datType_dic,studytitle)


# See create_dataFolders() for details 

# This function calls create_idFolder()

This function runs after create_idFolder(). In other words, once all files have been moved into the right subject ID folders, this function creates a subfolder inside each subject folder for every data type (eeg, behavioral, eye-tracking, etc.), then moves each file in the subject ID folder into the subfolder that matches its data type. 
 
The value of each key can be either (1) a file format, e.g. ".eeg", ".csv", ".mat", or (2) a string that indicates a data type, e.g. "brainwave", "eyes". 

This function recognizes the format or the string in each file name and creates the corresponding data type folders, named after the keys specified by the user. 

In [604]:
def create_dataFolders(mypath,datType_dic,studytitle):
    """Sort the study's files by subject ID, then by data type.

    First calls create_idFolder() so that every relevant file sits in a
    folder named after its subject ID. Then, inside each subject folder,
    creates one subfolder per key of ``datType_dic`` and moves each file
    into the subfolder whose marker appears in the file name.

    Parameters
    ----------
    mypath : str
        Directory that holds the data files. It is resolved to an absolute
        path up front, so relative paths work and the process working
        directory is left untouched.
    datType_dic : dict
        Maps a subfolder name (key) to the substring (value) that identifies
        that data type in a file name, e.g. ``{"eye": "Eyes"}``.
    studytitle : str
        Study abbreviation; only files whose name contains it are moved.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If ``mypath`` is not an existing directory.
    """
    import os

    # Resolve once instead of calling os.chdir(): changing the working
    # directory leaks out of the function and breaks relative paths.
    root = os.path.abspath(mypath)
    if not os.path.isdir(root):
        raise FileNotFoundError("Data directory not found: " + root)

    # create_idFolder() moves the files into subject folders and returns the IDs
    unique_ID = create_idFolder(root, studytitle)
    subject_folders = [os.path.join(root, str(subject_id)) for subject_id in unique_ID]

    # Create every data type subfolder inside every subject folder
    for folder in subject_folders:
        for key in datType_dic:
            os.makedirs(os.path.join(folder, key), exist_ok=True)

    # Move each file that sits directly in a subject folder into its data type subfolder
    for folder in subject_folders:
        for entry in os.listdir(folder):
            source = os.path.join(folder, entry)
            # Skip subdirectories and unrelated files such as .DS_Store
            if not os.path.isfile(source) or studytitle not in entry:
                continue
            for key, marker in datType_dic.items():
                if marker in entry:
                    os.rename(source, os.path.join(folder, key, entry))
                    # The file is gone from `source` now; stop at the first
                    # matching marker so it is not renamed a second time.
                    break

    return


# See create_idFolder() for details 
# This function calls subID_return()
Search through every file in the directory and subdirectory 
For every file, check if it contains the study title 
If it does, then look for the subject id using subID_return function 
Check if the subject id folder has been created in the main working directory. If not, create one. 
Put the file into the folder.  

This function returns a list of the unique subject IDs for further reference

In [605]:
def create_idFolder(mypath,studytitle):
    """Move every file of the study into a folder named after its subject ID.

    Walks ``mypath`` and all of its subdirectories. Every file whose name
    contains ``studytitle`` is tagged with the subject ID returned by
    subID_return(); a folder ``mypath/<ID>`` is created for each ID and the
    file is moved into it.

    Parameters
    ----------
    mypath : str
        Directory to search; the subject folders are created directly in it.
    studytitle : str
        Study abbreviation used to recognize relevant files.

    Returns
    -------
    list of str
        The unique subject IDs, sorted. Empty when no file matches.

    Raises
    ------
    FileExistsError
        If two files would end up at the same destination (same file name
        and same subject ID). The check runs before anything is moved, so
        no file is relocated or overwritten in that case.
    """
    import os

    # Collect first and move afterwards, so the directory walk never
    # sees files that were relocated part-way through.
    relevant = []  # (source path, file name, subject ID)
    for path, subdirs, files in os.walk(mypath):
        for name in files:
            if studytitle not in name:
                continue
            subject_id = subID_return(name, studytitle)
            if not subject_id:
                # No ID could be extracted; leave the file where it is rather
                # than create a folder with an empty name.
                continue
            relevant.append((os.path.join(path, name), name, subject_id))

    unique_ID = sorted({subject_id for _, _, subject_id in relevant})

    # Plan every move and refuse to continue if two files claim one destination
    claimed = {}  # destination -> source
    for source, name, subject_id in relevant:
        destination = os.path.normpath(os.path.join(mypath, subject_id, name))
        if destination in claimed:
            raise FileExistsError(
                "Both " + claimed[destination] + " and " + source
                + " would be moved to " + destination
            )
        claimed[destination] = source

    # Now that all unique IDs are known, create a folder for each
    for subject_id in unique_ID:
        os.makedirs(os.path.join(mypath, subject_id), exist_ok=True)

    # Put each relevant file into its subject folder
    for destination, source in claimed.items():
        if os.path.normpath(source) == destination:
            continue  # already in the right place
        if os.path.exists(destination):
            raise FileExistsError("Destination already exists: " + destination)
        os.rename(source, destination)

    return unique_ID

# See subID_return() for more details 


There are 3 possible cases: 
1. There is a study title, id follows the study title. E.g: "LL001.eeg" 
2. There is a study title, id does not follow the study title. E.g: "LLasdofi0_001.eeg" 
3. There is no study title. E.g: "001_asdf_1.eeg" 

For Case 2: this code treats the first number that appears after the study title as the subject ID. For example, "LLasdfoi001_2.eeg" will have subject ID 001 rather than 2, because 001 is the first number that appears after "LL". 

For Case 3: this code treats the first appearance of number as the subject id.  

Read the first file's name. If the first entry is a number: case 3 
The first file should be the second file in the dirs list, since the first in the list is .DS_Store, which is irrelevant. 

In [606]:
def subID_return(filename, studytitle):
    """Return the subject ID contained in a file name, as a string.

    Three cases are handled:
      1. The ID directly follows the study title, e.g. "LL001.eeg" -> "001".
      2. The ID comes later in the name; the first run of digits after the
         study title is used, e.g. "LLasdfoi001_2.eeg" -> "001".
      3. The name starts with a digit; the leading run of digits is used,
         e.g. "001_asdf_1.eeg" -> "001".

    If the name does not start with a digit and does not contain the study
    title, the first run of digits anywhere in the name is used.

    Parameters
    ----------
    filename : str
        File name (not a full path) to inspect.
    studytitle : str
        Study abbreviation, e.g. "LL".

    Returns
    -------
    str
        The subject ID with leading zeros kept, or "" when the name is empty
        or no digits are found.
    """
    if not filename:
        return ""

    # Where the search for the first digit begins
    search_from = 0
    if not filename[0].isdigit():  # case 1 / case 2
        idx_studytitle = filename.find(studytitle)
        if idx_studytitle >= 0:
            # Skip the whole study title so a digit inside it is never
            # mistaken for the start of the ID.
            search_from = idx_studytitle + len(studytitle)

    # Index where the subject ID starts
    idx_subidstart = -1
    for j in range(search_from, len(filename)):
        if filename[j].isdigit():
            idx_subidstart = j
            break
    if idx_subidstart < 0:
        return ""

    # Index just past the subject ID; stopping at len(filename) keeps the
    # last digit when the ID runs to the very end of the name.
    idx_subidend = idx_subidstart
    while idx_subidend < len(filename) and filename[idx_subidend].isdigit():
        idx_subidend += 1

    return filename[idx_subidstart:idx_subidend]