# Data processing

In [None]:
import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

# Directory that process_files() searches recursively for .csv files.
# Set root directory here.
root_dir = 'data'

# Text searched for in a file name -> dog name reported for that file.
# Entries are tried in the order they are listed here.
dog_names = {'Rex': 'Rex', 'rex': 'Rex'}

# Text searched for in a file name -> position of the positive sample.
# Every key is the position label with a leading underscore.
positions = {'_' + label: label for label in ('T1', 'T2', 'T3', 'B')}

# Directory name to skip -> why it is skipped.
exclude_dirs = {
    'insert_dir_name': 'Insert reason for exclusion here',
}

# Text that marks a file name to skip -> why it is skipped.
exclude_file_text = {
    'insert_test': 'Insert reason for exclusion here',
}


def dog_name(file_name):
    ''' Look up which dog a file belongs to.

    Every key of dog_names is searched for in file_name, in dictionary
    order, and the value stored under the first key that occurs is
    returned. An empty string is returned when no key occurs.
    '''
    candidates = (dog_names[key] for key in dog_names if key in file_name)
    return next(candidates, '')


def position(file_name):
    ''' Look up the position of the positive sample for a file.

    The keys of positions are searched for in file_name in dictionary
    order; the value of the first key that occurs is returned. An empty
    string is returned when none of the keys occurs.
    '''
    for marker, label in positions.items():
        if marker in file_name:
            return label
    return ''


def exclude_dir(file):
    ''' Return True if this file should be excluded based on its directory.

    file is a pathlib path. The file is excluded when any directory above
    it, at any depth, matches one of the keys of exclude_dirs. Keys are
    matched with Path.match, so they may be plain directory names or glob
    patterns. False is returned when no directory matches.
    '''
    # Path.match has no recursive '**' wildcard, so instead of building one
    # pattern for the whole path every ancestor directory is tested on its
    # own. This catches excluded directories however deeply the file is
    # nested below them.
    for ancestor in file.parents:
        for d in exclude_dirs:
            # Path.match raises ValueError on an empty pattern; skip such keys.
            if d and ancestor.match(d):
                return True
    return False


def exclude_file(file):
    ''' Tell whether a file is excluded because of text in its name.

    Returns True when any key of exclude_file_text occurs in file.name,
    otherwise False.
    '''
    return any(text in file.name for text in exclude_file_text)


def last_three_underscores(file_name):
    ''' Return the positions of the last three underscores in the file name.

    The result is the tuple (third_from_last, second_from_last, last) of
    string indices. A position that does not exist, because the name holds
    fewer than three underscores, is reported as -1.
    '''
    found = []
    end = len(file_name)
    for _ in range(3):
        # A previous result of -1 must not be passed on as the end of the
        # next search: rfind would read it as "all but the last character"
        # and could report an underscore that was already counted.
        end = file_name.rfind('_', 0, end) if end >= 0 else -1
        found.append(end)
    last, last_m1, last_m2 = found
    return last_m2, last_m1, last


def run_and_pass_no(file_name):
    ''' Return the run number and pass number from the file name or return empty if not found.

    The pass number is the text between the last two underscores and the
    run number is the text between the two underscores before that, e.g.
    'x_5_2_.csv' gives (5, 2). If the pass number is not numeric, the name
    is cut at its last underscore and parsed once more, which handles an
    extra trailing underscore.

    Returns (run_no, pass_no) as ints. ('', '') is returned when either
    number cannot be parsed or when the pass number is outside 1..4.
    '''
    last_m2, last_m1, last = last_three_underscores(file_name)
    pass_no = file_name[last_m1+1:last]
    if not pass_no.isdigit():
        print(file_name, 'Pass number not found. Will try to handle any trailing underscore.')
        # Handle files with trailing underscore. Retrying only makes sense
        # when there is an underscore to cut at.
        if last >= 0:
            file_name = file_name[:last]
            last_m2, last_m1, last = last_three_underscores(file_name)
            pass_no = file_name[last_m1+1:last]
    try:
        pass_no = int(pass_no)
    except ValueError:
        # Not a number: report "not found" instead of aborting the caller.
        return '', ''

    run_no = file_name[last_m2+1:last_m1]
    if not run_no.isdigit():
        print(file_name)
    try:
        run_no = int(run_no)
    except ValueError:
        return '', ''

    if pass_no > 4 or pass_no < 1: # 4 passes do occur 
        print('############ Error suspect pass no is too high at ', pass_no, 'for file', file_name, '######################################')
        run_no = ''
        pass_no = ''
    return run_no, pass_no


def timestamp(file_name):
    ''' Parse the timestamp at the start of the file name.

    The first 16 characters of file_name are read as
    year_month_day-hour_minute and returned as a datetime.datetime.
    strptime raises ValueError when they do not have that form.
    '''
    stamp_format = '%Y_%m_%d-%H_%M'
    stamp_text = file_name[:16]
    return datetime.datetime.strptime(stamp_text, stamp_format)


def process_files():
    ''' Get class info from file names. Print this info and print a list of files
        where the info could not be found.

        Every .csv file below root_dir is examined. A file is skipped, with
        the reason recorded, when its directory or name is excluded or when
        the dog name, sample position, run/pass number or timestamp cannot
        be read from its name. Nothing is returned; the results are only
        printed. '''

    print(root_dir)

    files_skipped = []
    good_files_data = []

    for file in Path(root_dir).rglob('*.csv'):
        file_name = file.name

        # Exclude certain directories and files containing certain text.
        if exclude_dir(file):
            files_skipped.append((file, 'directory excluded'))
            continue
        if exclude_file(file):
            files_skipped.append((file, 'file excluded'))
            continue

        # Get the dog's name from the file name.
        this_dog = dog_name(file_name)
        if not this_dog:
            files_skipped.append((file, 'dog name not found'))
            continue

        this_position = position(file_name)
        if not this_position:
            files_skipped.append((file, 'position of sample not found'))
            continue

        # A name whose run/pass text is not numeric must not abort the scan.
        try:
            run_no, pass_no = run_and_pass_no(file_name)
        except ValueError:
            run_no, pass_no = '', ''
        # NOTE(review): a run number of 0 is also treated as "not found" here.
        if not run_no or not pass_no:
            files_skipped.append((file, 'run or pass number not found'))
            continue

        # Likewise a name that does not start with a timestamp is skipped
        # instead of stopping the whole run.
        try:
            time_stamp = timestamp(file_name)
        except ValueError:
            files_skipped.append((file, 'timestamp not found'))
            continue

        good_files_data.append((file_name, time_stamp, this_dog, run_no, pass_no, this_position))

    # Print out what was found.
    print('done')
    good_files = pd.DataFrame(good_files_data, columns=['file_name', 'timestamp', 'dog', 'run', 'pass', 'position'])
    skipped_files = pd.DataFrame(files_skipped, columns=['file', 'reason'])
    # len() gives the number of rows; DataFrame.count() would print one
    # count per column instead of a single number.
    print('number of good_files', len(good_files))
    print('number of skipped_files', len(skipped_files))
    print('The reasons for excluding files are', skipped_files.reason.unique())

    # List the affected files for the reasons that need a closer look.
    reasons_to_list = (
        'position of sample not found',
        'run or pass number not found',
        'timestamp not found',
        'file excluded',
    )
    for reason in reasons_to_list:
        print('The files where reason is - ' + reason)
        for skipped in skipped_files['file'][skipped_files['reason'] == reason]:
            print(skipped)

    # Save to file (disabled; enable to write the two tables as pickles).
    #good_files.to_pickle('good.pkl')
    #skipped_files.to_pickle('skipped.pkl')



# Scan root_dir and print the report when this cell is run.
process_files()



In [None]:
# Read in data files and plot the data.
# NOTE(review): 'good.pkl' and 'skipped.pkl' must already exist; the
# to_pickle calls in process_files() that would write them are commented out.
good = pd.read_pickle('good.pkl')
# NOTE(review): skip is loaded but not used in the lines visible here.
skip = pd.read_pickle('skipped.pkl')
# Histogram of the 'pass' column, using 4 bins.
good.hist(column='pass', bins=4)
# Switch to figure number 0 before the next plot is drawn.
plt.figure(0)
# Bar chart of how many rows there are for each dog.
good['dog'].value_counts().plot(kind='bar')