# Data processing
## Get class information from file names

In [None]:
import datetime
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

In [None]:
# Root directory that is searched (recursively) for the raw .csv files.
root_dir = 'data'

# Every spelling of a dog's name that may occur in a file name (including
# known mis-spellings), mapped to the canonical name. Entries are tried in
# the order they are listed here.
dog_names = {
    'Rex': 'Rex',
    'rix': 'Rex',
}

# Codes that may occur in a file name, mapped to the position of the positive
# scent-sample they stand for. Entries are tried in the order T1, T2, T3, B.
positions = {'_' + code: code for code in ('T1', 'T2', 'T3', 'B')}

# Directories to exclude: directory name -> reason for the exclusion.
exclude_dirs = {'insert_dir_name': 'Insert reason for exclusion here'}

# Files are excluded when their name contains any of these texts:
# text -> reason for the exclusion.
exclude_file_text = {'insert_text': 'Insert reason for exclusion here'}


def dog_name(file_name):
    ''' Look up which dog a file belongs to.

    The spellings in dog_names are tried in order; the canonical name of the
    first spelling that occurs anywhere in file_name is returned. An empty
    string is returned when no spelling occurs. '''
    for spelling, canonical in dog_names.items():
        if spelling in file_name:
            return canonical
    return ''


def position(file_name):
    ''' Look up the position of the positive sample encoded in a file name.

    The codes in positions are tried in order; the position belonging to the
    first code that occurs anywhere in file_name is returned. An empty string
    is returned when no code occurs. '''
    matching = (label for code, label in positions.items() if code in file_name)
    return next(matching, '')


def exclude_dir(file):
    ''' Return True if this file should be excluded based on its directory.

    file is a pathlib path. It is excluded when any of its ancestor
    directories, at any depth, matches '*/<name>' for a name listed in
    exclude_dirs, i.e. the excluded directory must itself have a parent
    component in the path. Returns False otherwise. '''
    for d in exclude_dirs:
        pattern = '*/' + d
        # Test every ancestor rather than spelling out fixed-depth glob
        # patterns: Path.match() does not treat '**' as recursive, so
        # '*/d/**/*' only covered files exactly one directory below d.
        if any(parent.match(pattern) for parent in file.parents):
            return True
    return False


def exclude_file(file):
    ''' Return True if the name of this file (a pathlib path) contains any of
    the texts listed in exclude_file_text, otherwise False. '''
    name = file.name
    return any(text in name for text in exclude_file_text)


def last_three_underscores(file_name):
    ''' Return the indices of the last three underscores in the file name.

    The result is ordered (third from last, second from last, last). Each
    index comes from str.rfind(), searching only the text before the
    previously found underscore, so an index is -1 when rfind() finds
    nothing. Note that a -1 is still passed on as the (negative) end index of
    the next search. '''
    final = file_name.rfind('_')
    middle = file_name.rfind('_', 0, final)
    first = file_name.rfind('_', 0, middle)
    return first, middle, final


def run_and_pass_no(file_name):
    ''' Return (run number, pass number) taken from the file name.

    The pass number is the text between the last two underscores and the run
    number is the text between the two underscores before that. If the pass
    field is not made of digits, everything from the last underscore onwards
    is dropped and the fields are looked up once more (this copes with a
    trailing underscore or extra trailing field).

    Both numbers are returned as ints. ('', '') is returned, after printing a
    diagnostic, when either field is not a number, when the pass number is
    outside 1..4, or when the run number is below 1. '''
    last_m2, last_m1, last = last_three_underscores(file_name)
    pass_no = file_name[last_m1+1:last]
    if not pass_no.isdigit():
        print(file_name, 'Pass number not found. Will try to handle any trailing underscore.')
        # Retry without the last field. Only possible when an underscore was
        # found at all (last is -1 otherwise, also for an empty name).
        if last >= 0:
            file_name = file_name[:last]
            last_m2, last_m1, last = last_three_underscores(file_name)
            pass_no = file_name[last_m1+1:last]
    run_no = file_name[last_m2+1:last_m1]

    # Report unparsable fields as "not found" instead of letting int() raise,
    # so that the caller can record the file as skipped and carry on.
    try:
        pass_no = int(pass_no)
        run_no = int(run_no)
    except ValueError:
        print(file_name, 'Run or pass number is not a number.')
        return '', ''

    if pass_no > 4 or pass_no < 1 or run_no < 1:
        print('############ Pass no:', pass_no, 'Run no:', run_no, 'for file', file_name, '######################################')
        return '', ''
    return run_no, pass_no


def timestamp(file_name):
    ''' Parse the timestamp at the start of a file name.

    The first 16 characters must have the form YYYY_MM_DD-hh_mm; they are
    returned as a datetime.datetime. strptime() raises ValueError when they
    do not have that form. '''
    stamp_text = file_name[:16]
    layout = '%Y_%m_%d-%H_%M'
    return datetime.datetime.strptime(stamp_text, layout)


def _print_skipped(skipped_files, reason):
    ''' Print the path of every file that was skipped for the given reason. '''
    print('The files where reason is -', reason)
    for path in skipped_files.loc[skipped_files['reason'] == reason, 'file']:
        print(path)


def class_info():
    ''' Get class info from file names. Print this info and print a list of files
        where the info could not be found.

        Every .csv file below root_dir is examined. A file is "good" when it
        is not excluded and its name yields a dog name, a sample position, a
        run and pass number and a timestamp; otherwise it is recorded as
        skipped together with the reason. '''

    print(root_dir)

    files_skipped = []
    good_files_data = []

    # Sorted so that the order of the output does not depend on the order in
    # which the file system happens to list the files.
    for file in sorted(Path(root_dir).rglob('*.csv')):
        file_name = file.name

        # Exclude certain directories and files containing certain text.
        if exclude_dir(file):
            files_skipped.append((file, 'directory excluded'))
            continue
        if exclude_file(file):
            files_skipped.append((file, 'file excluded'))
            continue

        # Get the dog's name from the file name.
        this_dog = dog_name(file_name)
        if not this_dog:
            files_skipped.append((file, 'dog name not found'))
            continue

        this_position = position(file_name)
        if not this_position:
            files_skipped.append((file, 'position of sample not found'))
            continue

        run_no, pass_no = run_and_pass_no(file_name)
        if not run_no or not pass_no:
            files_skipped.append((file, 'run or pass number not found'))
            continue

        # A malformed timestamp must not abort the whole scan.
        try:
            time_stamp = timestamp(file_name)
        except ValueError:
            files_skipped.append((file, 'timestamp not found'))
            continue

        # The full path is kept in 'file' as well: other cells of this
        # notebook look up good.at[i, 'file'] to open the data.
        good_files_data.append((file_name, time_stamp, this_dog, run_no, pass_no, this_position, file))

    # Print out what was found.
    print('done')
    good_files = pd.DataFrame(
        good_files_data,
        columns=['file_name', 'timestamp', 'dog', 'run', 'pass', 'position', 'file'])
    skipped_files = pd.DataFrame(files_skipped, columns=['file', 'reason'])
    # len() gives the number of files; DataFrame.count() would print a
    # per-column count instead.
    print('number of good_files', len(good_files))
    print('number of skipped_files', len(skipped_files))
    print('The reasons for excluding files are', skipped_files['reason'].unique())

    for reason in ('position of sample not found',
                   'run or pass number not found',
                   'timestamp not found',
                   'file excluded'):
        _print_skipped(skipped_files, reason)

    # Save to file
    #good_files.to_pickle('good.pkl')
    #skipped_files.to_pickle('skipped.pkl')



# Scan root_dir and print which files are usable and which were skipped.
class_info()



# Plot meta data

In [None]:
# Read in data files and plot the data.
# NOTE: pickle files must come from a trusted source; unpickling can execute
# arbitrary code.
good = pd.read_pickle('good.pkl')
skip = pd.read_pickle('skipped.pkl')

# Figures 1-4: one bar chart per column showing how many good files there are
# for each value of that column.
count_plots = [
    ('pass', 'Pass number'),
    ('dog', 'Dog name'),
    ('position', 'Position of positive sample (or B for no positive sample)'),
    ('run', 'Run number'),
]
for figure_no, (column, title) in enumerate(count_plots, start=1):
    plt.figure(figure_no)
    good[column].value_counts().plot(kind='bar')
    plt.suptitle(title)

# Figure 5: pass number split by dog. The counts have to be grouped by dog
# before plotting; a by='dog' argument to the bar plot of a value_counts()
# Series cannot do that because the Series no longer has a 'dog' column.
plt.figure(5)
pd.crosstab(good['pass'], good['dog']).plot(kind='bar', ax=plt.gca())
plt.suptitle('Pass number')


In [None]:
# Histogram of the pass number for each dog, side by side, with the same
# y-range on the first three panels.
axes = good.hist(column='pass', by='dog', layout=(1,3))
for panel in range(3):
    axes[panel].set_ylim(0, 200)

In [None]:
# Histogram of the run number for each dog, side by side, with the same
# y-range on the first three panels.
axes = good.hist(column='run', by='dog', layout=(1,3))
upper = 50
for panel in range(3):
    axes[panel].set_ylim(0, upper)

In [None]:
# Histogram of the pass number for each sample position, side by side, with
# the same x- and y-range on the first four panels.
axes = good.hist(column='pass', by='position', layout=(1,4))
panels = range(4)
upper = 140
for panel in panels:
    axes[panel].set_ylim(0, upper)
upper = 4
for panel in panels:
    axes[panel].set_xlim(1, upper)

# Read pressure sensor data

In [None]:
# NOTE: this rebinds root_dir, which class_info() also reads. Re-run the
# configuration cell before calling class_info() again.
root_dir = '18.11.06'
file = Path(root_dir) / 'file_name.csv'

# The file is read without a header row and then transposed, so that every
# row of the file becomes one column of df.
df = pd.read_csv(file, header=None)
df = df.T
print(df.shape[0])

# One subplot per column. No plt.figure() beforehand: with subplots=True,
# DataFrame.plot creates its own figure, so a figure opened here would stay
# empty.
df.plot(subplots=True, ylim=(0,3), yticks=(0,1,2,3), legend=False, color='steelblue')



In [None]:
# Load both tables here so that the cell does not depend on an earlier cell
# having been run.
good = pd.read_pickle('good.pkl')
skip = pd.read_pickle('skipped.pkl')
# Print explicitly: a notebook cell only displays the value of its last
# expression, so a bare good.head() in the middle of the cell shows nothing.
print(good.head())
print(skip.head())


In [None]:
# Print the 'file' entry of every row of the good-files table.
# Assumes good.pkl has a 'file' column holding each file's path - TODO confirm.
# Iterating the column directly works for any index, not only the default
# 0..n-1 one that looking up good.at[i, 'file'] by position relies on.
for file in good['file']:
    print(file)

# Data summary
Number of good files: 687

Number of skipped files: 725


Length of time series in each file - statistics:

|name |value
|:---|:--- 
|count |     687.000000 
|mean  |    5694.222707
|std   |    4624.818179
|min   |    1600.000000
|5%    |    2613.000000
|25%   |    3413.000000
|50%   |    4373.000000
|75%   |    6400.000000
|95%   |    11829.400000
|max   |   55786.000000



# Read in data from raw files

In [None]:
def import_data(do_save=False, output_file='dataset.csv', max_cols=11000,
                good_file='good.pkl'):
    ''' Build one dataset from all good files.

    The list of good files is read from good_file. Each file must contain
    three comma-separated rows (the three pressure sensor data samples) of
    more than 100 values. Every row is truncated, or padded with zeros, to
    max_cols values, and class information (see class_vector) is put in
    front of it as the first column. The rows of all files are stacked into
    one array of shape (3 * number of files, max_cols + 1).

    do_save     -- when True, write the array to output_file as csv.
    output_file -- name of the csv file to write.
    max_cols    -- number of data values kept per row.
    good_file   -- pickled table of good files; its 'file' and 'position'
                   columns are used.

    Raises ValueError when a file does not have the expected shape. '''
    # NOTE: only unpickle files from a trusted source; unpickling can execute
    # arbitrary code.
    good = pd.read_pickle(good_file)

    n = good.shape[0]
    class_cols = 1 # How many columns will be added to hold class data.
    print('number of good_files', n)
    # Zero-initialised, so rows shorter than max_cols end up padded with zeros.
    data = np.zeros((n*3, max_cols+class_cols))
    for i, (file, sample_position) in enumerate(zip(good['file'], good['position'])):
        # ndmin=2 keeps the array two-dimensional even for a one-row file, so
        # that the shape checks below report the problem.
        d_i = np.loadtxt(file, delimiter=',', ndmin=2)
        if d_i.shape[0] != 3:
            raise ValueError('%s: expected 3 rows of data, found %d' % (file, d_i.shape[0]))
        if d_i.shape[1] <= 100:
            raise ValueError('%s: expected more than 100 values per row, found %d' % (file, d_i.shape[1]))
        print(i)
        # The three rows of the dataset that belong to this file.
        rows = data[i*3:i*3+3]
        # Pop class data into the first column.
        rows[:, :class_cols] = class_vector(sample_position)
        # Copy at most max_cols values; anything beyond that is truncated.
        cols = min(d_i.shape[1], max_cols)
        rows[:, class_cols:class_cols+cols] = d_i[:, :cols]

    print(data.shape)
    if do_save:
        print('Saving data to', output_file)
        np.savetxt(output_file, data, delimiter=',')

def class_vector(position):
    ''' Return the class column for the three data rows of one file.

    The result is a 3x1 integer array. For 'T1', 'T2' and 'T3' the first,
    second and third entry respectively is 1 and the others are 0; for 'B'
    all entries are 0. Any other position raises KeyError. '''
    columns = {
        'T1': [[1], [0], [0]],
        'T2': [[0], [1], [0]],
        'T3': [[0], [0], [1]],
        'B': [[0], [0], [0]],
    }
    return np.array(columns[position])

# Build the dataset from the files listed in good.pkl.
import_data()

# Analyse dataset

In [None]:
def plot_dataset():
    ''' Print summary statistics of dataset.csv and plot some of its content.

    Column 0 of the file holds the class (0 or 1); the remaining columns hold
    the data values. Printed are the shape of the table and the minimum,
    maximum and mean of the data values. Figure 1 shows how many rows there
    are per class, figure 2 shows the data values of three example rows. '''
    df = pd.read_csv('dataset.csv', header=None)
    print(df.shape)
    # Select all rows and every column except the class column. (A plain
    # [1:] on the frame would drop the first row instead of the first column.)
    values = df.iloc[:, 1:].values
    print('min:', np.min(values))
    print('max:', np.max(values))
    print('mean:', np.mean(values))
    plt.figure(1)
    df[0].value_counts().plot(kind='bar')
    plt.suptitle('0: Negative scent sample, 1: Positive scent sample')
    plt.show()
    plt.figure(2)
    i = 40
    # Three example rows, 100 rows apart, without their class column.
    for row in (i, i+100, i+200):
        df.iloc[row, 1:].plot()
    plt.suptitle('Example data')
    plt.legend(loc='upper right')
    plt.show()
    

# Summarise and plot the contents of dataset.csv.
plot_dataset()

In [None]:
# Write a synthetic data file: three rows of 12000 zeros, each with one block
# of ones at a different place. The file name follows the pattern that the
# file-name parsing functions above expect (timestamp, dog, run, pass,
# position).
output = 'test_data/raw_data/2017_11_06-11_42-Rex_1_1_T1.csv'
d1 = np.zeros((3,12000))
d1[0, 700:1700] = 1
d1[1, 2000:2500] = 1
d1[2, 4000:4500] = 1
# np.savetxt does not create missing directories.
Path(output).parent.mkdir(parents=True, exist_ok=True)
print('Saving data to', output)
np.savetxt(output, d1, delimiter=',')