# Multi-file .csv reader: a starting point for running tasks over many files
First, the zero-crossing data were preprocessed using `readAnabat()` from AnabatTools.R in R. The preprocessing returns .csv files that contain frequency (kHz) and time (microseconds).

In [1]:
import os
import glob
import pandas as pd

In [5]:
# Loop over the .csv files and run a task on each of them.
# Example task: report the number of rows in every file.

for csv_path in glob.glob('../data/Multi_data_Trialsize/*.csv'):
    n_rows = pd.read_csv(csv_path).shape[0]
    # print(...) with a single argument prints the same text on Python 2 and 3
    print(n_rows)

6208
12692
6058


In [6]:
# Collect the path of every .csv file found in the data folder
nfile = list(glob.glob('../data/Multi_data_Trialsize/*.csv'))

nfile

['../data/Multi_data_Trialsize\\P7132033_37.csv',
 '../data/Multi_data_Trialsize\\P7132035_14.csv',
 '../data/Multi_data_Trialsize\\P7132037_05.csv']

In [7]:
# Example: preview the first rows of the first file listed in nfile
# (print(...) with a single argument behaves the same on Python 2 and 3)
print(pd.read_csv(nfile[0]).head())

      Filename  Time  Frequency
0  P7132033_37    10        NaN
1  P7132033_37    99        NaN
2  P7132033_37   183    46242.0
3  P7132033_37   270    46783.0
4  P7132033_37   359    45454.0


# Processing zero-crossing files with bat.py
`bat.py` is an in-house module that extracts zero-crossing data (modified from the ZCANT `extract_anabat()` function) and removes noise.

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import pandas as pd
import glob
import errno
from util.bat import *

# Set some Pandas options
# Fully-qualified names are used because the short forms 'max_columns' and
# 'max_rows' match more than one option in newer pandas releases (which raises
# an OptionError); the 'display.*' names work in old and new versions alike.
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 20)

In [20]:

def get_batcall_csv(datadir,outdir):
    """
        Given a folder directory extract zero crossing data and create .csv files

        Every file in `datadir` whose name ends in '#' is passed to
        extract_anabat() and written to its own .csv file with the columns
        Filename, Time and Frequency.

        Parameters
        ----------
        datadir : str
            Folder that holds the zero-crossing files.
        outdir : str
            Folder that receives one '<input file name>.csv' per input file.
            It is not created here, so it has to exist already.

        Returns
        -------
        None
            The result is the set of .csv files written to `outdir`. Nothing
            is written when `datadir` holds no matching file.
    """
    columns=["Filename","Time", "Frequency"]

    # Glob once and sort the result, so files are processed in a reproducible order
    for filepath in sorted(glob.glob(datadir + '/*#')):
        # Assumes extract_anabat() yields (time, frequency, amplitude, metadata),
        # which is how the result has always been indexed here -- TODO confirm.
        # Only time and frequency are exported.
        signal=list(extract_anabat(filepath))
        t=signal[0]
        freq=signal[1]

        # Use the real file name instead of the last 12 characters of the path,
        # so names of any length give a correct label and output path
        name=os.path.basename(filepath)

        batcall_df=pd.DataFrame({'Filename':np.repeat(name,len(t)),'Time':t,'Frequency':freq}, columns=columns)
        batcall_df.to_csv(os.path.join(outdir, name + ".csv"), index=False)
    

In [8]:

def get_batcall2_csv(datadir,outdir):
    """
        Given a folder directory, extract zero crossing data, and create ONE .csv file with noise removal incorporated

        Every file in `datadir` whose name ends in '#' is read with
        extract_anabat(), cleaned with remove_noise2(), and the surviving
        points of all files are written to a single .csv file with the columns
        Filename, Time, Frequency and Pulse. Pulse numbers start at 1 and
        restart for every input file.

        Parameters
        ----------
        datadir : str
            Folder that holds the zero-crossing files. The output file name is
            the part of this path from the first 'cell' onwards with path
            separators turned into '_' (e.g. '.../cell-0/20150717' gives
            'cell-0_20150717.csv'). When the path contains no 'cell', the name
            of the last folder is used instead.
        outdir : str
            Folder that receives the .csv file. It is not created here, so it
            has to exist already.

        Returns
        -------
        None
            The result is the .csv file written to `outdir`. When no file or
            no pulse is found, a .csv with only the header row is written.
    """
    columns=["Filename","Time","Frequency","Pulse"]
    file_col=[]
    time_col=[]
    freq_col=[]
    pulse_col=[]

    # Glob once and sort the result, so rows are written in a reproducible order
    for filepath in sorted(glob.glob(datadir + '/*#')):
        # Assumes extract_anabat() yields time first and frequency second,
        # which is how the result has always been indexed here -- TODO confirm
        signal=list(extract_anabat(filepath))
        # Real file name rather than the last 12 characters of the path
        name=os.path.basename(filepath)

        # Assumes remove_noise2() returns a sequence of pulses, each of them a
        # sequence of (time, frequency) points -- TODO confirm against util.bat
        pulses=remove_noise2(signal[0],signal[1])

        for pulse_no, points in enumerate(pulses, 1):
            n_before=len(time_col)
            for point in points:
                time_col.append(point[0])
                freq_col.append(point[1])
            # Label exactly the rows this pulse contributed
            n_points=len(time_col)-n_before
            file_col.extend([name]*n_points)
            pulse_col.extend([pulse_no]*n_points)

    batcall_df=pd.DataFrame({'Filename':file_col,'Time':time_col,'Frequency':freq_col,'Pulse':pulse_col}, columns=columns)

    ix=datadir.find('cell')
    if ix >= 0:
        cn=datadir[ix:]
    else:
        # find() returned -1: slicing with it would keep only the last character
        cn=os.path.basename(os.path.normpath(datadir))
    # Flatten both separator styles so the name never contains a sub-path
    name=cn.replace('/','_').replace('\\','_')
    batcall_df.to_csv(os.path.join(outdir, name + ".csv"), index=False)

In [6]:
# Show the current working directory; relative paths such as './outfun2' are resolved against it
os.getcwd()   

'C:\\Users\\bty\\OneDrive - UNCG\\Fall2018\\CSC505\\Bat_Echolocation\\src'

In [6]:
# Make sure the output folder exists in the current directory

path='./outfun2'
try:
    os.mkdir(path)
except OSError as err:
    # An already existing path is fine; every other failure is re-raised
    already_there = (err.errno == errno.EEXIST)
    if not already_there:
        raise

In [9]:
# Run the single-CSV extraction on one transect folder.
# NOTE(review): datadir is an absolute, machine-specific path and must be edited to run elsewhere.
# NOTE(review): outdir is set to datadir, so the .csv is written next to the input files
# and the './outfun2' folder created earlier is not used by this call -- confirm this is intended.
datadir='C:/Users/bty/OneDrive - UNCG/Fall2018/CSC505/Project/transects/transects/2015/cell-0/20150717'
outdir=datadir
get_batcall2_csv(datadir,outdir)

## Summary DataFrame for processed zero-crossing data

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

# Set some Pandas options
# Fully-qualified names are used because the short forms 'max_columns' and
# 'max_rows' match more than one option in newer pandas releases (which raises
# an OptionError); the 'display.*' names work in old and new versions alike.
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 20)

In [4]:

def get_summary(filepath):
    """
        Given .csv of processed zero-crossing files, create summary dataframe; return summary_df in console only

        The input needs the columns Filename, Time, Frequency and Pulse. The
        result has one row per (Filename, Pulse) pair, ordered by Filename and
        then Pulse, and is only returned -- nothing is written to disk.

        Parameters
        ----------
        filepath : str
            Path of the comma-separated file to summarise.

        Returns
        -------
        pandas.DataFrame
            Columns:
            Filename, Pulse -- the pair the row describes
            dur     -- latest minus earliest Time of the pulse; the value is
                       negated when fstart is greater than fend
            fstart  -- Frequency at the earliest Time of the pulse
            fend    -- Frequency at the latest Time of the pulse
            fmedian -- median Frequency of the pulse
            mad     -- median absolute deviation of Frequency from fmedian
            An input without data rows gives an empty frame with these columns.
    """
    summary_columns=["Filename","Pulse","dur","fstart","fend","fmedian","mad"]

    batcall_df=pd.read_csv(filepath,sep=',')
    if batcall_df.empty:
        return pd.DataFrame(columns=summary_columns)

    keys=['Filename','Pulse']
    by_pulse=batcall_df.groupby(keys)

    fmed=by_pulse['Frequency'].median()
    # transform() hands every row the median of its own pulse, so the result
    # is correct even when the rows are not sorted by Filename and Pulse
    row_fmed=by_pulse['Frequency'].transform('median')
    abs_dev=(batcall_df['Frequency']-row_fmed).abs()
    mad=abs_dev.groupby([batcall_df[k] for k in keys]).median()

    tmin=by_pulse['Time'].min().astype(float)
    tmax=by_pulse['Time'].max().astype(float)

    # idxmin()/idxmax() return row labels; look Frequency up by label and by
    # column name instead of relying on the column position
    fstart=batcall_df.loc[by_pulse['Time'].idxmin().values,'Frequency'].values
    fend=batcall_df.loc[by_pulse['Time'].idxmax().values,'Frequency'].values

    # The sign of dur records the sweep direction: negative when the pulse
    # starts at a higher frequency than it ends
    span=(tmax-tmin).values
    dur=np.where(fstart>fend,-span,span)

    # Every per-pulse series above shares the sorted (Filename, Pulse) index,
    # so the key columns are taken from that index to stay aligned
    summary_df= pd.DataFrame({'Filename':fmed.index.get_level_values('Filename').values,
                           'Pulse':fmed.index.get_level_values('Pulse').values,
                           'dur':dur, 'fstart':fstart, 'fend':fend,
                           'fmedian':fmed.values, 'mad':mad.values},
                           columns=summary_columns)
    return summary_df

In [11]:
# Summarise the processed CSV: get_summary() returns one row per (Filename, Pulse) pair
filepath='../data/cell-0_20150717.csv'
get_summary(filepath)

Unnamed: 0,Filename,Pulse,dur,fstart,fend,fmedian,mad
0,P7172114.09#,1,-0.005676,37037.037037,35714.285714,35955.237749,482.703138
1,P7172114.09#,2,-0.011092,40000.000000,34782.608696,35714.285714,626.566416
2,P7172114.09#,3,-0.008363,39024.390244,34334.763948,35555.555556,808.080808
3,P7172114.09#,4,-0.007722,44444.444444,35714.285714,37914.691943,1551.055579
4,P7172114.09#,5,-0.010863,42328.042328,34482.758621,35555.555556,922.871897
5,P7172114.09#,6,-0.008642,45454.545455,34934.497817,36447.039199,891.483644
6,P7172114.09#,7,-0.011618,38277.511962,33333.333333,34334.763949,625.068998
7,P7172114.09#,8,-0.113062,41025.641026,34188.034188,35049.413928,664.871786
8,P7172114.09#,9,-0.011548,39215.686274,33613.445378,34520.077624,906.632245
9,P7172114.09#,10,-0.011764,37383.177570,33898.305085,34782.608696,594.574508
