# Import data methods:
1. Read a zero-crossing file and extract the time, frequency, amplitude and metadata information (preferred).

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from util.bat import *
import random
import os
import pandas as pd

# Set some Pandas display options.
# Fully-qualified option names are used on purpose: short names such as
# 'max_columns' are resolved by pattern matching and raise OptionError as soon
# as more than one registered option matches them.
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 20)

In [2]:
# Call extract_anabat to extract time, frequency, amplitude and partial metadata from two zero-crossing files.
# NOTE(review): the recorded run of this cell failed with
# "TypeError: a bytes-like object is required, not 'str'" — presumably a bytes/str
# mix-up inside extract_anabat under Python 3; confirm in the util module.
info=extract_anabat('../data/P7132033.37#')  # other sample files: P7132035.14#  P7132033.37#  ../data/social/S8072135.07#
info2=extract_anabat('../data/social/S8072135.07#') 

TypeError: a bytes-like object is required, not 'str'

In [None]:
# Unpack the first four fields of the extract_anabat result, in positional order.
t, freq, ampl, metadata = info[0], info[1], info[2], info[3]

In [None]:
# Echo the metadata entry (element 3 of the extract_anabat result).
metadata

In [None]:
# Difference between the last and the first entry of t, i.e. the span covered by the time axis.
t[-1]-t[0]

In [None]:
# Visualization: scatter of freq against t in a wide (12x3) figure.
plt.figure(figsize=(12,3))
plt.scatter(t, freq, s=2)
# Tick every 0.05 along x; np.arange excludes the stop value 1.6.
plt.xticks(np.arange(1, 1.6, 0.05))
# The limits are set after the ticks, so the displayed x-range is 1 to 1.5.
plt.xlim(1, 1.5)
plt.title("scatter")
plt.grid()

2. Read time and frequency from .csv files that were decoded using R.

In [None]:
# Read time and frequency data from a sample csv file.
# The csv was produced in R from a bat acoustic recording (zero-crossing file)
# encoded in Anabat file format 132 (see AnabatTools.R in the R code for details).
import csv

# csv.reader needs a text-mode file under Python 3 (binary mode yields bytes and
# fails); newline='' lets the csv module handle line endings itself, and the
# with-block closes the file as soon as it has been read.
with open('../data/P7132033_37.csv', newline='') as csv_file:
    call_list = list(csv.reader(csv_file))[1:]  # [1:] skips the first row (presumably the header)
# print(call_list)

# Remove noise and extract valid pulse signals 

1. Call the remove_noise function from util.anabat to remove the noise from the original zc file.
2. Get the returned valid pulses.
3. Use the display_pulses method from util.anabat to plot a few random samples of the valid pulses.

In [None]:
# Remove noise from the (t, freq) series and keep the returned valid pulses.
# remove_noise is presumably provided by the `from util.bat import *` star import — confirm.
pulses=remove_noise(t,freq)

In [None]:
# Number of pulses returned by remove_noise for this zc file
num = len(pulses)
num 

In [None]:
# Plot sample pulses on a 4x4 grid (nrows=4, ncols=4) in a 10x8 figure.
# Presumably display_pulses picks the samples at random by default — confirm.
display_pulses(pulses, nrows=4, ncols=4, figsize=(10,8))

# Search a folder for data files with abnormal labels

Call the get_labeled_file utility function in the anabat module.

In [None]:
# Example: find the bat acoustic files that contain a social call (i.e. have a "social" label in their metadata).
# NOTE(review): datadir is a hard-coded absolute Windows path — adjust it before running on another machine.
datadir='D:\\dropbox_backup\\acoustic social auto id\\bat recordings\\transects\\2018\\cell-75\\20180715\\'
label='social'
get_labeled_file(datadir,label)

# Descriptive Analysis

15kHz-120kHz

5% of social calls.

In [None]:
# Load the four tab-separated per-year tables (2015-2018), one DataFrame per year.
data2015, data2016, data2017, data2018 = [
    pd.read_csv(night_file, sep="\t")
    for night_file in (
        '../data/2015_night.txt',
        '../data/2016_night.txt',
        '../data/2017_night.txt',
        '../data/2018_night.txt',
    )
]

In [None]:
# Peek at rows 10 through 29 of the 2018 table.
data2018[10:30]

In [None]:
# Distinct values of the 'Label' column. unique has to be called: without the
# parentheses the cell only echoes the bound method, not the values.
data2015['Label'].unique()

In [None]:
def count_recordings(df, label=None):
    """
    Sum the 'Number' column of a recordings table, optionally for one label only.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a text 'Label' column and a numeric 'Number' column.
    label : str, optional
        Text to look for inside 'Label' (case-sensitive, matched literally),
        e.g. 'social', 'foraging' or 'Feeding'. With the default None no
        filter is applied and every row is counted, so the result is the
        total recording count.

    Returns
    -------
    The sum of 'Number' over the selected rows (0 when no row matches).
    """
    if label is None:
        # Total over the whole table, labelled rows included, so that
        # "total - social - foraging - Feeding" is the unlabelled remainder.
        return df['Number'].sum()

    # na=False treats rows with a missing label as non-matching instead of
    # propagating NaN into the boolean mask.
    has_label = df['Label'].str.contains(label, regex=False, na=False)
    return df.loc[has_label, 'Number'].sum()
    

In [None]:
# Recording count for 2015 with no label filter (label=None).
count_recordings(data2015)

In [None]:
# Recording count for the 2015 rows whose label contains 'social'.
count_recordings(data2015,'social')

In [None]:
# Per-year counts with no label filter, then their sum as the total sample size.
subt2015, subt2016, subt2017, subt2018 = (
    count_recordings(yearly) for yearly in (data2015, data2016, data2017, data2018)
)
totalss = sum((subt2015, subt2016, subt2017, subt2018))
totalss

In [None]:
# 'social'-labelled recordings summed over the four yearly tables.
totalsocial = sum(
    count_recordings(yearly, 'social')
    for yearly in (data2015, data2016, data2017, data2018)
)
totalsocial

In [None]:
# 'foraging'-labelled recordings summed over the four yearly tables.
totalforaging = sum(
    count_recordings(yearly, 'foraging')
    for yearly in (data2015, data2016, data2017, data2018)
)
totalforaging

In [None]:
# 'Feeding'-labelled recordings summed over the four yearly tables.
totalfeeding = sum(
    count_recordings(yearly, 'Feeding')
    for yearly in (data2015, data2016, data2017, data2018)
)
totalfeeding

In [None]:
# Data to plot: one slice per label plus a 'normal' slice for the remainder.
labels = 'social', 'foraging', 'Feeding', 'normal'
# NOTE(review): the 'normal' slice is only the true remainder if totalss counts
# every recording, labelled ones included — confirm against count_recordings(label=None).
sizes = [totalsocial, totalforaging, totalfeeding, totalss-totalsocial-totalforaging-totalfeeding]
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
explode = (0.1, 0, 0, 0)  # explode 1st slice
 
# Plot the pie with one-decimal percentage captions on the slices.
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140,rotatelabels=True,labeldistance=1.3)
 
# Equal axis scaling keeps the pie circular.
plt.axis('equal')
plt.show()
# Shares recorded from an earlier run: social:0.3%  foraging:1.5%  Feeding:0.1%  normal:98.2% 

# Initial result for pulse clustering

In [None]:
# Recompute the valid pulses from (t, freq) as the input of the clustering steps below.
pulses=remove_noise(t,freq)

In [None]:
# Only the last expression of a cell is echoed, so show the container type and
# the pulse count together as one tuple instead of discarding the first value.
type(pulses), len(pulses)

In [None]:
# Find all the dy for a list of valid pulses.
# dy is the difference between the dot[1] values of consecutive dots, so
# pulse_dy[k] holds len(pulses[k]) - 1 differences; a pulse with fewer than two
# dots yields an empty list.
pulse_dy = []
for pulse in pulses:
    ys = [dot[1] for dot in pulse]
    # Pair every value with its successor and take "next minus current".
    pulse_dy.append([nxt - cur for cur, nxt in zip(ys, ys[1:])])
    

In [None]:
# One dy list per pulse, so this should equal len(pulses).
len(pulse_dy)

In [None]:
# Summary statistics (pandas describe) of the dy values of the first pulse.
a=pd.DataFrame(pulse_dy[0]).describe()
a

In [None]:
# get mean, sd, and five-number summary features for pulses
def get_features(pulse_dy):
    """
    Build one feature row per pulse from its dy values.

    Parameters
    ----------
    pulse_dy : list of lists of numbers
        For each pulse, the differences between consecutive values.

    Returns
    -------
    pandas.DataFrame
        One row per entry of pulse_dy, in the same order, with rows 1 to 7 of
        pandas' describe() output as columns (the leading 'count' row is
        dropped). An empty DataFrame is returned for empty input.
    """
    # Collect the rows first and concatenate once: DataFrame.append no longer
    # exists in pandas 2.x and copied the whole frame on every call.
    rows = [pd.DataFrame(dy).describe().iloc[1:8,].T for dy in pulse_dy]
    if not rows:
        # pd.concat refuses an empty list; keep the empty-frame result.
        return pd.DataFrame()
    return pd.concat(rows)
    
    

In [None]:
# Feature table for the pulses: one row per entry of pulse_dy.
b=get_features(pulse_dy)

In [None]:
# Inspect feature rows 8 through 10 (positional).
b.iloc[8:11,]

In [None]:
# Cluster the pulses of a sample recording with KMeans
from sklearn.cluster import KMeans
est = KMeans(2)  # 2 clusters
# X=b.iloc[:,0:2]  # alternative: cluster on the first two feature columns only
X=b
est.fit(X)
y_kmeans = est.predict(X)  # cluster index assigned to each row of X
plt.figure(figsize=(10,8))
# Scatter the first two feature columns, coloured by the assigned cluster.
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_kmeans, s=50, cmap='rainbow');

In [None]:
# Scratch code (disabled): select the pulses that were assigned to cluster 1.
# mask=y_kmeans==1

# from itertools import compress
# c=list(compress(pulses, mask))
# len(c)

# Echo the cluster index of every pulse.
y_kmeans

In [None]:
# Plot the pulses with random sampling switched off (rand_flag=False), passing the
# KMeans assignments as cluster. The positional len(pulses) and 5 are presumably
# nrows and ncols — confirm against display_pulses.
display_pulses(pulses,len(pulses),5,rand_flag=False, cluster=y_kmeans)