In [None]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

try:
  from opensoundscape.torch.models.cnn import CNN
  from opensoundscape.torch.models.utils import BaseModule
  from opensoundscape.torch.architectures import cnn_architectures
except:
  !pip install opensoundscape==0.7.1

from matplotlib import pyplot as plt

In [None]:
# Input/output locations for the notebook. Replace the placeholder paths with
# real ones before running.

# Path for cattle presence metadata
cattle_pres_metadata_path = '/path/to/directory/containing/cattle_pres/metadata.csv'

# Path for cattle absence metadata
cattle_abs_metadata_path = '/path/to/directory/containing/cattle_abs/metadata.csv'

# Path for directory containing cattle presence audio clips
cattle_pres_clips = '/path/to/directory/containing/cattle_pres/clips'

# Path for directory containing cattle absence audio clips.
# This needs its own variable: assigning to cattle_pres_clips here would
# overwrite the presence directory defined just above.
cattle_abs_clips = '/path/to/directory/containing/cattle_abs/clips'

# Path for csv file to write validation results to
validation_results_csv_path = '/path/to/validation/results.csv'

Create a dataframe for the data in both the csv files with one-hot encoding for presence and absence of cattle.

In [None]:
def _load_onehot_clips(metadata_csv, clips_dir, present):
    """Build a frame of clip paths with hand-written one-hot labels.

    Parameters
    ----------
    metadata_csv : str
        CSV file with a 'filename' column.
    clips_dir : str
        Directory holding the audio clips; a trailing '/' is optional.
    present : bool
        True labels every row present=1/absent=0, False the reverse.

    Returns
    -------
    pandas.DataFrame
        Columns 'filename', 'absent', 'present' (in that order).
    """
    names = pd.read_csv(metadata_csv, usecols=['filename'])['filename']
    # Replace all '/' w/ '_' so that the records match the names of the audio files
    names = names.str.replace('/', '_', regex=False)
    # Always join with exactly one '/', whether or not clips_dir ends with one,
    # so the directory and the file name can never run together.
    prefix = clips_dir.rstrip('/') + '/'
    return pd.DataFrame({
        'filename': prefix + names,
        'absent': 0 if present else 1,
        'present': 1 if present else 0,
    })


# Clips with cattle present: present=1, absent=0
cattle_pres_df = _load_onehot_clips(
    cattle_pres_metadata_path,
    '/path/to/directory/containing/cattle_pres/clips/',
    present=True,
)

# Clips with cattle absent: present=0, absent=1
cattle_abs_df = _load_onehot_clips(
    cattle_abs_metadata_path,
    '/path/to/directory/containing/cattle_abs/clips/',
    present=False,
)

# Concatenate the two dataframes and use the full clip path as the index, which
# leaves only the one-hot label columns ('absent', 'present') as data.
cattle_onehot_df = pd.concat([cattle_pres_df, cattle_abs_df]).set_index('filename')

Alternatively, the one-hot labels can be created using `OneHotEncoder` from sklearn.

In [None]:
def _load_labelled_clips(metadata_csv, clips_dir, label):
    """Build a frame of clip paths tagged with a single string label.

    Parameters
    ----------
    metadata_csv : str
        CSV file with a 'filename' column.
    clips_dir : str
        Directory holding the audio clips; a trailing '/' is optional.
    label : str
        Value written to the 'cattle' column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'filename' and 'cattle'.
    """
    names = pd.read_csv(metadata_csv, usecols=['filename'])['filename']
    # Replace all '/' w/ '_' so that the records match the names of the audio files
    names = names.str.replace('/', '_', regex=False)
    # Always join with exactly one '/', whether or not clips_dir ends with one.
    prefix = clips_dir.rstrip('/') + '/'
    return pd.DataFrame({'filename': prefix + names, 'cattle': label})


# Create dataframes for cattle presence and absence, tagged via the 'cattle' feature
cattle_pres_df = _load_labelled_clips(
    cattle_pres_metadata_path,
    '/path/to/directory/containing/cattle_pres/clips/',
    'present',
)
cattle_abs_df = _load_labelled_clips(
    cattle_abs_metadata_path,
    '/path/to/directory/containing/cattle_abs/clips/',
    'absent',
)

# Merge the two dataframes created above; ignore_index gives a unique 0..n-1
# index, which the join with the encoded labels below relies on.
cattle_df = pd.concat([cattle_pres_df, cattle_abs_df], ignore_index=True)

# One-hot encoding
enc = OneHotEncoder()  # Instantiating OneHotEncoder

# fit_transform returns a sparse matrix with one column per category found in
# 'cattle'; toarray() makes it dense so it can be wrapped in a DataFrame.
onehot_values = enc.fit_transform(cattle_df[['cattle']]).toarray()
# Name the columns from the encoder's own category list instead of assuming
# that column 0 is 'absent' and column 1 is 'present'.
onehot_labels_df = pd.DataFrame(
    onehot_values,
    columns=enc.categories_[0],
    index=cattle_df.index,
)

# Swap the string label for the one-hot columns and index by clip path
cattle_onehot_df = (
    cattle_df
    .drop(columns='cattle')
    .join(onehot_labels_df)
    .set_index('filename')
)


Create training and holdout sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled clips for validation and train on the rest.
# random_state is fixed so the same split is produced on every run.
train_onehot_df, valid_onehot_df = train_test_split(
    cattle_onehot_df,
    test_size=0.2,
    random_state=1,
)

The following code cell only creates an empty csv file with the specified columns at the specified path; it need not be executed more than once.

In [None]:
# One-time setup of the csv file that collects the trained model's results on
# the validation set (one row per training run).
validation_results_columns = [
    '# records_cattle pres (train+valid)',
    '# records_cattle abs (train+valid)',
    'N_epochs',
    'Batch size',
    'Epoch_best model',
    'F1 score_best model',
    'Training time',
]

validation_results_file = Path(validation_results_csv_path)
if validation_results_file.exists():
    # Results from earlier runs are already stored there: load them rather than
    # overwriting the file, so re-running this cell cannot wipe the history.
    validation_results_df = pd.read_csv(validation_results_file, index_col=0)
else:
    # Create the csv file at the specified location with just the header row
    validation_results_df = pd.DataFrame(columns=validation_results_columns)
    validation_results_df.to_csv(validation_results_file)


Training the model w/ training set and writing validation results to the csv file.

In [None]:
# Use ResNet18 architecture and instantiate CNN class
# (using ImageNet weights and freezing the feature extractor)
arch = cnn_architectures.resnet18(
    num_classes=2,
    use_pretrained=True,
    freeze_feature_extractor=True,
    num_channels=3,
)
# The class names are the label columns of the training frame
b_classifier = CNN(
    architecture=arch,
    classes=train_onehot_df.columns,
    sample_duration=10.0,
    single_target=True,
)

# Load the results recorded by earlier runs so this run is added to them
validation_results_df = pd.read_csv(validation_results_csv_path, index_col=0)

epochs = 100
batch_size = 512

# to record training time
start_time = datetime.now()

# call train()
b_classifier.train(
    train_onehot_df,
    valid_onehot_df,
    save_path='./binary_train/',
    epochs=epochs,
    batch_size=batch_size,
    save_interval=1,
    num_workers=0,
)
end_time = datetime.now()

# Summary of this run; the keys match the columns of the results csv
run_summary = {
    '# records_cattle pres (train+valid)': int((cattle_onehot_df['present'] == 1).sum()),
    '# records_cattle abs (train+valid)': int((cattle_onehot_df['absent'] == 1).sum()),
    'N_epochs': epochs,
    'Batch size': batch_size,
    'Epoch_best model': b_classifier.best_epoch,
    'F1 score_best model': b_classifier.best_score,
    'Training time': end_time - start_time,
}

# Append the run as a new row and write the dataframe back to the csv file.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
validation_results_df = pd.concat(
    [validation_results_df, pd.DataFrame([run_summary])],
    ignore_index=True,
)
validation_results_df.to_csv(validation_results_csv_path)


Loss curve

In [None]:
# Scatter the recorded loss history: keys go on the x axis and values on the
# y axis (presumably epoch -> loss, going by the axis labels).
fig, ax = plt.subplots()
loss_history = b_classifier.loss_hist
ax.scatter(list(loss_history.keys()), list(loss_history.values()))
ax.set_xlabel('epoch')
ax.set_ylabel('loss')