<h1><center>BirdCLEF-Birdcall Identification</center></h1>

# 1. Introduction

### Libraries 📚⬇

In [None]:
import os
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.image as mpimg
from matplotlib.offsetbox import AnnotationBbox, OffsetImage

# Map 1 library
import plotly.express as px

# Map 2 libraries
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon

# Librosa Libraries
import librosa
import librosa.display
import IPython.display as ipd

import sklearn

import warnings
warnings.filterwarnings('ignore')

# 2. The .csv files 📁

> 📌**Note**:
* `train_metadata.csv` contains information about the audio files available in `train_short_audio`. It contains 62,874 datapoints in 14 unique columns.
* `test.csv` contains only 3 observations (the rest are available in the *hidden test set*).

In [None]:
# Read the training metadata and the soundscape labels from the competition input folder
train_csv, train_labels = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/birdclef-2021/train_metadata.csv",
                     "../input/birdclef-2021/train_soundscape_labels.csv")
)

# Number of distinct values in the `common_name` column
n_species = len(train_csv['common_name'].unique())
print("There are {:,} unique bird species in the dataset.".format(n_species))

In [None]:
# Dimensions of the training metadata as a (rows, columns) tuple
train_csv.shape

### TEST.csv - let's take a look here as well before going further

> 📌**Note**:
* only 3 rows available (rest are in the hidden set)
* `site`: there are 3 sites in total, with the first 2 having labels every 5 seconds, while site_3 has labels at file level.
* `row_id`: this is the unique ID that will be used for the submission
* `seconds`: how long the clip is
* `audio_id`: `row_id` without site

*PS: "nocall" can be also one of the labels (hearing no bird).*

In [None]:
# Inspect test_csv before checking the train data
test_csv = pd.read_csv('../input/birdclef-2021/test.csv')

# Show the first five rows (the default of DataFrame.head)
test_csv.head(5)

## 2.1 Time of the Recording ⏰

> 📌**Note**: 
* `0000` is for the dates 0000-00-00, which are unknown
* `0202`, `0201`, `0199`, `2104` look like anomalous values

In [None]:
# Split the "YYYY-MM-DD" date strings once and keep the year and month parts as strings.
# The vectorised .str accessor replaces two per-row lambdas and yields NaN (instead of
# raising) for missing or malformed dates.
date_parts = train_csv['date'].str.split("-")
train_csv['year'] = date_parts.str[0]
train_csv['month'] = date_parts.str[1]

In [None]:
# Number of recordings per year. The values are sorted in descending order so the
# bars appear from the most recent year to the oldest one.
plt.figure(figsize=(16, 6))
years_desc = train_csv['year'].sort_values(ascending=False)
# Pass the Series explicitly as `x`: newer seaborn releases interpret the first
# positional argument as `data`, not as the variable to count.
ax = sns.countplot(x=years_desc, palette="hls")

plt.title("Audio Files Registration per Year Made", fontsize=16)
plt.xticks(rotation=90, fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel("Frequency", fontsize=14)
plt.xlabel("");

In [None]:
# Number of recordings per month, with the months ordered from highest to lowest value.
plt.figure(figsize=(16, 6))
months_desc = train_csv['month'].sort_values(ascending=False)
# Pass the Series explicitly as `x`: newer seaborn releases interpret the first
# positional argument as `data`, not as the variable to count.
ax = sns.countplot(x=months_desc, palette="hls")

plt.title("Audio Files Registration per Month Made", fontsize=16)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel("Frequency", fontsize=14)
plt.xlabel("");

## 2.2 The Songs

**Type Column**:

> 📌**Note**: This column is a bit messy, as the same description can be found under multiple names. Also, there can be multiple descriptions for multiple sounds (one bird song can mean a different thing from another one in the same recording). Some examples are:
* **begging call** is: `begging call`, `call`, `juvenile` 
* **male call** is: `chimp call`, `male`, `song` etc.

In [None]:
# Build one row per individual song type: split the comma-separated `type` field,
# explode it, then normalise each piece (trim, lower-case, drop non-word characters).
# The resulting frame keeps the original row position in `index` and the cleaned label in `type`.
adjusted_type = (
    train_csv['type']
    .str.split(',')
    .explode()
    .str.strip()
    .str.lower()
    .str.replace(r"\W", "", regex=True)
    # Merge the plural after punctuation has been stripped; doing it before would miss
    # values that still carry surrounding brackets or quotes.
    .replace({'calls': 'call'})
    .reset_index()
)

# Create Top 10 list with song types.
# Read the labels from the value_counts index directly: the column names produced by
# value_counts().reset_index() differ between pandas versions.
top_10 = adjusted_type['type'].value_counts().head(10).index.tolist()
data = adjusted_type[adjusted_type['type'].isin(top_10)]

plt.figure(figsize=(16, 6))
# `x=` is given explicitly because newer seaborn treats the first positional argument as `data`
ax = sns.countplot(x=data['type'], palette="hls", order = data['type'].value_counts().index)

plt.title("Top 10 Song Types", fontsize=16)
plt.ylabel("Frequency", fontsize=14)
plt.yticks(fontsize=13)
plt.xticks(rotation=45, fontsize=13)
plt.xlabel("");

# 3. The Audio Files

## 3.1 Listening to some Recordings

In [None]:
# Build the full path of every recording so the files can be opened directly
base_dir = '../input/birdclef-2021/train_short_audio'
train_csv['full_path'] = base_dir +  "/" + train_csv['primary_label'] + '/' + train_csv['filename']

# Species codes of the five birds that are sampled below
bird_sample_list = ["btywar", "solsan", "tenwar", "hutvir", "wilsni1"]


def _sample_path(label):
    """Return the full path of one recording of `label`, drawn with a fixed random_state of 33."""
    candidates = train_csv[train_csv['primary_label'] == label]
    return candidates.sample(1, random_state = 33)['full_path'].values[0]


# One sampled file per species, in the order of bird_sample_list.
# NOTE: the variable names do not match the species codes they hold.
amered, cangoo, haiwoo, pingro, vesspa = map(_sample_path, bird_sample_list)

## 3.4 Extracting Features from Sounds

> The audio data is composed of:
1. **Sound**: sequence of vibrations in varying pressure strengths (`y`)
2. **Sample Rate**: (`sr`) is the number of samples of audio carried per second, measured in Hz or kHz

In [None]:
# Load a single recording: `y` is the waveform array and `sr` the sample rate returned by librosa
y, sr = librosa.load(amered)

print('y:', y, '\n')
print('y shape:', np.shape(y), '\n')
# The sample rate is expressed in samples per second (Hz), not kHz
print('Sample Rate (Hz):', sr, '\n')

# Verify length of the audio: number of samples / samples per second = duration in seconds
print('Check Len of Audio:', np.shape(y)[0]/sr)

In [None]:
# Remove the silence at the beginning and at the end of the signal.
# librosa.effects.trim returns a pair; only the first element (the trimmed audio) is kept.
trim_result = librosa.effects.trim(y)
audio_file = trim_result[0]

# The trimmed audio is a numpy ndarray
print('Audio File:', audio_file, '\n')
print('Audio File shape:', audio_file.shape)

In [None]:
def _load_trimmed(path):
    """Load one audio file and return (raw signal, sample rate, silence-trimmed signal)."""
    signal, rate = librosa.load(path)
    trimmed, _ = librosa.effects.trim(signal)
    return signal, rate, trimmed


# Load and trim the five sampled recordings
y_amered, sr_amered, audio_amered = _load_trimmed(amered)
y_cangoo, sr_cangoo, audio_cangoo = _load_trimmed(cangoo)
y_haiwoo, sr_haiwoo, audio_haiwoo = _load_trimmed(haiwoo)
y_pingro, sr_pingro, audio_pingro = _load_trimmed(pingro)
y_vesspa, sr_vesspa, audio_vesspa = _load_trimmed(vesspa)

### #1. Sound Waves (2D Representation)

In [None]:
# One stacked panel per recording, each showing the trimmed waveform in its own colour
fig, ax = plt.subplots(5, figsize = (16, 9))
fig.suptitle('Sound Waves', fontsize=16)

wave_specs = [
    (audio_amered, sr_amered, "#A300F9"),
    (audio_cangoo, sr_cangoo, "#4300FF"),
    (audio_haiwoo, sr_haiwoo, "#009DFF"),
    (audio_pingro, sr_pingro, "#00FFB0"),
    (audio_vesspa, sr_vesspa, "#D9FF00"),
]

for axis, name, (signal, rate, shade) in zip(ax, bird_sample_list, wave_specs):
    librosa.display.waveplot(y = signal, sr = rate, color = shade, ax=axis)
    # Label each panel with the species code
    axis.set_ylabel(name, fontsize=13)

### #2. Fourier Transform 🥁

> 📌**Note**: Function that gets a signal in the time domain as input, and outputs its decomposition into frequencies. Transform both the y-axis (frequency) to log scale, and the “color” axis (amplitude) to Decibels, which is approx. the log scale of amplitudes.

In [None]:
# STFT parameters
n_fft = 2048       # FFT window size
hop_length = 512   # hop between successive STFT columns

# Magnitude of the short-time Fourier transform (STFT) of every trimmed recording
D_amered, D_cangoo, D_haiwoo, D_pingro, D_vesspa = [
    np.abs(librosa.stft(signal, n_fft = n_fft, hop_length = hop_length))
    for signal in (audio_amered, audio_cangoo, audio_haiwoo, audio_pingro, audio_vesspa)
]

In [None]:
# Dimensions of the STFT magnitude matrix of the first recording
print('Shape of D object:', D_amered.shape)

In [None]:
# Plot the time-averaged STFT magnitude of each recording against frequency.
# The D_* objects are 2-D STFT magnitude matrices, not time-domain signals, so they
# cannot be drawn meaningfully with waveplot.
fig, ax = plt.subplots(5, figsize = (16, 9), sharex=True)
fig.suptitle('Fourier Transform (mean STFT magnitude)', fontsize=16)

spectrum_specs = [
    (D_amered, sr_amered, "#A300F9"),
    (D_cangoo, sr_cangoo, "#4300FF"),
    (D_haiwoo, sr_haiwoo, "#009DFF"),
    (D_pingro, sr_pingro, "#00FFB0"),
    (D_vesspa, sr_vesspa, "#D9FF00"),
]

for axis, name, (magnitude, rate, shade) in zip(ax, bird_sample_list, spectrum_specs):
    # An STFT of a real signal has 1 + n_fft/2 rows, so n_fft is recovered from the row count
    freqs = librosa.fft_frequencies(sr=rate, n_fft=2 * (magnitude.shape[0] - 1))
    # Average over the frame axis to get one magnitude value per frequency bin
    axis.plot(freqs, magnitude.mean(axis=1), color=shade)
    axis.set_ylabel(name, fontsize=13)

ax[-1].set_xlabel("Frequency (Hz)", fontsize=13);

### #3. Spectrogram 🎷

> 📌**Note**: 
* What is a spectrogram? A spectrogram is a visual representation of the spectrum of frequencies of a signal as it varies with time. When applied to an audio signal, spectrograms are sometimes called sonographs, voiceprints, or voicegrams (wiki).
* Here we convert the frequency axis to a logarithmic one.

In [None]:
# Convert the amplitude spectrograms to decibel-scaled spectrograms (relative to each maximum)
DB_amered = librosa.amplitude_to_db(D_amered, ref = np.max)
DB_cangoo = librosa.amplitude_to_db(D_cangoo, ref = np.max)
DB_haiwoo = librosa.amplitude_to_db(D_haiwoo, ref = np.max)
DB_pingro = librosa.amplitude_to_db(D_pingro, ref = np.max)
DB_vesspa = librosa.amplitude_to_db(D_vesspa, ref = np.max)

# === PLOT ===
# 2x3 grid with the unused sixth panel removed
fig, ax = plt.subplots(2, 3, figsize=(16, 9))
fig.suptitle('Spectrogram', fontsize=16)
fig.delaxes(ax[1, 2])

db_specs = [
    (DB_amered, sr_amered),
    (DB_cangoo, sr_cangoo),
    (DB_haiwoo, sr_haiwoo),
    (DB_pingro, sr_pingro),
    (DB_vesspa, sr_vesspa),
]

# Iterate over the axes in row-major order. This avoids loop variables named `x`/`y`,
# which would overwrite the notebook-level audio signal `y` and break re-runs of earlier cells.
for axis, name, (spec_db, rate) in zip(ax.flat, bird_sample_list, db_specs):
    librosa.display.specshow(spec_db, sr = rate, hop_length = hop_length, x_axis = 'time',
                             y_axis = 'log', cmap = 'cool', ax=axis)
    axis.set_title(name, fontsize=13)

### #4. Mel Spectrogram 🎷
> 📌**Note**: The Mel Scale, mathematically speaking, is the result of some non-linear transformation of the frequency scale. The Mel Spectrogram is a normal Spectrogram, but with a Mel Scale on the y axis.

In [None]:
# Create the Mel Spectrograms.
# - `y` is passed by keyword, which works on both old and new librosa releases
#   (newer ones make it keyword-only).
# - melspectrogram returns a power spectrogram by default, so the decibel conversion
#   must use power_to_db rather than amplitude_to_db.
# - hop_length is passed explicitly so the spectrogram and its time axis always agree.
S_amered = librosa.feature.melspectrogram(y=y_amered, sr=sr_amered, hop_length=hop_length)
S_DB_amered = librosa.power_to_db(S_amered, ref=np.max)

S_cangoo = librosa.feature.melspectrogram(y=y_cangoo, sr=sr_cangoo, hop_length=hop_length)
S_DB_cangoo = librosa.power_to_db(S_cangoo, ref=np.max)

S_haiwoo = librosa.feature.melspectrogram(y=y_haiwoo, sr=sr_haiwoo, hop_length=hop_length)
S_DB_haiwoo = librosa.power_to_db(S_haiwoo, ref=np.max)

S_pingro = librosa.feature.melspectrogram(y=y_pingro, sr=sr_pingro, hop_length=hop_length)
S_DB_pingro = librosa.power_to_db(S_pingro, ref=np.max)

S_vesspa = librosa.feature.melspectrogram(y=y_vesspa, sr=sr_vesspa, hop_length=hop_length)
S_DB_vesspa = librosa.power_to_db(S_vesspa, ref=np.max)

# === PLOT ====
# 2x3 grid with the unused sixth panel removed
fig, ax = plt.subplots(2, 3, figsize=(16, 9))
fig.suptitle('Mel Spectrogram', fontsize=16)
fig.delaxes(ax[1, 2])

mel_specs = [
    (S_DB_amered, sr_amered),
    (S_DB_cangoo, sr_cangoo),
    (S_DB_haiwoo, sr_haiwoo),
    (S_DB_pingro, sr_pingro),
    (S_DB_vesspa, sr_vesspa),
]

# Iterate over the axes in row-major order. This avoids loop variables named `x`/`y`,
# which would overwrite the notebook-level audio signal `y`.
for axis, name, (mel_db, rate) in zip(ax.flat, bird_sample_list, mel_specs):
    # The rows are mel bins, so the frequency axis must be labelled with the mel scale
    librosa.display.specshow(mel_db, sr = rate, hop_length = hop_length, x_axis = 'time',
                             y_axis = 'mel', cmap = 'rainbow', ax=axis)
    axis.set_title(name, fontsize=13)

### #5. Zero Crossing Rate 🚷

> 📌**Note**: the rate at which the signal changes from positive to negative or back.

In [None]:
# Boolean zero-crossing indicator arrays for each of the five trimmed recordings
zero_amered = librosa.zero_crossings(audio_amered, pad=False)
zero_cangoo = librosa.zero_crossings(audio_cangoo, pad=False)
zero_haiwoo = librosa.zero_crossings(audio_haiwoo, pad=False)
zero_pingro = librosa.zero_crossings(audio_pingro, pad=False)
zero_vesspa = librosa.zero_crossings(audio_vesspa, pad=False)

zero_birds_list = [zero_amered, zero_cangoo, zero_haiwoo, zero_pingro, zero_vesspa]

# Print the total number of zero crossings per recording.
# ndarray.sum() counts the True entries in compiled code instead of iterating
# element by element with the builtin sum().
for bird, name in zip(zero_birds_list, bird_sample_list):
    print("{} change rate is {:,}".format(name, bird.sum()))

### #6. Harmonics and Percussive 🎹

> 📌**Note**: 
* Harmonics are characteristics that represent the sound *color*
* Percussive components represent the sound *rhythm and emotion*

In [None]:
# Split the trimmed signal into its harmonic and percussive components
# (librosa.effects.hpss returns them in that order)
y_harm_haiwoo, y_perc_haiwoo = librosa.effects.hpss(audio_haiwoo)

plt.figure(figsize = (16, 6))
# The percussive component is drawn first, so it is listed first in the legend
plt.plot(y_perc_haiwoo, color = '#FFB100')
plt.plot(y_harm_haiwoo, color = '#A300F9')
# Labels corrected: the second HPSS component is "percussive", and the recording held
# in the `haiwoo` variables was sampled from the "tenwar" species.
plt.legend(("Percussive", "Harmonics"))
plt.title("Harmonics and Percussive: Tenwar Bird", fontsize=16);

### #7. Spectral Centroid 🎯

> 📌**Note**: 
Indicates where the ”centre of mass” for a sound is located and is calculated as the weighted mean of the frequencies present in the sound.

In [None]:
# Calculate the Spectral Centroids.
# `y` is passed by keyword (keyword-only in newer librosa) and the sample rate of this
# very recording is used instead of the unrelated notebook-level `sr`.
spectral_centroids = librosa.feature.spectral_centroid(y=audio_cangoo, sr=sr_cangoo)[0]

# Shape is a vector
print('Centroids:', spectral_centroids, '\n')
print('Shape of Spectral Centroids:', spectral_centroids.shape, '\n')

# Computing the time variable for visualization
frames = range(len(spectral_centroids))

# Converts frame counts to time (seconds) using the recording's own sample rate
t = librosa.frames_to_time(frames, sr=sr_cangoo)

print('frames:', frames, '\n')
print('t:', t)

# Function that normalizes the Sound Data
def normalize(x, axis=0):
    """Rescale `x` with scikit-learn's min-max scaling along `axis`.

    Parameters
    ----------
    x : array-like
        Values to rescale.
    axis : int, default 0
        Axis forwarded to ``minmax_scale``.

    Returns
    -------
    The result of ``sklearn.preprocessing.minmax_scale(x, axis=axis)``.
    """
    # Import the submodule explicitly: a bare `import sklearn` does not guarantee
    # that `sklearn.preprocessing` has been loaded.
    from sklearn.preprocessing import minmax_scale

    return minmax_scale(x, axis=axis)

In [None]:
# Plot the normalized spectral centroid on top of the waveform
plt.figure(figsize = (16, 6))
# Use the sample rate of this recording rather than the unrelated notebook-level `sr`
librosa.display.waveplot(audio_cangoo, sr=sr_cangoo, alpha=0.4, color = '#A300F9', lw=3)
plt.plot(t, normalize(spectral_centroids), color='#FFB100', lw=2)
plt.legend(["Spectral Centroid", "Wave"])
# Title corrected: the recording held in the `cangoo` variables was sampled from "solsan"
plt.title("Spectral Centroid: Solsan Bird", fontsize=16);

### #8. Chroma Frequencies

> 📌**Note**: Chroma features are an interesting and powerful representation for music audio in which the entire spectrum is projected onto 12 bins representing the 12 distinct semitones (or chromas) of the musical octave.

In [None]:
# Increase or decrease hop_length to change how granular you want your data to be.
# NOTE: this rebinds the notebook-level `hop_length` that the spectrogram cells also read.
hop_length = 5000

# Chromagram of the recording held in the `vesspa` variables.
# `y` is passed by keyword because newer librosa releases make it keyword-only.
chromagram = librosa.feature.chroma_stft(y=audio_vesspa, sr=sr_vesspa, hop_length=hop_length)
print('Chromogram Vesspa shape:', chromagram.shape)

plt.figure(figsize=(16, 6))
# Pass the sample rate as well so the time axis is computed from this recording's rate
librosa.display.specshow(chromagram, sr=sr_vesspa, x_axis='time', y_axis='chroma',
                         hop_length=hop_length, cmap='twilight')

# Title corrected: the recording held in the `vesspa` variables was sampled from "wilsni1"
plt.title("Chromogram: Wilsni1", fontsize=16);

### #9. Tempo BPM (beats per minute)🎤
> 📌**Note**: Dynamic programming beat tracker.

In [None]:
# Create Tempo BPM variable: beat_track returns a pair whose first element is kept as the tempo.
# `y` is passed by keyword because newer librosa releases make it keyword-only.
tempo_amered, _ = librosa.beat.beat_track(y=y_amered, sr = sr_amered)
tempo_cangoo, _ = librosa.beat.beat_track(y=y_cangoo, sr = sr_cangoo)
tempo_haiwoo, _ = librosa.beat.beat_track(y=y_haiwoo, sr = sr_haiwoo)
tempo_pingro, _ = librosa.beat.beat_track(y=y_pingro, sr = sr_pingro)
tempo_vesspa, _ = librosa.beat.beat_track(y=y_vesspa, sr = sr_vesspa)

# One row per species with its estimated tempo
data = pd.DataFrame({"Type": bird_sample_list ,
                     "BPM": [tempo_amered, tempo_cangoo, tempo_haiwoo, tempo_pingro, tempo_vesspa] })

# Plot
plt.figure(figsize = (16, 6))
ax = sns.barplot(y = data["BPM"], x = data["Type"], palette="hls")

plt.ylabel("BPM", fontsize=14)
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)
plt.xlabel("")
plt.title("BPM for 5 Different Bird Species", fontsize=16);

### #10. Spectral Rolloff 🥏
> 📌**Note**: Is a measure of the *shape of the signal*. It represents the frequency below which a specified percentage of the total spectral energy (e.g. 85%) lies.

In [None]:
# Spectral RollOff Vector.
# `y` is passed by keyword because newer librosa releases make it keyword-only.
spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_amered, sr=sr_amered)[0]

# Computing the time variable for visualization
frames = range(len(spectral_rolloff))
# Converts frame counts to time (seconds) using the recording's own sample rate
t = librosa.frames_to_time(frames, sr=sr_amered)

# The plot: normalized rolloff curve drawn over the waveform
plt.figure(figsize = (16, 6))
librosa.display.waveplot(audio_amered, sr=sr_amered, alpha=0.4, color = '#A300F9', lw=3)
plt.plot(t, normalize(spectral_rolloff), color='#FFB100', lw=3)
plt.legend(["Spectral Rolloff", "Wave"])
plt.title("Spectral Rolloff: Btywar Bird", fontsize=16);