# 1. Introduction


In this competition, researchers from the Cornell Lab of Ornithology’s Center for Conservation Bioacoustics (CCB) want the Kaggle community to help them build an AI solution that identifies bird species from audio recordings of their calls.

<img src="https://images.unsplash.com/photo-1493236296276-d17357e28888?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=1051&q=80" width="800"></img>

# 2. Analysis preparation

## 2.1. Load packages

Here we load the Python modules we will need for our analysis.

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
%matplotlib inline 
import IPython as ipy
import IPython.display as ipyd
import librosa
import librosa.display
import folium
from folium.plugins import HeatMap, HeatMapWithTime
import plotly.express as px
import sklearn
import warnings
warnings.filterwarnings(action='ignore')

## 2.2. Load the data

Here we load the metadata (csv file).

In [None]:
train_df = pd.read_csv("../input/birdsong-recognition/train.csv")


## 2.3. Glimpse the data

We perform a preliminary analysis of the data, looking at things such as data shape, missing data, and unique values.

In [None]:
print(f"train data: {train_df.shape}")
print(f"train data columns: {list(train_df.columns)}")


In [None]:
pd.set_option('display.max_columns', 50)
train_df.head()

In [None]:
train_df.info()

In [None]:
train_df.describe()

## 2.4. Missing data

In [None]:
def missing_data(data):
    '''
    Summarize missing values per column.
    param: data - dataframe to inspect
    return: dataframe with one column per column of `data` and three rows:
            'Total' (number of nulls), 'Percent' (nulls as a percentage of
            the number of rows) and 'Types' (the column dtype as a string).
    '''
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    # count() on a boolean mask has no nulls to skip, so it is the row count
    null_share = null_counts / null_mask.count() * 100
    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary.transpose()

In [None]:
missing_data(train_df)

## 2.5. Unique values

In [None]:
def unique_values(data):
    '''
    Summarize non-null counts and distinct-value counts per column.
    param: data - dataframe to inspect
    return: dataframe with one column per column of `data` and two rows:
            'Total' (number of non-null entries) and 'Uniques' (number of
            distinct non-null values).
    '''
    summary = data.count().to_frame(name='Total')
    summary['Uniques'] = [data[name].nunique() for name in data.columns]
    return summary.transpose()

In [None]:
unique_values(train_df)

# 3. Data exploration

We will explore the data, starting with the metadata information (csv file).

## 3.1. Features values distribution

In [None]:
def plot_count(feature, title, df, size=1):
    '''
    Plot count of classes / feature (at most the 20 most frequent values),
    with each bar annotated by its share of all rows of df.
    param: feature - the feature (column name in df) to analyze
    param: title - title to add to the graph
    param: df - dataframe from which we plot feature's classes distribution 
    param: size - figure width multiplier (width is 4*size), default 1.
                  For size > 2 the x tick labels are rotated and shrunk.
    '''
    f, ax = plt.subplots(1,1, figsize=(4*size,4))
    # Percentages are relative to every row of df, including rows whose
    # value is null or falls outside the 20 plotted categories.
    total = float(len(df))
    top_values = df[feature].value_counts().index[:20]
    # The column must be passed as `x=`: recent seaborn releases interpret the
    # first positional argument as `data`, not as the x variable.
    g = sns.countplot(x=df[feature], order=top_values, palette='Set1', ax=ax)
    g.set_title("Number and percentage of {}".format(title))
    if(size > 2):
        plt.xticks(rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        ax.text(p.get_x()+p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(100*height/total),
                ha="center") 
    plt.show()  

In [None]:
print(f"playback_used values: {train_df.playback_used.nunique()}")
plot_count("playback_used", "playback_used", train_df, size=1)

In [None]:
print(f"ebird_codes values: {train_df.ebird_code.nunique()}")
plot_count("ebird_code", "ebird_code (first 20 entries)", train_df, size=4)

In [None]:
print(f"channels values: {train_df.channels.nunique()}")
plot_count("channels", "channels", train_df, size=1)

In [None]:
print(f"pitch values: {train_df.pitch.nunique()}")
plot_count("pitch", "pitch", train_df, size=2)

In [None]:
print(f"speed values: {train_df.speed.nunique()}")
plot_count("speed", "speed", train_df, size=2)

In [None]:
print(f"species values: {train_df.species.nunique()}")
plot_count("species", "species (first 20)", train_df, size=4)

eBird codes and species appear to be corresponding values.

In [None]:
print(f"number of notes values: {train_df.number_of_notes.nunique()}")
plot_count("number_of_notes", "number_of_notes", train_df, size=2)

In [None]:
print(f"bird_seen values: {train_df.bird_seen.nunique()}")
plot_count("bird_seen", "bird_seen", train_df, size=1)

In [None]:
print(f"sci_name values: {train_df.sci_name.nunique()}")
plot_count("sci_name", "sci_name (first 20)", train_df, size=4)

In [None]:
print(f"location values: {train_df.location.nunique()}")
plot_count("location", "location (first 20)", train_df, size=4)

In [None]:
print(f"sampling_rate values: {train_df.sampling_rate.nunique()}")
plot_count("sampling_rate", "sampling_rate", train_df, size=3)

In [None]:
print(f"type values: {train_df.type.nunique()}")
plot_count("type", "type (first 20)", train_df, size=4)

In [None]:
print(f"elevation values: {train_df.elevation.nunique()}")
plot_count("elevation", "elevation (first 20)", train_df, size=4)

In [None]:
print(f"latitude values: {train_df.latitude.nunique()}")
plot_count("latitude", "latitude (first 20)", train_df, size=4)

In [None]:
print(f"longitude values: {train_df.longitude.nunique()}")
plot_count("longitude", "longitude (first 20)", train_df, size=4)

Latitude, longitude, elevation can be used to build a map with the observation location and altitude.

In [None]:
print(f"bitrate_of_mp3 values: {train_df.bitrate_of_mp3.nunique()}")
plot_count("bitrate_of_mp3", "bitrate_of_mp3 (first 20)", train_df, size=4)

In [None]:
print(f"volume values: {train_df.volume.nunique()}")
plot_count("volume", "volume", train_df, size=2)

In [None]:
print(f"file_type values: {train_df.file_type.nunique()}")
plot_count("file_type", "file_type", train_df, size=2)

In [None]:
print(f"background values: {train_df.background.nunique()}")
plot_count("background", "background (first 20)", train_df, size=4)

Background is given as the name of the species followed, in parentheses, by the scientific name.

In [None]:
print(f"author values: {train_df.author.nunique()}")
plot_count("author", "author (first 20)", train_df, size=4)

In [None]:
print(f"primary_label values: {train_df.primary_label.nunique()}")
plot_count("primary_label", "primary_label (first 20)", train_df, size=4)

In [None]:
print(f"length values: {train_df.length.nunique()}")
plot_count("length", "length", train_df, size=2)

In [None]:
print(f"time values: {train_df.time.nunique()}")
plot_count("time", "time (first 20)", train_df, size=4)

In [None]:
print(f"country values: {train_df.country.nunique()}")
plot_count("country", "country (first 20)", train_df, size=4)

In [None]:
print(f"recordist values: {train_df.recordist.nunique()}")
plot_count("recordist", "recordist (first 20)", train_df, size=4)

In [None]:
print(f"license values: {train_df.license.nunique()}")
plot_count("license", "license", train_df, size=3)

## 3.2. Geographical distribution

Let's now look at the geographical distribution of the data. We will group by latitude and longitude and count the occurrences of each {latitude, longitude} pair.
Next, we will represent this geographical distribution with a heatmap, where the color intensity is proportional to the number of recordings.

In [None]:
# Count recordings (non-null 'url' entries) for each distinct
# (latitude, longitude) pair.
tmp = train_df.groupby(['latitude', 'longitude'])['url'].count()
# Turn the grouped Series into a flat frame: latitude, longitude, count.
latlong_df = pd.DataFrame(tmp).reset_index()
latlong_df.columns = ['latitude', 'longitude', 'count']
latlong_df.tail()

In [None]:
# Coordinates can hold placeholder text such as "Not specified". Convert both
# columns to numbers (unparsable entries become NaN) and drop the rows without
# a usable latitude/longitude pair, so that the map receives numeric input.
latlong_df = latlong_df.assign(
    latitude=pd.to_numeric(latlong_df['latitude'], errors='coerce'),
    longitude=pd.to_numeric(latlong_df['longitude'], errors='coerce'),
).dropna(subset=['latitude', 'longitude'])

In [None]:
# World map centred on (0, 0); the heat map layer is fed one
# [latitude, longitude, count] row per location.
m = folium.Map(location=[0,0], zoom_start=2)
# NOTE(review): max_val is computed but not used in this cell.
max_val = max(latlong_df['count'])
HeatMap(data=latlong_df[['latitude', 'longitude', 'count']],\
        radius=15, max_zoom=12).add_to(m)
# Last expression of the cell: renders the map in the notebook.
m

We also can group the data on countries.

In [None]:
# Count recordings (non-null 'url' entries) per country.
tmp = train_df.groupby(['country'])['url'].count()
country_df = pd.DataFrame(tmp).reset_index()
country_df.columns = ['country','count']
# Borrow country codes (`iso_alpha`) from plotly's bundled gapminder sample,
# keeping a single year so each country appears once.
df = px.data.gapminder().query("year==2007")
df = df[['country', 'iso_alpha']]
# merge() defaults to an inner join: countries whose name has no exact match
# in the gapminder sample are dropped from country_df.
country_df = country_df.merge(df, on="country")
country_df.head()

In [None]:
# Build one hover label per country, reading the three columns in lockstep.
hover_text = [
    f"country: {name}<br>count: {n_records}<br>country code: {code}"
    for name, n_records, code in zip(
        country_df['country'], country_df['count'], country_df['iso_alpha']
    )
]
country_df['hover_text'] = hover_text

# World map coloured by the number of recordings per country.
fig = px.choropleth(
    country_df,
    locations="iso_alpha",
    hover_name='hover_text',
    color="count",
    projection="natural earth",
    color_continuous_scale=px.colors.sequential.Plasma,
    width=700,
    height=525,
)
# Base-map styling: coastlines, land, ocean, lakes, rivers, country borders.
fig.update_geos(
    showcoastlines=True, coastlinecolor="DarkBlue",
    showland=True, landcolor="LightGrey",
    showocean=True, oceancolor="LightBlue",
    showlakes=True, lakecolor="Blue",
    showrivers=True, rivercolor="Blue",
    showcountries=True, countrycolor="DarkBlue",
)
fig.update_layout(title='Number of observations per country<br>(hover for details)')
fig.show()

## 3.3. Time and location distribution

In [None]:
# Parse the recording date; errors='coerce' turns any value that does not
# match YYYY-MM-DD into NaT instead of raising.
train_df['dated'] = pd.to_datetime(train_df['date'], format='%Y-%m-%d', errors='coerce')

In [None]:
# Split the parsed date into calendar components used by the plots below.
train_df['year'] = train_df['dated'].dt.year
train_df['month'] = train_df['dated'].dt.month
train_df['day'] = train_df['dated'].dt.day
# pandas numbers weekdays from Monday=0 to Sunday=6.
train_df['dayofweek'] = train_df['dated'].dt.dayofweek

In [None]:
def plot_time_variation(df, x='date', y='count', hue=None, size=1, is_log=False):
    '''
    Draw a line plot of one column against another, optionally split by a
    grouping column.
    param: df - dataframe holding the columns to plot
    param: x - column used for the x axis, default 'date'
    param: y - column used for the y axis (also used in the title), default 'count'
    param: hue - optional column used to draw one line per group
    param: size - figure scale factor (figure is 4*size by 3*size), default 1
    param: is_log - when True, the y axis uses a logarithmic scale
    '''
    fig, axis = plt.subplots(1, 1, figsize=(4*size, 3*size))
    sns.lineplot(x=x, y=y, hue=hue, data=df)
    plt.xticks(rotation=90)
    heading = f'{y} grouped by {hue}' if hue else f'{y}'
    plt.title(heading)
    if is_log:
        axis.set(yscale="log")
    axis.grid(color='black', linestyle='dotted', linewidth=0.75)
    plt.show()

In [None]:
agg_df = train_df.groupby(['year'])['url'].count().reset_index()
agg_df.columns = ['year', 'count']

In [None]:
plot_time_variation(agg_df, x='year', y="count", hue=None, size=4)

In [None]:
train_df.columns

In [None]:
agg_df = train_df.groupby(['year', 'bird_seen'])['url'].count().reset_index()
agg_df.columns = ['year', 'bird_seen', 'count']
plot_time_variation(agg_df, x='year', y="count", hue='bird_seen', size=4, is_log=True)

In [None]:
agg_df = train_df.groupby(['year', 'playback_used'])['url'].count().reset_index()
agg_df.columns = ['year', 'playback_used', 'count']
plot_time_variation(agg_df, x='year', y="count", hue='playback_used', size=4, is_log=True)

In [None]:
agg_df = train_df.groupby(['year', 'license'])['url'].count().reset_index()
agg_df.columns = ['year', 'license', 'count']
plot_time_variation(agg_df, x='year', y="count", hue='license', size=4, is_log=True)

In [None]:
print(f"year values: {train_df.year.nunique()}")
plot_count("year", "year", train_df, size=4)

In [None]:
print(f"month values: {train_df.month.nunique()}")
plot_count("month", "month", train_df, size=3)

Going out to record birdsongs happens mostly in May and June, when more than 40% of all records were made.

In [None]:
print(f"day values: {train_df.day.nunique()}")
plot_count("day", "day", train_df, size=4)

In [None]:
print(f"dayofweek values: {train_df.dayofweek.nunique()}")
plot_count("dayofweek", "dayofweek", train_df, size=3)

It looks like recording birdsongs is mainly a weekend activity (which makes sense, since most of the recordists are volunteers): most of the recordings were made on Saturdays and Sundays.

# Signal data exploration

Let's explore now the signal data from the training set.

In [None]:
# Root folder of the training audio; the play/plot helpers join it with an
# ebird_code and a filename to locate a recording.
TRAIN_AUDIO_PATH = "../input/birdsong-recognition/train_audio/"
# Entries directly under the audio root (presumably one folder per ebird_code;
# os.listdir returns them in arbitrary order).
files = os.listdir(TRAIN_AUDIO_PATH)
print(f"train folders: {len(files)}")
print(f"some ebird_code examples: {files[0:10]}")

## Play audio

Let's listen to some of the audio signals.

In [None]:
def play_audio_file(ebird_code, samples=3):
    '''
    Print the metadata of the first recordings of a species and embed an
    audio player for each of them.
    param: ebird_code - species code; also the sub-folder of TRAIN_AUDIO_PATH
                        that holds the species' audio files
    param: samples - maximum number of recordings to play, default 3. When the
                     species has fewer recordings, only those are played.
    '''
    fields = ["filename", "length", "file_type", "volume", "bitrate_of_mp3"]
    # Filter the metadata once instead of once per field and per sample.
    recordings = train_df.loc[train_df.ebird_code == ebird_code, fields]
    # Slicing (rather than indexing by position) avoids an IndexError when
    # the species has fewer than `samples` recordings.
    selected = recordings.iloc[:max(samples, 0)]
    for file_name, length, file_type, volume, bitrate_of_mp3 in selected.itertuples(index=False, name=None):
        audio_file_path = os.path.join(TRAIN_AUDIO_PATH, ebird_code, file_name)
        print(f"ebird_code: {ebird_code} file: {file_name}\nlength: {length}\nvolume: {volume}\nbit rate: {bitrate_of_mp3}\nfile type: {file_type}")
        ipyd.display(ipyd.Audio(audio_file_path))

In [None]:
play_audio_file("aldfly", 2)

In [None]:
play_audio_file("purfin", 2)

In [None]:
play_audio_file("marwre", 2)

In [None]:
play_audio_file("boboli", 2)

In [None]:
play_audio_file("wewpew", 2)

In [None]:
play_audio_file("eawpew", 2)

## Analyze signals


### Signal plots

Let's plot some of the signals in time.

We create a function that loads the first recording listed for a given species and displays its waveform.

In [None]:
def plot_audio_file(ebird_code):
    '''
    Plot the waveform of the first recording listed for a species.
    param: ebird_code - species code; also the sub-folder of TRAIN_AUDIO_PATH
                        that holds the species' audio files
    Raises IndexError when train_df has no recording for ebird_code.
    '''
    sample = 0
    # Filter the metadata once and read every field from the same selection.
    recordings = train_df.loc[train_df.ebird_code == ebird_code]
    file_name = recordings["filename"].values[sample]
    length = recordings["length"].values[sample]
    file_type = recordings["file_type"].values[sample]
    volume = recordings["volume"].values[sample]
    bitrate_of_mp3 = recordings["bitrate_of_mp3"].values[sample]
    audio_file_path = os.path.join(TRAIN_AUDIO_PATH, ebird_code, file_name)
    x, sr = librosa.load(audio_file_path)

    # librosa replaced `waveplot` with `waveshow` (and later removed the old
    # name); use whichever the installed version provides.
    draw_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot

    plt.figure(figsize=(16,6))
    draw_wave(x, sr=sr)
    plt.gca().set_title(f"ebird_code: {ebird_code} file: {file_name}\nlength: {length} volume: {volume} bit rate: {bitrate_of_mp3} file type: {file_type}")
    plt.show()

In [None]:
plot_audio_file("aldfly")

In [None]:
plot_audio_file("purfin")

In [None]:
plot_audio_file("marwre")

In [None]:
plot_audio_file("brebla")

In [None]:
plot_audio_file("boboli")

In [None]:
plot_audio_file("wewpew")

In [None]:
plot_audio_file("eawpew")

### Signal spectrogram

Let's also plot some signal spectrograms. A spectrogram is a visual representation of the spectrum of frequencies of a signal as it varies over time.


In [None]:
def plot_audio_file_spectrogram(ebird_code):
    '''
    Show the spectrogram (STFT magnitude in dB) of the first recording listed
    for a species.
    param: ebird_code - species code; also the sub-folder of TRAIN_AUDIO_PATH
                        that holds the species' audio files
    '''
    plt.figure(figsize=(16,6))
    first = 0
    matches = train_df.loc[train_df.ebird_code == ebird_code]
    file_name, length, file_type, volume, bitrate_of_mp3 = (
        matches[field].values[first]
        for field in ("filename", "length", "file_type", "volume", "bitrate_of_mp3")
    )
    audio_file_path = os.path.join(TRAIN_AUDIO_PATH, ebird_code, file_name)
    signal, rate = librosa.load(audio_file_path)
    # Short-time Fourier transform -> magnitude -> decibels.
    magnitude = np.abs(librosa.stft(signal))
    decibels = librosa.amplitude_to_db(magnitude)
    librosa.display.specshow(decibels, sr=rate, x_axis='time', y_axis='hz')
    plt.gca().set_title(f"Spectrogram - ebird_code: {ebird_code} file: {file_name}\nlength: {length} volume: {volume} bit rate: {bitrate_of_mp3} file type: {file_type}")
    plt.colorbar()

In [None]:
plot_audio_file_spectrogram("aldfly")

In [None]:
plot_audio_file_spectrogram("purfin")

In [None]:
plot_audio_file_spectrogram("marwre")

In [None]:
plot_audio_file_spectrogram("brebla")

In [None]:
plot_audio_file_spectrogram("boboli")

In [None]:
plot_audio_file_spectrogram("wewpew")

In [None]:
plot_audio_file_spectrogram("eawpew")

### Signal spectral rolloff

The spectral rolloff describes the shape of the spectrum: for each frame, it is the frequency below which a given fraction of the total spectral energy (85% by default in librosa) is contained. It therefore marks the point beyond which the high-frequency content becomes negligible.

In [None]:
def normalize(x, axis=0):
    '''
    Rescale x with min-max scaling along the given axis.
    param: x - array-like of values to rescale
    param: axis - axis along which to scale, default 0
    '''
    # A bare `import sklearn` does not guarantee that the `preprocessing`
    # submodule is loaded, so import the function explicitly.
    from sklearn.preprocessing import minmax_scale
    return minmax_scale(x, axis=axis)

def plot_audio_file_spectral_rolloff(ebird_code):
    '''
    Plot the waveform of the first recording listed for a species, overlaid
    with its min-max normalized spectral rolloff (in red).
    param: ebird_code - species code; also the sub-folder of TRAIN_AUDIO_PATH
                        that holds the species' audio files
    Raises IndexError when train_df has no recording for ebird_code.
    '''
    sample = 0
    # Filter the metadata once and read every field from the same selection.
    recordings = train_df.loc[train_df.ebird_code == ebird_code]
    file_name = recordings["filename"].values[sample]
    length = recordings["length"].values[sample]
    file_type = recordings["file_type"].values[sample]
    volume = recordings["volume"].values[sample]
    bitrate_of_mp3 = recordings["bitrate_of_mp3"].values[sample]
    audio_file_path = os.path.join(TRAIN_AUDIO_PATH, ebird_code, file_name)
    x, sr = librosa.load(audio_file_path)

    # The audio must be passed as `y=`: recent librosa releases reject
    # positional audio arguments in feature extractors.
    # NOTE(review): the 0.01 offset presumably keeps silent frames from
    # producing a degenerate rolloff - confirm.
    spectral_rolloff = librosa.feature.spectral_rolloff(y=x + 0.01, sr=sr)[0]
    # One time stamp per rolloff frame; pass sr so the time axis stays aligned
    # with the waveform whatever rate the audio was loaded at.
    t = librosa.frames_to_time(range(len(spectral_rolloff)), sr=sr)

    # librosa replaced `waveplot` with `waveshow` (and later removed the old
    # name); use whichever the installed version provides.
    draw_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot

    plt.figure(figsize=(16,6))
    draw_wave(x, sr=sr, alpha=0.4)
    plt.gca().set_title(f"Spectral rolloff - ebird_code: {ebird_code} file: {file_name}\nlength: {length} volume: {volume} bit rate: {bitrate_of_mp3} file type: {file_type}")
    plt.plot(t, normalize(spectral_rolloff), color='r')

In [None]:
plot_audio_file_spectral_rolloff("aldfly")

In [None]:
plot_audio_file_spectral_rolloff("purfin")

In [None]:
plot_audio_file_spectral_rolloff("marwre")

In [None]:
plot_audio_file_spectral_rolloff("brebla")

In [None]:
plot_audio_file_spectral_rolloff("boboli")

In [None]:
plot_audio_file_spectral_rolloff("wewpew")

In [None]:
plot_audio_file_spectral_rolloff("eawpew")