In [None]:
!nvidia-smi

In [None]:
!ls

In [None]:
!pip install -q pysndfx SoundFile audiomentations pretrainedmodels efficientnet_pytorch resnest

In [None]:
import numpy as np
import librosa as lb
import librosa.display as lbd
import soundfile as sf
from  soundfile import SoundFile
import pandas as pd
from  IPython.display import Audio
from pathlib import Path

import torch
from torch import nn, optim
from  torch.utils.data import Dataset, DataLoader

from resnest.torch import resnest50

from matplotlib import pyplot as plt

import os, random, gc
import re, time, json
from  ast import literal_eval


from IPython.display import Audio
from sklearn.metrics import label_ranking_average_precision_score

from tqdm.notebook import tqdm
import joblib

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic behaviour.

    Arguments:
        seed {int} -- Seed shared by all generators (default: {42})
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner can select different kernels from run to run.
    torch.backends.cudnn.benchmark = False
seed_everything()

In [None]:
#Find paths using
#Path("../")

In [None]:
#Static Variables

# Length of the target vector built by the dataset, i.e. the number of classes.
NUM_CLASSES = 397
# Presumably the audio sampling rate in Hz -- TODO confirm against the mel-generation code.
SR = 32_000
# Multiplied with SR to obtain the clip length in samples; presumably seconds -- TODO confirm.
DURATION = 7

# Upper bound on how many leading entries of each .npy file are kept when the
# training set is cached (5 with the current value). Raise it on machines with
# more RAM, e.g. Colab with High Memory enabled.
MAX_READ_SAMPLES = 5

DATA_ROOT = Path("../input/birdclef-2021")
# One metadata CSV and one label-id JSON per dataset part; sorted so the order is stable.
MEL_PATHS = sorted(Path("../input").glob("kkiller-birdclef-mels-computer-d7-part?/rich_train_metadata.csv"))
TRAIN_LABEL_PATHS = sorted(Path("../input").glob("kkiller-birdclef-mels-computer-d7-part?/LABEL_IDS.json"))

MODEL_ROOT = Path(".")

In [None]:
print(DATA_ROOT)

In [None]:
print(MEL_PATHS)

In [None]:
#MY
# Show the first four metadata paths. Slicing (instead of indexing 0..3)
# avoids an IndexError when fewer than four dataset parts are available.
for mel_path in MEL_PATHS[:4]:
    print(mel_path)

In [None]:
# Batch size / worker count for training and validation.
# NOTE(review): presumably consumed by DataLoader objects created later in
# the notebook -- not visible in this section.
TRAIN_BATCH_SIZE = 100
TRAIN_NUM_WORKERS = 2

VAL_BATCH_SIZE = 128
VAL_NUM_WORKERS = 2

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Device:", DEVICE)

In [None]:
#MY
# Exploration: the CSV stores `secondary_labels` as the string form of a
# Python list; literal_eval turns each string back into a real list.
# NOTE(review): df.iloc[0, 1] assumes `secondary_labels` is the column at
# position 1 -- TODO confirm against the CSV layout.
df = pd.read_csv(str(MEL_PATHS[0]), index_col=0)
# Before conversion: value and type of the raw cell.
print(df.iloc[0,1])
print(type(df.iloc[0,1]))
df["secondary_labels"] = df["secondary_labels"].apply(literal_eval)
# After conversion: same cell, now parsed.
print(df.iloc[0,1])
print(type(df.iloc[0,1]))
# NOTE(review): indexing [0] fails if the first row's parsed list is empty.
print(df.iloc[0,1][0])

In [None]:
def get_df(mel_paths=MEL_PATHS, train_label_paths=TRAIN_LABEL_PATHS):
    """Load the training metadata of every dataset part plus the label ids.

    Each metadata CSV is read with its first column as index and gets an
    ``impath`` column pointing at
    ``<csv directory>/audio_images/<primary_label>/<filename>.npy``.

    Arguments:
        mel_paths -- Iterable of paths to the metadata CSV files
        train_label_paths -- Iterable of paths to JSON files holding a
            label -> id dictionary; later files override earlier keys

    Returns:
        tuple -- ``(LABEL_IDS, df)`` where ``LABEL_IDS`` is the merged
        dictionary and ``df`` the concatenated metadata (``None`` when
        ``mel_paths`` is empty)
    """
    frames = []
    # Iterate the argument, not the notebook-level MEL_PATHS constant, so
    # callers can actually choose which parts to load.
    for file_path in mel_paths:
        file_path = Path(file_path)
        meta = pd.read_csv(str(file_path), index_col=0)
        images_root = file_path.parent / "audio_images"
        meta["impath"] = [
            images_root / str(label) / "{}.npy".format(filename)
            for label, filename in zip(meta["primary_label"], meta["filename"])
        ]
        frames.append(meta)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    # of growing the frame inside the loop. The original index is kept.
    df = pd.concat(frames) if frames else None

    label_ids = {}
    for file_path in train_label_paths:
        with open(str(file_path)) as f:
            label_ids.update(json.load(f))

    return label_ids, df

In [None]:
#MY
LABEL_IDS_, df_ = get_df()

In [None]:
#MY
print(df_.shape)
print(type(LABEL_IDS_)) #contains all the labels in a dictionary
print(LABEL_IDS_.get("acafly"))

In [None]:
LABEL_IDS, df = get_df()

print(df.shape)
df.head()

In [None]:
df["primary_label"].value_counts()

In [None]:
df["secondary_labels"].value_counts()

In [None]:
df["label_id"].min(), df["label_id"].max()

# Model (needs further study)

In [None]:
def get_model(name, num_classes=NUM_CLASSES):
    """
    Loads a pretrained model and replaces its classification head.
    Supports ResNest, ResNext-wsl, EfficientNet, ResNext and ResNet.

    Arguments:
        name {str} -- Name of the model to load

    Keyword Arguments:
        num_classes {int} -- Number of output classes of the new head (default: {NUM_CLASSES})

    Returns:
        torch model -- Pretrained model whose final linear layer (``fc``,
        ``_fc``, ``classifier`` or ``last_linear``, first one found) has
        ``num_classes`` outputs. If none of these attributes exists the
        model is returned with its original head.
    """
    if "resnest" in name:
        # The notebook only imports `resnest50` by name, so the module itself
        # has to be bound here for the getattr lookup to work.
        import resnest.torch as resnest_torch
        model = getattr(resnest_torch, name)(pretrained=True)
    elif "wsl" in name:
        model = torch.hub.load("facebookresearch/WSL-Images", name)
    elif name.startswith("resnext") or name.startswith("resnet"):
        model = torch.hub.load("pytorch/vision:v0.6.0", name, pretrained=True)
    elif name.startswith("tf_efficientnet_b"):
        # NOTE(review): `timm` is not imported in the visible cells -- import
        # it before using this branch.
        model = getattr(timm.models.efficientnet, name)(pretrained=True)
    elif "efficientnet-b" in name:
        # NOTE(review): `EfficientNet` is not imported in the visible cells
        # (presumably from efficientnet_pytorch) -- confirm and import.
        model = EfficientNet.from_pretrained(name)
    else:
        # NOTE(review): `pretrainedmodels` is installed above but not imported
        # in the visible cells.
        model = pretrainedmodels.__dict__[name](pretrained='imagenet')

    # Swap whichever head attribute the architecture exposes for a fresh
    # linear layer with the requested number of outputs.
    if hasattr(model, "fc"):
        nb_ft = model.fc.in_features
        model.fc = nn.Linear(nb_ft, num_classes)
    elif hasattr(model, "_fc"):
        nb_ft = model._fc.in_features
        model._fc = nn.Linear(nb_ft, num_classes)
    elif hasattr(model, "classifier"):
        nb_ft = model.classifier.in_features
        model.classifier = nn.Linear(nb_ft, num_classes)
    elif hasattr(model, "last_linear"):
        nb_ft = model.last_linear.in_features
        model.last_linear = nn.Linear(nb_ft, num_classes)

    return model

In [None]:
def load_data(df):
    """Read the .npy file of every metadata row into an in-memory dictionary.

    Arguments:
        df {pandas.DataFrame} -- Metadata whose rows provide ``filename``
            and ``impath``

    Returns:
        dict -- Maps each ``filename`` to the array stored at ``impath``,
        truncated to its first ``MAX_READ_SAMPLES`` entries along axis 0
    """
    def _read_record(record):
        # Keep only the leading MAX_READ_SAMPLES entries to bound memory use.
        stored = np.load(str(record.impath))
        return record.filename, stored[:MAX_READ_SAMPLES]

    # joblib spreads the file reads over 4 workers; tqdm reports progress as
    # the delayed jobs are handed to the pool.
    jobs = [joblib.delayed(_read_record)(record) for record in df.itertuples(False)]
    loaded = joblib.Parallel(4)(tqdm(jobs))

    return dict(loaded)


In [None]:
#My
# Inspect the first metadata row to understand how a .npy file is loaded
# into the dictionary and what dimensions the stored array has.

def load_row(row):
    """Return ``(filename, array)`` for one metadata row, reading ``row.impath`` with NumPy."""
    return row.filename, np.load(str(row.impath))

for row in df.itertuples(False):
    print(row)
    print(type(row))

    a = load_row(row)
    print(type(a))
    # `a` is a (str, ndarray) pair. np.shape(a) would have to convert that
    # ragged pair into an array, which recent NumPy versions reject, so
    # report the tuple length instead.
    print(len(a))
    print(a[0])
    print(a[1].shape)

    # Only the first row is needed for this inspection.
    break

In [None]:
# We cache the train set to reduce training time

audio_image_store = load_data(df)
len(audio_image_store)

In [None]:
print("shape:", next(iter(audio_image_store.values())).shape)
lbd.specshow(next(iter(audio_image_store.values()))[0])

In [None]:
#My
# Look at the uncached array of the first metadata row: print the size of
# its first axis, print its first entry and plot that entry.

for row in df.itertuples(False):
    
    # Returns (filename, full array) without the MAX_READ_SAMPLES truncation.
    def load_row(row):
    # impath = TRAIN_IMAGES_ROOT/f"{row.primary_label}/{row.filename}.npy"
        return row.filename, np.load(str(row.impath))
    a = load_row(row)
    # next(iter(shape)) is the first dimension only, not the full shape.
    print("shape:", next(iter(a[1].shape)))
    # next(iter(array)) is the first entry along axis 0.
    print(next(iter(a[1])))
    lbd.specshow(next(iter(a[1])))
    
    # Stop after the first row.
    break

In [None]:
pd.Series([len(x) for x in audio_image_store.values()]).value_counts()

In [None]:
#my
audio_image_store['XC128813.ogg'].shape

#### What is the reason for the many different dimensions of the .npy arrays (images)?

### Dataset Wrapper

In [None]:
class BirdClefDataset(Dataset):
    """Dataset that serves one randomly chosen cached image per metadata row.

    Arguments:
        audio_image_store {dict} -- Maps ``filename`` to an indexable stack
            of images
        meta {pandas.DataFrame} -- Metadata; each row must provide
            ``filename`` and ``label_id``

    Keyword Arguments:
        sr {int} -- Stored and used to derive ``audio_length`` (default: {SR})
        is_train {bool} -- Stored only; not read by any method of this class
            (default: {True})
        num_classes {int} -- Length of the target vector (default: {NUM_CLASSES})
        duration {int} -- Stored and used to derive ``audio_length`` (default: {DURATION})
    """

    def __init__(self, audio_image_store, meta, sr=SR, is_train=True, num_classes=NUM_CLASSES, duration=DURATION):
        self.audio_image_store = audio_image_store
        self.meta = meta
        self.sr = sr
        self.is_train = is_train
        self.num_classes = num_classes
        self.duration = duration
        self.audio_length = self.duration*self.sr

    @staticmethod
    def normalize(image):
        """Scale ``image`` by 1/255 as float32 and stack three copies along a new first axis.

        NOTE(review): dividing by 255 presumably assumes 8-bit image values
        -- confirm against the code that produced the .npy files.
        """
        image = image.astype("float32", copy=False) / 255.0
        # Three identical channels, so a 2-D input becomes (3, H, W).
        image = np.stack([image, image, image])
        return image

    def __len__(self):
        """Number of metadata rows."""
        return len(self.meta)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the metadata row at position ``idx``.

        ``image`` is a randomly selected entry of the row's cached stack
        after ``normalize``; ``target`` is a float32 vector of length
        ``num_classes``.
        """
        row = self.meta.iloc[idx]
        # Read from the store given to the constructor rather than from a
        # notebook-level global of the same name.
        image = self.audio_image_store[row.filename]

        # Pick one of the available entries at random.
        image = image[np.random.choice(len(image))]
        image = self.normalize(image)

        # Label smoothing: every class gets 0.0025 and the row's label_id
        # gets 0.995, which softens the target against noisy annotations.
        t = np.zeros(self.num_classes, dtype=np.float32) + 0.0025
        t[row.label_id] = 0.995

        return image, t

In [None]:
ds = BirdClefDataset(audio_image_store, meta=df, sr=SR, duration=DURATION, is_train=True)
len(ds)

In [None]:
x, y = ds[np.random.choice(len(ds))]
# x, y = ds[0]
x.shape, y.shape, np.where(y >= 0.5)

In [None]:
lbd.specshow(x[0])

In [None]:
y[:5]

## Training the model