![](https://drive.google.com/uc?id=1FcuhEghc6BeJYDTbghEExQE3y3jlSZD-)



References :

https://www.kaggle.com/code/susnato/birdclef-2022-eda-resnet50-training-tf

https://www.kaggle.com/code/spsayakpaul/mae-keras


In [None]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import StratifiedKFold
import cv2
import matplotlib.pyplot as plt
from math import ceil
import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")

import wandb

import re
import gc
import glob
import random
import seaborn as sns
import plotly.graph_objects as go
%matplotlib inline

import IPython.display as ipd

# Map libraries
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon


# Audio specific imports
import librosa as lb
import librosa.display

<img src="https://camo.githubusercontent.com/dd842f7b0be57140e68b2ab9cb007992acd131c48284eaf6b1aca758bfea358b/68747470733a2f2f692e696d6775722e636f6d2f52557469567a482e706e67">

> I will be integrating W&B for visualizations and logging artifacts!
> 
> [Birdclef 2022 project on W&B Dashboard](https://wandb.ai/usharengaraju/BirdClef2022)
> 
> - To get the API key, create an account in the [website](https://wandb.ai/site) .
> - Use secrets to use API Keys more securely

In [None]:
# Read the W&B API key from the Kaggle secret named "api_key" so the key never
# appears in the notebook itself, then authenticate the wandb client with it.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_key = user_secrets.get_secret("api_key")
wandb.login(key = wandb_key)

In [None]:
# Load the training metadata CSV and preview its first five rows.
METADATA_FILE_PATH = '../input/birdclef-2022/train_metadata.csv'
metadata_df = pd.read_csv(METADATA_FILE_PATH)
metadata_df.head(5)

# **<span style="color:#F7B2B0;">Top 20 Primary Labels</span>**

In [None]:
# Count recordings per primary label and keep the 20 most frequent labels
# (value_counts() returns them in descending order of frequency).
species = metadata_df['primary_label'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(15, 7))
# x and y are passed as explicit vectors, so no `data=` argument is given:
# handing the Series itself to `data` is redundant and, depending on the
# seaborn version, can make seaborn try to look the vectors up inside it.
# The plot is drawn on the axes created above instead of an implicit one.
sns.barplot(x=species.values, y=species.index, palette="copper", ax=ax)
plt.show()

In [None]:
# Count plot of the 'rating' column, with the bars ordered from the most
# frequent rating value to the least frequent one.
plt.figure(figsize=(16, 6))
ax = sns.countplot(x = metadata_df['rating'], order = metadata_df['rating'].value_counts().index , palette = "copper")

plt.title("Sound quality rating", fontsize=16)
plt.ylabel("Count", fontsize=14)
plt.yticks(fontsize=13)
plt.xticks(rotation=45, fontsize=13)
# The trailing semicolon keeps the notebook from echoing the return value.
plt.xlabel("");

In [None]:
# Root folder of the training audio. Every entry directly under it is counted
# as one species (presumably one sub-directory per species — verify on disk).
train_audio = '../input/birdclef-2022/train_audio/'
print(f'Number of unique bird species: {len(os.listdir(train_audio))}')

# **<span style="color:#F7B2B0;">Visualize number of recordings per Label using W&B</span>**

W&B Tables are used to log and visualize data and model predictions. 

📌Interactively explore your data

📌Compare changes precisely across models, epochs, or individual examples

📌Understand higher-level patterns in your data

📌Capture and communicate your insights with visual samples

In [None]:
# One row per entry under train_audio: the entry name and how many items the
# corresponding folder contains.
species_ids = os.listdir(train_audio)
recordings_per_label = pd.DataFrame.from_dict({
    'species_id': species_ids,
    'num_audio': [len(os.listdir(train_audio + species_id))
                  for species_id in species_ids],
})

run = wandb.init(project='BirdClef2022', group='EDA')

# Sort (count, species) pairs ascending, then walk them backwards so the table
# rows go from the largest count to the smallest.
ranked_pairs = sorted(zip(recordings_per_label.num_audio.values,
                          recordings_per_label.species_id.values))
data = []
for val, label in reversed(ranked_pairs):
    data.append([label, val])

table = wandb.Table(data=data, columns=["species_id", "num_audio"])
bar_chart = wandb.plot.bar(table, "species_id", "num_audio",
                           title="Number of recordings per label")
wandb.log({"recordings_per_label": bar_chart})

# Close the run so the logged chart is flushed to the W&B project.
wandb.finish()

![](https://i.imgur.com/mlfAY15.png)

In [None]:
def configure_device():
    """Select the best available accelerator and build a distribution strategy.

    A TPU connection is attempted first. If that fails, the logical GPUs are
    used when there are any; otherwise TensorFlow's current default strategy
    is returned.

    Returns:
        tuple: ``(strategy, device, tpu)`` where ``strategy`` is the
        ``tf.distribute`` strategy to build models under, ``device`` is one
        of ``'TPU'``, ``'GPU'`` or ``'CPU'``, and ``tpu`` is the connected
        cluster resolver, or ``None`` when no TPU is used.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()  # connect to tpu cluster
        strategy = tf.distribute.TPUStrategy(tpu)  # get strategy for tpu
        print('Num of TPUs: ', strategy.num_replicas_in_sync)
        device = 'TPU'
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still propagate; any TPU setup failure falls back
        # to GPU/CPU detection below.
        tpu = None
        gpus = tf.config.list_logical_devices('GPU')  # get logical gpus
        ngpu = len(gpus)
        if ngpu:  # at least one GPU is available
            strategy = tf.distribute.MirroredStrategy(gpus)  # single-GPU or multi-GPU
            print("> Running on GPU", end=' | ')
            print("Num of GPUs: ", ngpu)
            device = 'GPU'
        else:  # no GPU found: stay on the default strategy
            print("> Running on CPU")
            strategy = tf.distribute.get_strategy()
            device = 'CPU'
    return strategy, device, tpu

In [None]:
# Detect the accelerator once; `strategy` is used later to build the model.
strategy, device, tpu = configure_device()
# tf.data auto-tuning sentinel (for options such as parallel calls/prefetch).
AUTO = tf.data.experimental.AUTOTUNE


In [None]:
import re
def count_data_items(filenames):
    """Sum the item counts embedded in a list of file names.

    Each name is expected to contain ``-<digits>.`` (for example
    ``shard-0-128.tfrec``); the digits of the first such occurrence are read
    as the number of items stored in that file.

    Args:
        filenames: Iterable of file name strings.

    Returns:
        The NumPy sum of all per-file counts.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    per_file_counts = []
    for name in filenames:
        found = count_pattern.search(name)
        per_file_counts.append(int(found.group(1)))
    return np.sum(per_file_counts)

# Cloud Storage location of the TFRecord shards; every '*.tfrec' file directly
# under it is collected and the number of shards is reported.
GCS_PATH = "gs://kds-cd6394f23429c6b928662c3e4c9479f1b4e4371b159e5633d5a79b6d"
ALL_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/*.tfrec')
print('NUM TFRECORD FILES: {:,}'.format(len(ALL_FILENAMES)))


In [None]:
# tf.config.run_functions_eagerly(True)
def parse_tfr_element(element):
    """Parse one serialized example into an ``(audio, label)`` pair.

    Args:
        element: Scalar string tensor holding one serialized example, as
            produced by ``tf.data.TFRecordDataset``.

    Returns:
        tuple: ``(audio, label)`` — the ``audio`` feature as a string tensor
        and the ``label`` feature as an int64 tensor.
    """
    # Schema of the stored record. 'filename' and 'time' are not returned but
    # stay in the spec so parsing (including the check that these fixed-length
    # features are present) behaves exactly as before.
    feature_spec = {
        'filename': tf.io.FixedLenFeature([], tf.string),
        'time': tf.io.FixedLenFeature([], tf.int64),
        'audio': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
    }

    content = tf.io.parse_single_example(element, feature_spec)

    return (content['audio'], content['label'])

# Read the serialized examples from every TFRecord shard.
dataset = tf.data.TFRecordDataset(ALL_FILENAMES)
# Shuffle the still-serialized records with a 75,000-element buffer.
dataset=dataset.shuffle(75000)
# Parse every record into an (audio, label) pair.
dataset = dataset.map(parse_tfr_element)
# dataset = dataset.batch(10)

In [None]:
from tqdm import tqdm
# Decode a 200-record sample of the dataset into in-memory images and labels.
train_x = []
train_y = []

for sample in tqdm(dataset.take(200)):
    # sample[0] holds the encoded image bytes; view them as uint8 for OpenCV.
    # np.frombuffer replaces np.fromstring, whose binary mode is deprecated.
    x = np.frombuffer(sample[0].numpy(), dtype='uint8')
    image = cv2.imdecode(x, cv2.IMREAD_UNCHANGED)
    # cv2.resize takes (width, height), so the array becomes 48 rows x 128 cols.
    image = cv2.resize(image, (128, 48))
    train_x.append(image)
    train_y.append(sample[1].numpy())

In [None]:
# train_X = np.asarray(train_x)
# Collect the labels into a single NumPy array and drop the Python list.
train_Y = np.asarray(train_y)
del train_y

In [None]:
# Flatten the label array to one dimension.
train_Y = train_Y.ravel()

In [None]:
# Shape of the distinct labels present in the sample, arranged as one column.
np.unique(train_Y).reshape(-1, 1).shape

In [None]:
# Column vector holding every class id 0..150 as a float, shape (151, 1).
jg = np.arange(151, dtype=float).reshape(-1, 1)
jg.shape

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Fit on the full column of class ids so every one of the 151 classes gets an
# output column, including classes that do not occur in this sample.
ohe = OneHotEncoder()
ohe.fit(jg)

# One-hot encode all labels in a single call instead of one sample at a time.
label_column = np.asarray(train_Y).reshape(-1, 1)
if len(label_column):
    Y_vals = ohe.transform(label_column).toarray()
else:
    # The encoder rejects empty input; produce an empty (0, 151) matrix instead.
    Y_vals = np.empty((0, 151))
print(Y_vals.shape)
del train_Y

# Keep only the images with the expected 48 x 128 x 3 layout, together with
# their labels, so that inputs and targets stay aligned. An explicit shape
# test is used instead of `assert` inside a bare `except: pass`, which would
# stop filtering under `python -O` and would hide unrelated errors.
train_x_placeholder = []
train_y = []
for image, one_hot in zip(train_x, Y_vals):
    if getattr(image, "shape", None) == (48, 128, 3):
        train_x_placeholder.append(image)
        train_y.append(one_hot)
del Y_vals

In [None]:
# Convert the filtered images and their one-hot labels from Python lists into
# tensors that can be passed to model.fit.
train_x_placeholder = tf.convert_to_tensor(train_x_placeholder)
train_y = tf.convert_to_tensor(train_y)

This tutorial aims to explain the concepts and terminology of the research paper "Masked Autoencoders Are Scalable Vision Learners".




## **<span style="color:#e76f51;">Context</span>**

`MAE (Masked autoencoders)` are self-supervised models used to reconstruct the image with missing pixels. It has two core designs consisting of encoder-decoder and masking(hiding) of pixels. Firstly for training purposes, up to 75% of the tokens are masked and removed and the remaining patch is encoded. Since the output of an autoencoder has the same number of tokens as input, the masked tokens are inserted again and with the lightweight decoder, the input image is reconstructed. After pre-training, the decoder is discarded and the encoder is applied to complete images(no missing pixels) for recognition tasks. The above model allows us to train the data faster and improve accuracy. This model works well on a variety of image cases and outperforms supervised training and also scales effectively.

The transformer architecture has been successfully applied to Natural Language Processing(NLP) using autoregressive language modelling and masked encoding, wherein a portion of the data is removed and models are trained to predict the missing data. However, computer vision has been predominantly associated with Convolutional Neural Network(CNN). The paper explores the usage of masked autoencoders in computer vision

Previously, autoencoding wasn’t used in computer vision due to the following reasons-

📌 It was difficult to integrate masked tokens or positional embedding into CNN. However with Vision Transformers(ViT), this problem was solved. ViT slightly outperforms CNN for a large dataset(more than 100 million images). In ViT, we split the image into fixed size patches, vectorize them, add positional embedding and feed the vectors into a transformer encoder to train the model. 

📌 Languages are information-dense, and predicting missing words is a sophisticated task that requires sophisticated language understanding, whereas missing patches of an image can be recreated with little high-level understanding. To overcome this, a large proportion of the image patches is masked (removed), reducing redundancy and requiring a higher level of understanding.

📌 The decoder plays different roles in reconstructing images and text. Text has a high level of semantic information (information that refers to facts, concepts and ideas which we have accumulated over the course of our lives), while images have a low level of semantic information. Thus the decoder design plays an important role in reconstructing images.

The paper presents a simple, effective and scalable form of MAE (Masked Autoencoder) for visual representation learning. In MAE, random patches from the input space are masked and these random patches are reconstructed in pixel space. It has an asymmetric encoder-decoder design. The encoder operates on the tokens which remain after removal of the masked tokens, and a lightweight decoder reconstructs the image from the latent representation and masked tokens. With a high masking ratio (75%), high accuracy can be achieved, reducing the overall training time by more than 3x and also reducing memory consumption. MAE pre-training helps data-hungry models like ViT-Huge to improve their performance. The paper also evaluates transfer learning on a variety of downstream tasks such as object detection, instance segmentation, and semantic segmentation. In these tasks, the proposed pre-trained model achieves better results than supervised pre-trained models.



## **<span style="color:#e76f51;">Masked language modelling</span>**

`Masked language modelling` is successful for pre-training in NLP. In methods such as BERT and GPT(methods used for pre-training in NLP), the sequences of words from the input were removed and the model is trained to predict the missing sequence. These models scale efficiently and work for a variety of downstream tasks .

## **<span style="color:#e76f51;">Autoencoding</span>**

`Autoencoding` uses a type of neural network that is trained to copy its input to its output. It has an encoder that converts the input vector into a code vector (latent representation) using recognition weights and a decoder that regenerates the input from the code vector using generative weights. Denoising autoencoders (DAE) corrupt an input signal and learn to predict the original signal. DAE is used to extract a representation from the encoder that is robust to the introduction of noise. DAE can be constructed in many ways, such as masking pixels or removing the colour of the input. MAE is a kind of denoising autoencoder but differs from the classical DAE in many ways.

## **<span style="color:#e76f51;">Masked image encoding</span>**

`Masked image encoding` is used to recreate images that have been corrupted by masking. Context encoders are Convolutional Neural Networks that generate patches of missing pixels on the basis of their surrounding pixels. The success of unsupervised learning using transformers in NLP has prompted a similar method to be applied to images. `iGPT` operates on sequences of pixels using a sequence transformer to predict unknown pixels autoregressively. `ViT (Vision Transformer)` masks patches of images and, using transformers, predicts the image type. Vision Transformer (ViT), using self-supervision, attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train. Recently, `BEiT (Bidirectional Encoder representation from Image Transformers)` has been used to pretrain image transformers. We break the original image into tokens (patches), randomly mask a few tokens and try to recover the original tokens by fine-tuning the model.

## **<span style="color:#e76f51;">Self Supervised Learning</span>**

`Self Supervised Learning` has been used significantly in language and is now being used in computer vision. Contrastive learning is used to train a CNN to classify similar and dissimilar images. Contrastive methods strongly depend on data augmentation. Autoencoding is based on a different concept, and it exhibits different behaviours as we will present.    

## **<span style="color:#e76f51;">Masked Autoencoders are Scalable Vision Learners</span>**

Masked Autoencoder (MAE) follows a simple autoencoding approach, wherein an encoder maps an input into a latent representation and a decoder reconstructs the original image from the latent representation and masked tokens. The paper proposes an asymmetric design, as it allows the encoder to operate on the input after the masked tokens have been removed from it.

## **<span style="color:#e76f51;">Masking</span>**

The image is divided into  regular non-overlapping patches and a subset of the patches are randomly sampled , masking(removing) the other patches. This is also called “random sampling”, i.e., sampling random patches without replacement, along with uniform distribution. Random Sampling with a high masking ratio eliminates redundancy and reduces the chance of being solved by extrapolation from the visible patches. A uniform distribution prevents centre bias(i.e. More masked patches near centre) and we get a highly sparse input, which helps in designing an efficient encoder. 

## **<span style="color:#e76f51;">MAE Encoder</span>**

ViT(Vision Transformer) is applied only on the patches that are not masked. The encoder embeds the visible patches by linear projection with added positional embedding and then processes it using transformer blocks. Masked tokens are vectors that indicate the presence of a missing patch to be predicted. Since masked patches are removed and no mask tokens are used, large encoders can be trained with a fraction of computation and memory. MAE Encoder is used during pre training and testing as well.

![](https://drive.google.com/uc?id=140M9fJjbdqZ7sSR_R3WrnlSPPpYbhix6)

## **<span style="color:#e76f51;">MAE Decoder</span>**

The inputs to the decoder are encoded visible patches and mask tokens. The total count of tokens is same as in input images.  All the tokens have positional embedding in them to know their location in the image. The decoder has transformer blocks in them. MAE decoders are used only during pre-training and hence can be designed independently of the encoder. Due to asymmetric design, a full set of tokens can be processed by lightweight decoders, which significantly reduce pre-training time.

![](https://drive.google.com/uc?id=1oGNCu4Jo6N90rk9yHgMtJa1wTFo-8OPY)

## **<span style="color:#e76f51;">Reconstruction Target</span>**
 
MAE reconstructs the input image by predicting the pixel values for each masked patch. Each element in the decoder’s output is a vector of pixel values representing a patch. The last layer of the decoder is a linear projection whose number of output channels equals the number of pixel values in a patch. The decoder’s output is reshaped to form a reconstructed image. The loss function computes the mean squared error between the reconstructed and original images on the masked patches. We also study the results after normalising the pixel values of each masked patch, as using normalised pixel values improves the accuracy of the experiments.

## **<span style="color:#e76f51;">Implementation</span>**

In MAE pre-training, a token is generated for every input patch by adding a positional embedding to the linear projection of the patch. The list of tokens is randomly shuffled and the last portion of the list is removed depending on the masking ratio. The remaining tokens are encoded, and then a list of masked tokens is appended to make the total number of tokens equal to the number of input tokens. The full list is unshuffled to align the tokens with their original positions. The full list is decoded and the original image is reconstructed. This process has negligible overhead, as shuffling and unshuffling operations are fast and no sparse operations (operations performed on matrices consisting of row and column numbers of non-zero numbers) are needed.

![](https://drive.google.com/uc?id=1GkA53nU5xp7hcYBHFldBLYOYn6v9vdMg)



In [None]:
!pip install -q noisereduce 

In [None]:
!pip install tensorflow_addons 

In [None]:
import os
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Use the fully-qualified option names. The short forms 'max_rows' and
# 'max_columns' are treated as patterns; in newer pandas releases they match
# more than one option (e.g. the 'styler.render.*' ones) and raise OptionError.
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 100)

from tensorflow.keras import layers
from tensorflow.keras import models
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras

import noisereduce as nr
from math import ceil

import random

# Setting seeds for reproducibility.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [None]:
# `seed` is defined in the previous cell.
# NOTE: PYTHONHASHSEED is read when the interpreter starts, so setting it here
# can only influence child processes, not hashing in the running kernel.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)

# AUDIO / SPECTROGRAM
DURATION = 15
TEST_DURATION = 5
SAMPLE_RATE = 32000
SPEC_SHAPE = (48, 128)  # previously assigned twice with this same value
FMIN = 500
FMAX = 12500

# DATA
BUFFER_SIZE = 1024
BATCH_SIZE = 256
AUTO = tf.data.AUTOTUNE
INPUT_SHAPE = (48, 128, 3)
NUM_CLASSES = 151

# OPTIMIZER
LEARNING_RATE = 5e-3
WEIGHT_DECAY = 1e-4

# TRAINING
EPOCHS = 5

# AUGMENTATION
IMAGE_SIZE = 48  # First spatial dimension of the model input.
IMAGE_SIZE1 = 128  # Second spatial dimension of the model input.
PATCH_SIZE = 6  # Size of the patches to be extracted from the input images.
# Patches are cut with stride PATCH_SIZE and "VALID" padding, so partial
# patches at the border are dropped: (48 // 6) * (128 // 6) = 8 * 21 = 168.
NUM_PATCHES = (IMAGE_SIZE // PATCH_SIZE) * (IMAGE_SIZE1 // PATCH_SIZE)

# ENCODER and DECODER
LAYER_NORM_EPS = 1e-6
ENC_PROJECTION_DIM = 128
ENC_NUM_HEADS = 4
ENC_LAYERS = 3
ENC_TRANSFORMER_UNITS = [
    ENC_PROJECTION_DIM * 2,
    ENC_PROJECTION_DIM,
]  # Size of the transformer layers.

## **<span style="color:#e76f51;">Masking ratio</span>**

A high masking ratio is optimal for MAE. A ratio of 75% is good for both linear probing and fine-tuning. In other computer vision models, the masking ratio is lower (between 20% and 50%), while in language the masking ratio is lower still, around 15%.

## **<span style="color:#e76f51;">Mask Tokens</span>**


The masked tokens are dropped and applied again after encoding. If masked tokens are used during encoding , the accuracy drops by 14% in linear probing and 1% in fine-tuning. This is because, in pre-training, a large proportion of tokens are masked and the encoder pre-trains on these tokens which are not part of the uncorrupted image, reducing accuracy. The masked tokens are removed as then encoder pre-trains only on the patches that exist in un-corrupted images.
Removing masked token reduces the computational resources .

## **<span style="color:#e76f51;">Mask Sampling Strategy</span>**

In the block-wise mask sampling strategy, large blocks of pixels are removed. At a masking ratio of 50%, fine-tuning and linear probing do not degrade much. But with a higher masking ratio of 75%, the accuracy decreases considerably. Also, the reconstruction observed is much blurrier due to the higher training loss. Grid-wise sampling has a lower training loss; however, the representation quality is low. Simple random sampling works best for MAE, with a high masking ratio providing high speed and good accuracy.



In [None]:
class Patches(layers.Layer):
    """Cut a batch of images into flattened, non-overlapping square patches.

    Args:
        patch_size: Side length of each square patch; also used as the stride,
            so patches do not overlap.
        **kwargs: Standard ``Layer`` keyword arguments (for example ``name``),
            forwarded to the base class.
    """

    def __init__(self, patch_size=PATCH_SIZE, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Return the patches of ``images`` as ``(batch, num_patches, patch_dims)``.

        ``images`` is a 4-D image batch. With "VALID" padding, rows/columns
        that do not fill a whole patch are dropped.
        """
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",
        )
        # Flatten the grid of patches into a single sequence axis.
        patch_dims = patches.shape[-1]
        patches = tf.reshape(patches, [batch_size, -1, patch_dims])
        return patches

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config

In [None]:
class PatchEncoder(layers.Layer):
    """Project patches linearly and add a learned position embedding.

    Args:
        num_patches: Number of patches per image; also the size of the
            position-embedding table.
        projection_dim: Output size of both the projection and the embedding.
        **kwargs: Standard ``Layer`` keyword arguments (for example ``name``),
            forwarded to the base class.
    """

    def __init__(self, num_patches=NUM_PATCHES, projection_dim=ENC_PROJECTION_DIM, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        self.projection_dim = projection_dim
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch):
        """Return ``projection(patch) + position_embedding``.

        Assumes ``patch`` is ``(batch, num_patches, patch_dims)`` so that the
        position embeddings broadcast over the batch — TODO confirm for
        callers other than ``Patches``.
        """
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        encoded = self.projection(patch) + self.position_embedding(positions)
        return encoded

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "num_patches": self.num_patches,
                "projection_dim": self.projection_dim,
            }
        )
        return config

In [None]:
def mlp(x, dropout_rate, hidden_units):
    """Apply a stack of GELU-activated ``Dense`` layers, each followed by dropout.

    Args:
        x: Input tensor.
        dropout_rate: Rate passed to every ``Dropout`` layer.
        hidden_units: Iterable with the number of units of each ``Dense`` layer,
            applied in order.

    Returns:
        The output tensor of the last dropout layer (``x`` itself when
        ``hidden_units`` is empty).
    """
    for width in hidden_units:
        dense_out = layers.Dense(width, activation=tf.nn.gelu)(x)
        x = layers.Dropout(dropout_rate)(dense_out)
    return x


In [None]:
def create_vit_classifier():
    """Build the Vision Transformer classifier as a Keras functional model.

    The input image is split into patches, the patches are embedded, passed
    through ``ENC_LAYERS`` transformer blocks, layer-normalised, averaged over
    the patch axis and classified with a softmax over ``NUM_CLASSES`` classes.

    Returns:
        keras.Model: The uncompiled model.
    """

    def transformer_block(tokens):
        # Normalise, self-attend, then add the block input back (skip 1).
        normed = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(tokens)
        attended = layers.MultiHeadAttention(
            num_heads=ENC_NUM_HEADS, key_dim=ENC_PROJECTION_DIM, dropout=0.1
        )(normed, normed)
        after_attention = layers.Add()([attended, tokens])
        # Normalise, run the MLP, then add the attention output back (skip 2).
        hidden = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(after_attention)
        hidden = mlp(hidden, hidden_units=ENC_TRANSFORMER_UNITS, dropout_rate=0.1)
        return layers.Add()([hidden, after_attention])

    inputs = layers.Input(shape=(IMAGE_SIZE, IMAGE_SIZE1, 3))

    # Image -> patch sequence -> embedded tokens.
    patch_sequence = Patches()(inputs)
    tokens = PatchEncoder()(patch_sequence)

    for _ in range(ENC_LAYERS):
        tokens = transformer_block(tokens)

    # Collapse the token sequence into one [batch_size, projection_dim] vector.
    normalised = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(tokens)
    pooled = layers.GlobalAveragePooling1D()(normalised)

    # Class probabilities.
    outputs = layers.Dense(NUM_CLASSES, activation="softmax")(pooled)

    return keras.Model(inputs=inputs, outputs=outputs)

## **<span style="color:#e76f51;">WandbCallback</span>**



In [None]:
# Start a W&B run and create the Keras callback that is passed to model.fit.
run = wandb.init(project='BirdClef2022', group='EDA')
# log_weights=True presumably makes the callback record layer weights as well
# — confirm against the W&B documentation.
wandb_callback = wandb.keras.WandbCallback(log_weights=True)


In [None]:
# Build, compile and train inside the strategy scope so the model's variables
# are created under the strategy selected by configure_device().
with strategy.scope():
    vit_model = create_vit_classifier()
    vit_model.compile(
        optimizer='adam',
        # Targets are one-hot vectors, hence the non-sparse loss.
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

    # NOTE(review): batch_size=1 ignores the BATCH_SIZE constant — confirm
    # that this is intentional.
    vit_model.fit(
        train_x_placeholder,
        train_y,
        batch_size=1,
        epochs=EPOCHS,
        # Keras documents `callbacks` as a list of callback instances.
        callbacks=[wandb_callback],
        verbose=1,
    )
 

In [None]:
# Print the model's layer-by-layer summary.
vit_model.summary()

In [None]:
# Save the trained weights to 'BirdClef.h5', replacing any existing file.
vit_model.save_weights('BirdClef.h5', overwrite=True)

The core of deep learning consists of simple algorithms that scale up well. While self-supervised learning methods are used in NLP due to exponentially scaling models, computer vision still primarily has supervised models. In this paper, the authors observe that using an autoencoder - a simple self-supervised method similar to techniques in NLP - provides scalable benefits. Self-supervised learning in vision is on the same path as in NLP. Images and languages are different types of signals and these differences must be addressed carefully. Images do not have a semantic decomposition like languages, and instead of attempting to remove objects like we do in language, random patches that most likely do not form a semantic segment are removed. Thus, the MAE model reconstructs pixels, which are not semantic entities. This behaviour occurs by way of a rich hidden representation inside the MAE. The method predicts content based on statistics learned from the training dataset and will reflect biases in those data, including the ones with a negative societal impact, or generate non-existent content.
