# Contrastive pretrain

In [None]:
!pip install -q matplotlib
!pip install -q datasets
!pip uninstall -q pytorch_lightning -y
!pip install -q pytorch_lightning==1.7.0

!pip uninstall -q torchmetrics -y
!pip install -q torchmetrics==0.7.0
!pip install -q torchinfo
!pip install -q transformers

from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/path/to/root/

import torch
import json
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## Load models

In [None]:
from transformers import AutoTokenizer, ClapModel

# Tokenizer and model are loaded from the same pretrained checkpoint.
CLAP_tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

CLAP = ClapModel.from_pretrained("laion/clap-htsat-unfused")
# Move the model's weights onto the selected device.
CLAP = CLAP.to(device)

## Load data

Load the latent representations and their text descriptions.

In [None]:
from data_generation.nsynth import get_nsynth_dataloader

BATCH_SIZE = 16

# HDF5 training set, addressed relative to the current working directory.
# (Plain string: the original f-string had no placeholders.)
training_dataset_path = "data/NSynth/nsynth-STFT-train-52.hdf5"

# Shuffled loader over the training set, requesting latent representations
# and metadata but no timbre embeddings.
training_dataloader = get_nsynth_dataloader(
    training_dataset_path,
    batch_size=BATCH_SIZE,
    shuffle=True,
    get_latent_representation=True,
    with_meta_data=True,
    with_timbre_emb=False,
    task="STFT",
)


In [None]:
from tools import merge_dictionaries

# JSON description files that are merged into a single label mapping.
file_names = ["keywords_drop_0.0.json", "phrases_drop_0.0.json", "synonym2NL.json"]

dicts = []
for file_name in file_names:
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(f"data/NSynth/GPT/{file_name}", "r", encoding="utf-8") as f:
        dicts.append(json.load(f))

labels_mapping = merge_dictionaries(dicts)

## Contrastive pretrain


In [None]:
from model.multimodal_model import train_multi_modal_model

# Name and configuration of the timbre encoder handed to the trainer.
timbre_encoder_name = "timbre_encoder_name"
timbre_encoder_Config = {
    "input_dim": 512,
    "feature_dim": 512,
    "hidden_dim": 1024,
    "num_instrument_classes": 1006,
    "num_instrument_family_classes": 11,
    "num_velocity_classes": 128,
    "num_qualities": 10,
    "num_layers": 3,
}

# Name and configuration of the multimodal model.
mmm_name = "multimodal_model_name"
MMM_config = {
    "text_feature_dim": 512,
    "spectrogram_feature_dim": 1024,
    "multi_modal_emb_dim": 512,
    "num_projection_layers": 2,
    "temperature": 1.0,
    "dropout": 0.1,
    "freeze_text_encoder": False,
    "freeze_spectrogram_encoder": False,
}

# Optimisation settings: learning rates, weight decays and patience.
MMM_training_config = {
    "head_lr": 1e-4,
    "text_encoder_lr": 1e-5,
    "spectrogram_encoder_lr": 1e-4,
    "head_weight_decay": 1e-6,
    "text_encoder_weight_decay": 1e-3,
    "timbre_encoder_weight_decay": 1e-6,
    "patience": 1,
}

# Train from scratch (load_pretrain=False); the trainer returns the model
# together with its optimizer.
mmm, optimizer = train_multi_modal_model(
    device,
    training_dataloader,
    labels_mapping,
    CLAP_tokenizer,
    CLAP,
    timbre_encoder_Config,
    MMM_config,
    MMM_training_config,
    mmm_name,
    BATCH_SIZE,
    max_iter=30000,
    load_pretrain=False,
    timbre_encoder_name=timbre_encoder_name,
    init_loss=5.0,
    save_steps=2500,
)

## Create timbre embeddings

In [None]:
from tqdm import tqdm

# Put the trained model into eval mode and print its `training` flag as a
# sanity check.
mmm.eval()
print(mmm.training)

# JSON description files whose texts are embedded below.
file_names = [
    "keywords_drop_0.0.json",
    "keywords_drop_0.3.json",
    "keywords_drop_0.5.json",
    "phrases_drop_0.0.json",
    "phrases_drop_0.2.json",
    "synonym2NL.json",
]

dicts = []
for file_name in file_names:
    # Decode explicitly as UTF-8, matching the encoding used for the output
    # file written at the end of this cell.
    with open(f"data/NSynth/GPT/{file_name}", "r", encoding="utf-8") as f:
        dicts.append(json.load(f))

encodes2texts_mapping = merge_dictionaries(dicts)
encodes2embeddings_mapping = {}

# Gradients are never used here, so disable autograd instead of building a
# graph for every text and detaching from it afterwards.
with torch.no_grad():
    for key, texts in tqdm(encodes2texts_mapping.items()):
        embeddings = []
        for text in texts:
            tokenized_text = CLAP_tokenizer([text], padding=True, return_tensors="pt").to(device)
            # One text is encoded per call, so row 0 holds its embedding;
            # tolist() converts it to plain Python floats for JSON.
            embedding = mmm.get_text_features(**tokenized_text)[0].cpu().tolist()
            embeddings.append(embedding)
        encodes2embeddings_mapping[key] = embeddings

# Persist the key -> list-of-embeddings mapping as UTF-8 JSON.
with open("data/NSynth/GPT/encodes2embeddings_mapping_TE_STFT.json", "w", encoding="utf-8") as file:
    json.dump(encodes2embeddings_mapping, file, ensure_ascii=False, indent=4)