### Prerequisites


In [None]:
import dotenv

# Load variables from a local .env file (if one is found) into the process
# environment. Later cells read DATASET_PREFIX and EMBEDDING_PREFIX from
# os.environ -- presumably these are supplied through that .env file.
dotenv.load_dotenv()

In [None]:
# ---------------------------------------------------------------------------
# Experiment identity: used to build dataset, archive and output file names.
# ---------------------------------------------------------------------------
project_name: str = "gender_female"
encoder_id: str = "whisper_large_v2"

# ---------------------------------------------------------------------------
# Data preparation: the label column and the value treated as the positive
# class when training a CAV.
# ---------------------------------------------------------------------------
target_column: str = "gender"
target_positive_class: str = "female"

# ---------------------------------------------------------------------------
# Training meta parameters: how many independent CAVs are trained and how many
# samples each training run is given (forwarded to train_one_cav).
# ---------------------------------------------------------------------------
training_sample_count: int = 800
num_train_runs: int = 100

# ---------------------------------------------------------------------------
# Training parameters.
# ---------------------------------------------------------------------------
learning_rate: float = 0.008
epochs: int = 1000
# Deliberately tied to the sample count, so the two values always change together.
batch_size: int = training_sample_count
# NOTE(review): presumably the embedding width produced by `encoder_id` -- confirm.
embedding_dim: int = 1280
dropout_rate: float = 0.0


### Prepare Data


In [None]:
import os

import pandas as pd
import s3fs
from cavmir.training.dataset import (
    create_dataloader_from_webdataset_path,
    create_webdataset,
)
from cavmir.utils import (
    append_embeddings_to_df,
    create_training_samples_from_df,
    train_one_cav,
)

# Non-anonymous S3 filesystem handle (anon=False, so credentials are used);
# it is handed to append_embeddings_to_df in the cells below.
s3 = s3fs.S3FileSystem(anon=False)

# Both prefixes are mandatory: indexing os.environ raises KeyError right here
# if either variable is not set.
dataset_prefix = os.environ["DATASET_PREFIX"]
embedding_prefix = os.environ["EMBEDDING_PREFIX"]


In [None]:
# Training split: read this project's CSV and run it through
# append_embeddings_to_df (called with the embedding prefix, the encoder id
# and the S3 handle) in a single expression.
train_csv_path = os.path.join(dataset_prefix, f"train_dataset_{project_name}.csv")
df = pd.read_csv(train_csv_path).pipe(
    append_embeddings_to_df, embedding_prefix, encoder_id, s3
)

In [None]:
# Test split: same loading steps as the training split, kept in its own frame.
test_csv_path = os.path.join(dataset_prefix, f"test_dataset_{project_name}.csv")
df_test = pd.read_csv(test_csv_path).pipe(
    append_embeddings_to_df, embedding_prefix, encoder_id, s3
)

# The samples are written to a WebDataset archive and then read back through a
# dataloader; both calls must point at the same file, so the path is built once.
test_archive_path = f"datasets/{encoder_id}_test_{project_name}.tar"
test_samples = create_training_samples_from_df(df_test)
create_webdataset(test_samples, test_archive_path)

# This dataloader is passed unchanged to every training run further down.
test_dataloader = create_dataloader_from_webdataset_path(
    test_archive_path, batch_size=batch_size
)

### Training


In [None]:
# Keyword arguments that are identical for every run are gathered once;
# only `train_index` changes between iterations.
shared_training_kwargs = dict(
    df=df,
    project_name=project_name,
    encoder_id=encoder_id,
    target_column=target_column,
    target_positive_class=target_positive_class,
    num_train_runs=num_train_runs,
    training_sample_count=training_sample_count,
    epochs=epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    embedding_dim=embedding_dim,
    dropout_rate=dropout_rate,
    test_dataloader=test_dataloader,
)

# Results are appended run by run, so an interrupted loop still leaves the
# already-finished runs in these lists.
cav_vectors = []
evaluation_metrics = []

# train_index is 1-based: it runs from 1 to num_train_runs inclusive.
for train_index in range(1, num_train_runs + 1):
    cav_vector, evaluation_metric = train_one_cav(
        train_index=train_index,
        **shared_training_kwargs,
    )
    cav_vectors.append(cav_vector)
    evaluation_metrics.append(evaluation_metric)


In [None]:
import numpy as np

# Persist every trained CAV in a single .npy file (one entry per training run).
cav_output_dir = os.path.join("trainings", project_name)

# Create the target folder up front: np.save does not create missing parent
# directories, and failing here would throw away the whole training loop's work.
os.makedirs(cav_output_dir, exist_ok=True)

np.save(
    os.path.join(cav_output_dir, f"cav_{project_name}.npy"),
    np.array(cav_vectors),
)

In [None]:
import json

# Persist the per-run evaluation metrics as JSON under trainings/<project_name>/.
metrics_output_dir = os.path.join("trainings", project_name)

# open(..., "w") does not create missing parent directories, so make sure the
# folder exists before writing.
os.makedirs(metrics_output_dir, exist_ok=True)

metrics_path = os.path.join(
    metrics_output_dir,
    f"evaluation_metrics_{project_name}.json",
)

# The context manager closes (and therefore flushes) the file deterministically,
# even if json.dump raises part-way through; a bare open() left that to the
# garbage collector.
with open(metrics_path, "w") as metrics_file:
    json.dump(evaluation_metrics, metrics_file)