### Set up the environment

In [None]:
!chmod +x setup_env.sh
~./setup_env.sh

In [None]:
import sys
# Add /content/NeMo/ to the module search path so packages located there can be imported.
# NOTE(review): presumably this is where setup_env.sh places the NeMo sources — verify.
sys.path.append("/content/NeMo/")

### Import Statements

In [None]:
import datetime
import os

import nemo.collections.asr as nemo_asr
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from config import CONFIG
# NOTE(review): `ClassficiationHead` is spelled this way in the original import; confirm it matches model.py.
from model import ClassficiationHead, StressClassifier
from train_test import train, test
from utils import load_waveform, extract_prosodic_features, pad_tensor

### Prepare the dataset

Keep the `Denoise_train.rar` and `label.csv` files in the current working directory.

In [None]:
# If the dataset is compressed then only use this
!apt install rar unrar
!unrar x /content/Denoise_train.rar

In [17]:
!mkdir /content/input_ready/

Make the dataset compatible with the NeMo preprocessor (mono, 16 kHz audio) and keep the converted files in the input-ready folder (`InputReady_DatasetPath`, set in the next cell).

In [None]:
# Google Drive / Colab
# Folder holding the raw (denoised) recordings, taken from the project config.
Raw_DatasetPath = CONFIG["raw_audio_path"]
# Folder that receives the converted audio. It must be the directory created by the
# `mkdir` cell and scanned by the evaluation cell, i.e. /content/input_ready/.
InputReady_DatasetPath = "/content/input_ready/"

# Local
# Raw_DatasetPath = "./Dataset/Denoise_train/"
# InputReady_DatasetPath = "./Dataset/input_ready/"

In [None]:
files = os.listdir(Raw_DatasetPath)

for file in files[0:10]:
  !ffmpeg -i Raw_DatasetPath+file -ac 1 -ar 16000 InputReady_DatasetPath+file

### Load the dataset

In [None]:
# Build the training dataset and its loaders.
# NOTE(review): this cell reads `model`, which is created in the "Load the nemo model" cell —
# run that cell first (or move this cell after it) for a clean Restart & Run All.
# NOTE(review): `AudioDataset` is not imported anywhere in the visible notebook; import it from
# the project module that defines it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `device` has to be passed by keyword: a positional argument after keyword arguments is a
# SyntaxError. The keyword name `device` is assumed — TODO confirm against AudioDataset.__init__.
AudDataset = AudioDataset(audio_paths=InputReady_DatasetPath,
                          csv_path=CONFIG["train_csv_path"],
                          preprocessor=model.preprocessor,
                          device=device)
AudDataLoader = DataLoader(AudDataset, batch_size=32, shuffle=True)
# Loader consumed by the training cell; the batch size comes from the config because the
# `batch_size` variable is only assigned in a later cell.
dataloader = DataLoader(AudDataset, batch_size=CONFIG["batch_size"], shuffle=True)

### Load the nemo model

In [None]:
# Load the pretrained ASR model identified by the name below via NeMo's `from_pretrained`.
model = nemo_asr.models.ASRModel.from_pretrained("ai4bharat/indicconformer_stt_hi_hybrid_rnnt_large")
# Keep a direct handle on the model's encoder sub-module for the following cells.
encoder = model.encoder

Determine `encoder_output_shape` and `prosody_shape` by manually passing a sample `.wav` file through the encoder and openSMILE.

In [None]:
# Probe one sample file to learn the feature shapes the classifier head must be built for.
# NOTE(review): the path is relative ("./content/..."), unlike the absolute "/content/..." paths
# used elsewhere in this notebook — confirm it resolves from the working directory.
dummy_wav = "./content/dummy_file.wav"

# The preprocessor is an attribute of the ASR model (as used when building the dataset), not of
# the encoder sub-module.
f, f_len = load_waveform(dummy_wav, model.preprocessor, CONFIG["max_audio_sequence_length"], model.device)
# Read the sizes from `.shape`; `f[1]` would select data instead of a dimension size.
# Assumes `f` has at least three dimensions — TODO confirm against load_waveform.
# NOTE(review): if load_waveform returns preprocessor features rather than encoder output,
# pass `f` through the encoder before taking the shape.
encoder_output_shape = (f.shape[1], f.shape[2])

f_pros = extract_prosodic_features(dummy_wav, CONFIG["max_audio_sequence_length"])
# `f_pros(1)` tried to call the result; the dimension sizes live in `.shape`.
# Assumes `f_pros` has at least three dimensions — TODO confirm against extract_prosodic_features.
prosody_shape = (f_pros.shape[1], f_pros.shape[2])

Freeze the encoder parameters so that the encoder is not trained during fine-tuning.

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Freeze the encoder so it is not updated while the classifier head is trained.
# NOTE(review): relies on the encoder exposing NeMo's `freeze()` — verify for this model class.
encoder.freeze()
# Move the encoder to the selected device.
encoder = encoder.to(device)

### Load the custom model

In [None]:
# Build the classification head from the probed feature shapes.
# The import cell binds this class under the name `ClassficiationHead` (sic); using a different
# spelling here raises NameError.
# NOTE(review): if model.py actually spells it `ClassificationHead`, fix the import instead.
classifier_head = ClassficiationHead(encoder_output_shape=encoder_output_shape,
                                     prosody_shape=prosody_shape,
                                     max_output_seq_length=CONFIG["max_output_token_length"],
                                     word_level_feature_dim=128)

In [None]:
# Pull the training hyperparameters out of the project config in one step.
epochs, batch_size, learning_rate = (
    CONFIG[key] for key in ("epochs", "batch_size", "lr")
)

### Patch the encoder and CustomModel

In [None]:
# Combine the frozen encoder with the trainable classification head.
CustomModel = StressClassifier(encoder=encoder, classifier_head=classifier_head).to(device)
criterion = nn.BCEWithLogitsLoss()
# Optimise only the head's parameters. `model` is the NeMo ASR model and has no
# `classifier_head` attribute; the head object built above is the one wrapped by CustomModel.
optimizer = torch.optim.Adam(classifier_head.parameters(), lr=learning_rate)

### Train the model

In [None]:
# Train the combined model with the `train` helper imported from train_test.
# NOTE(review): the meaning and order of the arguments are defined by train() in train_test.py,
# which is not visible here — confirm they match this call.
train(CustomModel, dataloader, optimizer, criterion, device, epochs)

In [None]:
# Save the trained weights under a timestamped file name.
# `datetime.date` has no `now()`; the timestamp must come from `datetime.datetime`.
# Building it outside the f-string also avoids nesting the same quote character inside the
# f-string, which is a SyntaxError before Python 3.12.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Make sure the target folder exists before writing into it.
os.makedirs(CONFIG["model_save_path"], exist_ok=True)
save_as = CONFIG["model_save_path"] + f"/Pretrained_Saved({timestamp})"
torch.save(CustomModel.state_dict(), save_as)
# Report the path that was actually written (`save_path` was never defined).
print(f"Model saved to {save_as}")

In [None]:
# Rebuild the classifier and restore the saved weights.
# NOTE(review): this wraps the same `encoder` and `classifier_head` objects that CustomModel
# uses, so loading the state dict also overwrites the weights of the model trained above.
model_loaded = StressClassifier(encoder=encoder, classifier_head=classifier_head).to(device)
# `map_location` lets a checkpoint saved on a GPU be restored on a CPU-only runtime.
model_loaded.load_state_dict(torch.load(save_as, map_location=device))

### Test the model

In [None]:
# Testing on train data
labels_path = CONFIG["train_csv_path"]
# The config key and the setup notes refer to a CSV file; Excel is still supported in case a
# workbook is configured instead.
if str(labels_path).lower().endswith((".xls", ".xlsx")):
    df = pd.read_excel(labels_path)
else:
    df = pd.read_csv(labels_path)
df = df.drop("Transcript", axis=1)
merged_df = df.copy()
# Collapse every column after the first into one list per row, then drop empty (NaN) cells.
# Assumes the first remaining column is 'Audio Link' — TODO confirm against the label file.
merged_df['Label'] = df.iloc[:, 1:].values.tolist()
merged_df['Label'] = merged_df['Label'].apply(lambda x: [i for i in x if pd.notna(i)])

# Keep only 'Audio Link' and the new merged column
training_labels = merged_df[['Audio Link', 'Label']].to_dict(orient="records")

# List the converted files once instead of on every iteration.
available_files = set(os.listdir(InputReady_DatasetPath))
# The expected labels must be padded to the length the classifier head was built with,
# so the same config key is used here.
max_len = int(CONFIG["max_output_token_length"])

crct_pred = 0      # running count of correctly predicted positions
actual_pred = 0    # running count of evaluated positions
test_acc_on_training_data = []  # cumulative accuracy (%) after each evaluated file
for record in training_labels:
    file_name = "denoised_" + record["Audio Link"] + ".wav"
    if file_name not in available_files:
        continue
    audio_file_name = os.path.join(InputReady_DatasetPath, file_name)
    # The preprocessor is an attribute of the NeMo ASR model, not of its encoder.
    preds = test(audio_file_name, model_loaded, model.preprocessor, device)
    expected = pad_tensor(record["Label"], max_len - len(record["Label"]))
    # NOTE(review): assumes `preds` and `expected` are tensors of equal shape on the same
    # device — confirm against test() and pad_tensor().
    crct_pred += (preds == expected.bool()).sum().item()
    actual_pred += expected.numel()
    test_acc_on_training_data.append(100 * crct_pred / actual_pred)