### Set up the environment

In [1]:
# Optional one-time setup: uncomment the two lines below to make
# setup_env.sh executable and run it.
# !chmod +x setup_env.sh
# !./setup_env.sh

In [2]:
import sys
# Add "/NeMo/" to the module search path so packages located there can be imported.
# NOTE(review): this is a hard-coded absolute path — presumably where the NeMo
# sources are mounted; confirm before running in a different environment.
sys.path.append("/NeMo/")

### Import Statements

In [3]:
import os
import datetime
import nemo.collections.asr as nemo_asr
import torch
import torch.nn as nn
from utils import load_waveform, extract_prosodic_feature, pad_tensor, custom_audio_collate_fn
from Dataset import AudioDataset
from config import CONFIG
from torch.utils.data import DataLoader
from model import ClassificationHead, StressClassifier
from train_test import train, test
import warnings
import logging

# Suppress UserWarning/FutureWarning noise and limit the NeMo loggers to errors.
for _category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_category)

for _logger_name in ('nemo_logger', 'nemo'):
    logging.getLogger(_logger_name).setLevel(logging.ERROR)


      def forward(ctx, input, weight, bias, gradient_accumulation_fusion,
    
      def backward(ctx, grad_output):
    


### Make the dataset Ready

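Place `Denoise_train.rar` in the `./Dataset/` folder (the extraction cell below reads it from `./Dataset/Denoise_train.rar`) and keep the `label.csv` file in the current path.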

<div class="alert alert-info">
    <b>Warning:</b> Run this step only the first time you set up the dataset.
</div>

In [None]:
# Extract the training archive into ./Dataset/ using the `unrar` command-line tool.
# If `unrar` is not installed, install it first, e.g.:
# sudo apt install unrar
!unrar x ./Dataset/Denoise_train.rar ./Dataset/

In [None]:
!mkdir ./Dataset/input_ready

Make the dataset compatible with the NeMo preprocessor and store the converted files in the folder `./Dataset/input_ready/`.

In [None]:
# Folder containing the raw audio files, as configured in CONFIG.
Raw_DatasetPath = CONFIG["raw_audio_path"]
# Folder that receives the converted audio files used by the rest of the notebook.
InputReady_DatasetPath = "./Dataset/input_ready/"

<div class="alert alert-info">
    <b>Warning:</b> Run this step only the first time you set up the dataset.
</div>

In [None]:
files = os.listdir(Raw_DatasetPath)

for file in files:
    input_path = os.path.join(Raw_DatasetPath, file)
    output_path = os.path.join(InputReady_DatasetPath, file)
    !ffmpeg -i "{input_path}" -ac 1 -ar 16000 "{output_path}"

### Load the nemo model

In [None]:
# Load the pretrained ASR model by name; its preprocessor and encoder are
# reused by the rest of the notebook.
model = nemo_asr.models.ASRModel.from_pretrained(
    "ai4bharat/indicconformer_stt_hi_hybrid_rnnt_large"
)
# Alternative: restore the model from a local .nemo checkpoint instead.
# model = nemo_asr.models.ASRModel.restore_from("./trained_model/nemo_conformer.nemo")
preprocessor, encoder = model.preprocessor, model.encoder

In [9]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

### Load the dataset

In [10]:
# Build the training dataset from the converted audio files and the label file.
# The audio length limit is read from CONFIG (it used to be a hard-coded 1325)
# so that it always matches the value used when the encoder output shape is
# derived with CONFIG["max_audio_sequence_length"] for the classification head.
AudDataset = AudioDataset(
    audio_paths=InputReady_DatasetPath,
    csv_path=CONFIG["train_csv_path"],
    preprocessor=preprocessor,
    device=device,
    max_audio_sequence_length=CONFIG["max_audio_sequence_length"],
    max_token_seq_length=CONFIG["max_output_token_length"],
)

# Shuffled mini-batches, assembled by the project's custom collate function.
AudDataLoader = DataLoader(
    AudDataset,
    batch_size=CONFIG["batch_size"],
    collate_fn=custom_audio_collate_fn,
    shuffle=True,
)

Determine `encoder_output_shape` and the prosody feature shape by passing one sample `.wav` file through the encoder and through opensmile.

In [11]:
# Pass one sample file through the encoder and the prosodic feature extractor
# to discover the shapes the classification head has to be built for.
sample_wav_path = "Dataset/input_ready/denoised_ISLE_SESS0162_BLOCKE_32_sprt1.wav"

f, f_len = load_waveform(
    sample_wav_path,
    preprocessor=preprocessor,
    max_audio_sequence_length=CONFIG["max_audio_sequence_length"],
    device=model.device,
)

# Only the output shape is needed, so skip building an autograd graph.
with torch.no_grad():
    encoder_output = encoder(audio_signal=f.unsqueeze(0), length=f_len)

# Dimensions 2 and 1 (in that order) of the first encoder output.
encoder_output_shape = (encoder_output[0].shape[2], encoder_output[0].shape[1])
print("Encoder output shape : ", encoder_output_shape)

f_pros = extract_prosodic_feature(sample_wav_path, 256)
prosody_shape = (f_pros.shape[0], f_pros.shape[1])
print("Prosody features shape: ", prosody_shape)

Encoder output shape :  (332, 512)
Prosody features shape:  (256, 7)


Freeze the encoder parameters so that the encoder is not trained during fine-tuning.

In [12]:
# Freeze the encoder so its parameters are not updated during fine-tuning.
encoder.freeze()
# Move the encoder to the selected device.
encoder = encoder.to(device)

### Load the custom model

In [13]:
# Build the classification head from the probed feature shapes and the
# configured maximum output length.
classifier_head_kwargs = dict(
    encoder_output_shape=encoder_output_shape,
    prosody_shape=prosody_shape,
    max_output_seq_length=CONFIG["max_output_token_length"],
    word_level_feature_dim=128,
)
classifier_head = ClassificationHead(**classifier_head_kwargs)

In [14]:
# Training hyperparameters, read from the shared CONFIG mapping.
epochs, batch_size, learning_rate = (
    CONFIG[key] for key in ("epochs", "batch_size", "lr")
)

### Combine the encoder and the classification head into the custom model

In [15]:
# Wrap the encoder and the classification head in one model on the target device.
CustomModel = StressClassifier(encoder=encoder, classifier_head=classifier_head)
CustomModel = CustomModel.to(device)

# Binary cross-entropy on raw logits.
criterion = nn.BCEWithLogitsLoss()

# Only the classification head's parameters are handed to the optimizer.
trainable_params = CustomModel.classifier_head.parameters()
optimizer = torch.optim.Adam(trainable_params, lr=learning_rate)

### Train the model

In [None]:
# Run the project's training routine for the configured number of epochs.
# The two returned values are bound as the fine-tuned model and its recorded
# losses (see train_test.train for the exact contents).
FinetunedModel, Losses_per_epoch = train(CustomModel, AudDataLoader, optimizer, criterion, device, epochs)

0.7557280659675598
1.5090954899787903
2.258742094039917
3.005930185317993
3.750882089138031
4.491439402103424
5.230790972709656
5.96363490819931
6.698762536048889
7.431289732456207
8.160521388053894
8.88975441455841
9.613057732582092
10.332906544208527
11.052258610725403
11.76850163936615
12.478931784629822
13.193441987037659
13.90116173028946
14.605876803398132
15.303130388259888
16.001166999340057
16.692153751850128
17.37298357486725
18.051457703113556
18.728675663471222
19.404915750026703
20.069322645664215
20.73434489965439
21.388903975486755
22.029774487018585
22.661827504634857
23.302262663841248
23.9454665184021
24.57511615753174
25.189745783805847
25.771448969841003
26.334835827350616
26.902216136455536
27.45138990879059
28.00093936920166
28.546236395835876
29.04363378882408
29.567468732595444
30.049302369356155
30.511942714452744
30.990413784980774
31.450751155614853
31.868471831083298
32.313245952129364
32.71542376279831
33.13604611158371
33.46513622999191
33.786157846450806


In [None]:
# Save the model weights under a timestamped name.
# datetime.date has no time-of-day component, so formatting it with %H:%M:%S
# always produced 00:00:00 and every run on the same day overwrote the previous
# checkpoint. datetime.datetime.now() gives a real timestamp; the format avoids
# spaces and colons, which are awkward or invalid in file names on some systems.
os.makedirs(CONFIG["model_save_path"], exist_ok=True)
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
save_as = os.path.join(CONFIG["model_save_path"], f"Pretrained_Saved({timestamp})")
torch.save(CustomModel.state_dict(), save_as)
print(f"Model saved to {save_as}")

In [None]:
# Rebuild the model wrapper and load the saved weights into it.
# NOTE(review): this reuses the same `encoder` and `classifier_head` objects that
# were passed to CustomModel, so both wrappers presumably share their sub-modules —
# confirm whether a freshly constructed head is intended here.
model_loaded = StressClassifier(encoder=encoder, classifier_head=classifier_head).to(device)
# map_location lets the checkpoint load even when it was saved on a different
# device (for example saved on GPU, loaded on a CPU-only machine).
model_loaded.load_state_dict(torch.load(save_as, map_location=device))

### Test the model

In [None]:
import pandas as pd

# Testing on train data
# NOTE(review): the config key is called "train_csv_path" but the file is read
# with read_excel — confirm the actual file format.
df = pd.read_excel(CONFIG["train_csv_path"])
df = df.drop("Transcript", axis=1)
merged_df = df.copy()
# Collect every column after the first into one list per row, dropping empty cells.
merged_df['Label'] = df.iloc[:, 1:].values.tolist()
merged_df['Label'] = merged_df['Label'].apply(lambda x: [i for i in x if pd.notna(i)])

# Keep only 'Audio Link' and the new merged column
training_labels = merged_df[['Audio Link', 'Label']].to_dict(orient="records")
test_acc_on_training_data = []

audio_paths = "./Dataset/input_ready"
# List the folder once instead of on every loop iteration.
available_files = set(os.listdir(audio_paths))
# Same key the dataset and the classification head are configured with, so the
# padded target has the same length as the model output.
max_output_length = int(CONFIG["max_output_token_length"])

# Running totals; these were never initialised before, which raised NameError.
crct_pred = 0
actual_pred = 0

for record in training_labels:
    file_name = "denoised_" + record["Audio Link"] + ".wav"
    if file_name not in available_files:
        continue

    audio_file_name = audio_paths + "/" + file_name
    labels = record["Label"]
    preds = test(audio_file_name, model_loaded, model.preprocessor, device)
    expected = pad_tensor(labels, max_output_length - len(labels))

    crct_pred += (preds == expected.bool()).sum().item()
    # The comparison above spans every position of the padded target, so the
    # denominator counts those same positions. (`labels` is a plain list and
    # has no .numel(), which previously raised AttributeError.)
    # NOTE(review): assumes `preds` has as many elements as `expected` — confirm.
    actual_pred += expected.numel()
    # Cumulative accuracy (in percent) after each evaluated file.
    test_acc_on_training_data.append(100 * crct_pred / actual_pred)

In [None]:
from utils import extract_prosodic_feature, load_waveform

# Sanity check: print the preprocessed-audio shape and the prosodic-feature
# shape for every prepared file.
# The audio length limit comes from CONFIG (it used to be a hard-coded 1352,
# which matched neither the dataset's value nor the configured one).
for file in os.listdir("./Dataset/input_ready"):
    file_path = "./Dataset/input_ready/" + file
    waveform_shape = load_waveform(file_path, preprocessor, CONFIG["max_audio_sequence_length"], device)[0].shape
    pros = extract_prosodic_feature(file_path, 256)
    print(waveform_shape, pros.shape)