### Set up the environment

In [None]:
# Make the environment setup script executable, then run it.
!chmod +x setup_env.sh
!./setup_env.sh

In [1]:
import sys

# Make packages located under /NeMo/ importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if "/NeMo/" not in sys.path:
    sys.path.append("/NeMo/")

### Import Statements

In [2]:
import datetime
import logging
import os
import warnings

import nemo.collections.asr as nemo_asr
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from config import CONFIG
from Dataset import AudioDataset
from model import ClassificationHead, StressClassifier
from train_test import train, test
from utils import load_waveform, extract_prosodic_feature, pad_tensor, custom_audio_collate_fn

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
logging.getLogger('nemo_logger').setLevel(logging.ERROR)
logging.getLogger('nemo').setLevel(logging.ERROR)


      def forward(ctx, input, weight, bias, gradient_accumulation_fusion,
    
      def backward(ctx, grad_output):
    


### Make the dataset Ready

Keep the `Denoise_train.rar` and `label.csv` files in the current path.

<div class="alert alert-info">
    <b>Warning:</b> Run the following cells only the first time you set up the dataset.
</div>

In [3]:
# !unrar x ./Dataset/Denoise_train.rar ./Dataset/

In [4]:
# !mkdir ./Dataset/input_ready

Make the dataset compatible with the NeMo preprocessor (mono, 16 kHz audio) and keep the converted files in the folder named `./Dataset/input_ready/`.

In [5]:
# Raw audio directory, read from the project config (original note: Google Drive).
Raw_DatasetPath = CONFIG["raw_audio_path"]
# Directory holding the converted audio files that the rest of the notebook reads.
InputReady_DatasetPath = "./Dataset/input_ready/"

<div class="alert alert-info">
    <b>Warning:</b> Run the following cell only the first time you set up the dataset.
</div>

In [6]:
# files = os.listdir(Raw_DatasetPath)

# for file in files[:10]:
#     input_path = os.path.join(Raw_DatasetPath, file)
#     output_path = os.path.join(InputReady_DatasetPath, file)
#     !ffmpeg -i "{input_path}" -ac 1 -ar 16000 "{output_path}"

### Load the nemo model

In [7]:
# Restore the ASR model from the local .nemo checkpoint.
# Alternative kept for reference: fetch the pretrained checkpoint by name.
# model = nemo_asr.models.ASRModel.from_pretrained("ai4bharat/indicconformer_stt_hi_hybrid_rnnt_large")
model = nemo_asr.models.ASRModel.restore_from("./trained_model/nemo_conformer.nemo")

# Keep direct handles to the two sub-modules used by the following cells.
encoder, preprocessor = model.encoder, model.preprocessor

In [8]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Load the dataset

In [9]:
# Dataset over the converted audio files, labelled from the training CSV.
AudDataset = AudioDataset(
    audio_paths=InputReady_DatasetPath,
    csv_path=CONFIG["train_csv_path"],
    preprocessor=preprocessor,
    device=device,
)

# Shuffled mini-batches, assembled by the project's custom collate function.
AudDataLoader = DataLoader(
    AudDataset,
    batch_size=CONFIG["batch_size"],
    collate_fn=custom_audio_collate_fn,
    shuffle=True,
)

ValueError: not enough values to unpack (expected 3, got 2)

Determine `encoder_output_shape` and `prosody_shape` by manually passing a sample `.wav` file to the encoder and openSMILE.

In [None]:
# Push one sample file through both feature extractors and record the first
# two dimensions of each result; the classification head is built from them.
sample_wav = "./Dataset/input_ready/denoised_ISLE_SESS0011_BLOCKD01_46_sprt1.wav"

f, f_len = load_waveform(
    sample_wav,
    preprocessor=preprocessor,
    max_audio_sequence_length=CONFIG["max_audio_sequence_length"],
    device=model.device,
)
encoder_output_shape = (f.shape[0], f.shape[1])
print("Encoder output shape : ", encoder_output_shape)

# 256 is the second argument expected by extract_prosodic_feature
# (presumably the target sequence length — TODO confirm against utils).
f_pros = extract_prosodic_feature(sample_wav, 256)
prosody_shape = (f_pros.shape[0], f_pros.shape[1])
print("Prosody features shape: ", prosody_shape)

Encoder output shape :  (1325, 80)
Prosody features shape:  (256, 7)


Freeze the encoder parameters so that the encoder is not trained during fine-tuning.

In [None]:
# Freeze the encoder; the optimiser created later only receives the
# classification head's parameters.
encoder.freeze()
# Move the encoder to the selected device.
encoder = encoder.to(device)

### Load the custom model

In [None]:
# Build the classification head from the shapes measured on the sample file.
# The output sequence length comes from the project config.
# NOTE(review): word_level_feature_dim=128 is a hard-coded magic number —
# consider moving it into CONFIG.
classifier_head = ClassificationHead(encoder_output_shape=encoder_output_shape,
                                     prosody_shape=prosody_shape,
                                     max_output_seq_length=CONFIG["max_output_token_length"],
                                     word_level_feature_dim=128)

In [None]:
# Training hyper-parameters, all read from the project config.
epochs, batch_size, learning_rate = (
    CONFIG["epochs"],
    CONFIG["batch_size"],
    CONFIG["lr"],
)

### Patch the encoder and CustomModel

In [None]:
# Loss: binary cross-entropy computed on raw logits.
criterion = nn.BCEWithLogitsLoss()

# Combine the encoder and the classification head, then move the result to the
# selected device.
CustomModel = StressClassifier(encoder=encoder, classifier_head=classifier_head)
CustomModel = CustomModel.to(device)

# Only the classification head's parameters are handed to the optimiser.
optimizer = torch.optim.Adam(
    CustomModel.classifier_head.parameters(),
    lr=learning_rate,
)

### Train the model

In [None]:
# Run the project's training routine (train_test.train) on the training loader.
train(CustomModel, AudDataLoader, optimizer, criterion, device, epochs)

[torch.Size([444, 888]), torch.Size([252, 1080]), torch.Size([731, 601]), torch.Size([316, 1016]), torch.Size([380, 952]), torch.Size([316, 1016]), torch.Size([303, 1029]), torch.Size([514, 818])]


RuntimeError: stack expects each tensor to be equal size, but got [444, 888] at entry 0 and [252, 1080] at entry 1

In [None]:
# Save the trained weights under a timestamped name.
# Fixes over the previous version:
#   * `datetime.date` has no `now()`; the timestamp must come from
#     `datetime.datetime.now()`.
#   * the timestamp is formatted outside the f-string, because reusing double
#     quotes inside a double-quoted f-string is a SyntaxError before Python 3.12.
#   * the target directory is created if missing so torch.save cannot fail on it.
# NOTE(review): the timestamp contains spaces and colons, which some
# filesystems reject — consider a format such as "%Y-%m-%d_%H-%M-%S".
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
os.makedirs(CONFIG["model_save_path"], exist_ok=True)
save_as = CONFIG["model_save_path"] + f"/Pretrained_Saved({timestamp})"
torch.save(CustomModel.state_dict(), save_as)
print(f"Model saved to {save_as}")

In [None]:
# Rebuild the wrapper module and load the saved weights back into it.
# NOTE(review): `encoder` and `classifier_head` are the same objects already
# wrapped by CustomModel, so this is not an independent copy of the model.
model_loaded = StressClassifier(encoder=encoder, classifier_head=classifier_head).to(device)
# map_location lets a checkpoint written on a GPU be restored on a CPU-only
# runtime (and vice versa) instead of failing inside torch.load.
model_loaded.load_state_dict(torch.load(save_as, map_location=device))

### Test the model

In [None]:
# Testing on train data
df = pd.read_excel(CONFIG["train_csv_path"])
df = df.drop("Transcript", axis=1)
merged_df = df.copy()
merged_df['Label'] = df.iloc[:, 1:].values.tolist()
merged_df['Label'] = merged_df['Label'].apply(lambda x: [i for i in x if pd.notna(i)])
# Keep only 'Audio Link' and the new merged column

training_labels = merged_df[['Audio Link', 'Label']].to_dict(orient="records")
test_acc_on_training_data = []
for i in res:
    if "denoised_"+i["Audio Link"]+".wav" in os.listdir("/content/input_ready/"):
        audio_file_name = audio_paths + "/denoised_" + i["Audio Link"] + ".wav" 
        preds = test(audio_file_name, model_loaded, encoder.preprocessor, device)
        expected = pad_tensor(i["Label"], int(CONFIG["max_output_length"]) - len(i["Label"]))
        crct_pred += (preds == expected.bool()).sum().item()
        actual_pred += labels.numel()
        test_acc_on_training_data.append(100 * crct_preds/actual_pred)

In [20]:
from utils import extract_prosodic_feature, load_waveform

# Print the feature shapes obtained for every prepared audio file, one line per
# file, to confirm they are uniform across the dataset.
for file in os.listdir("./Dataset/input_ready"):
    wav_path = "./Dataset/input_ready/" + file
    waveform_result = load_waveform(wav_path, preprocessor, 1352, device)
    pf = waveform_result[0].shape
    pros = extract_prosodic_feature(wav_path, 256)
    print(pf, pros.shape)

torch.Size([1352, 80]) torch.Size([256, 7])
torch.Size([1352, 80]) torch.Size([256, 7])
torch.Size([1352, 80]) torch.Size([256, 7])
torch.Size([1352, 80]) torch.Size([256, 7])
torch.Size([1352, 80]) torch.Size([256, 7])
torch.Size([1352, 80]) torch.Size([256, 7])
torch.Size([1352, 80]) torch.Size([256, 7])
torch.Size([1352, 80]) torch.Size([256, 7])
torch.Size([1352, 80]) torch.Size([256, 7])
torch.Size([1352, 80]) torch.Size([256, 7])
