In [None]:
# Colab-only setup: mount Google Drive at /content/drive so files stored there
# are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the project's code folder on the mounted Drive
# (presumably so the local modules imported further down resolve -- confirm).
%cd /content/drive/MyDrive/SPEECH/Code

In [None]:
!pip install wandb
!pip install jiwer

This is the main part of the notebook. It builds and trains the models phase by phase.

In [1]:
from dataclasses import dataclass

import PreProcessing, PhaseOneModel, PhaseTwoModel, PhaseThreeModel, Evaluating, PhaseFourModel
import torch
import wandb



Building tokens list:  ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ', '?']
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ', '?']


In [2]:
wandb.login()

# Sweep configuration: random search that minimizes the logged 'WER' metric.
# Entries with 'value' are held constant; 'values' is a categorical choice;
# 'min'/'max' define a sampled range.
sweep_config = {
    'method': 'random',
    'metric': {'name': 'WER', 'goal': 'minimize'},
    'name': 'PhaseFourModel',
    'parameters': {
        'wandb_init': {'value': True},
        'epochs': {'value': 400},
        # Float bounds -> sampled uniformly over [min, max].
        # NOTE(review): a uniform draw over [1e-5, 1e-2] rarely picks small
        # rates; consider 'distribution': 'log_uniform_values' if that matters.
        'learning_rate': {'max': 0.01, 'min': 0.00001},
        'batch_size': {'value': 64},
        'n_cnn_layers': {'value': 1},  # Fixed value
        'n_rnn_layers': {'value': 1},  # Fixed value
        'rnn_dim': {'value': 64},
        'n_class': {'value': PreProcessing.NUM_CLASSES + 1},
        'n_feats': {'value': 128},
        'stride': {'value': 2},
        'dropout': {'values': [0, 0.1, 0.3]},
        # Bounds must be floats: with integer min/max W&B defaults to an
        # integer-uniform distribution and would only ever try 0, 1, 2 or 3.
        'lm_weight': {'min': 0.0, 'max': 3.0},
    },
}

[34m[1mwandb[0m: Currently logged in as: [33mronko[0m ([33mrons-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
class Config:
    """Run configuration taken either from a W&B config or from local defaults.

    The settings are stored as instance attributes so callers can read them
    with attribute access (e.g. ``config.batch_size``).

    Not a dataclass: with a hand-written ``__init__`` and no declared fields,
    ``@dataclass`` only generated an ``__eq__``/``__repr__`` that ignored every
    attribute.

    Args:
        wandb_config: Falsy (the default) selects the built-in defaults.
            A truthy object exposing ``items()`` (a ``dict`` or a W&B config)
            has each of its entries copied onto the instance. Any other truthy
            value (e.g. ``True``) means "copy from the active ``wandb.config``".

    Attributes set when the defaults are used:
        learning_rate, epochs, batch_size, wandb_init: scalar settings.
        hyperparams: dict of model hyper-parameters (``n_cnn_layers``,
            ``n_rnn_layers``, ``rnn_dim``, ``n_class``, ``n_feats``,
            ``stride``, ``dropout``).
    """

    def __init__(self, wandb_config=None):
        if wandb_config:
            # Rebinding `self` would only change a local name, so the values
            # are copied onto the instance one by one instead.
            source = wandb_config if hasattr(wandb_config, "items") else wandb.config
            for key, value in source.items():
                setattr(self, key, value)
        else:
            self.learning_rate = 0.01
            self.epochs = 300
            self.batch_size = 32
            self.wandb_init = False

            self.hyperparams = {
                "n_cnn_layers": 1,
                "n_rnn_layers": 1,
                "rnn_dim": 128,
                "n_class": PreProcessing.NUM_CLASSES+1,
                "n_feats": 128,
                "stride": 2,
                "dropout": 0.5,
            }

In [4]:
def _make_dataloader(mode, batch_size, shuffle):
    """Load the ``mode`` split ('train' or 'test') and wrap it in a DataLoader."""
    wavs, txts = PreProcessing.load_data(mode=mode, data_path=PreProcessing.DATA_PATH)
    dataset = PreProcessing.AudioDatasetV2(wavs, txts)
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def create_model(PhaseNumber, config=None):
    """Create the model together with its train/test data loaders.

    Args:
        PhaseNumber: Phase label such as ``"PhaseFour"``.
            NOTE(review): the label is currently not used to pick the
            architecture -- a ``PhaseTwoModel`` is always built. Confirm
            whether callers asking for another phase expect a different model.
        config: Object exposing ``batch_size`` (for example ``wandb.config``).
            Required despite the ``None`` default.

    Returns:
        Tuple ``(model, train_dataloader, test_dataloader, device)``.

    Raises:
        ValueError: If ``config`` is ``None``.
    """
    if config is None:
        # Fail before any data is loaded instead of with an AttributeError later.
        raise ValueError("create_model requires a config exposing `batch_size`")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using {device} device")

    # NOTE: earlier experiments (AudioDatasetV3 with PreProcessing.process_data as
    # collate_fn, PhaseThreeModel / PhaseFourModel construction) used to live here
    # as commented-out code; recover them from version control if needed.
    train_dataloader = _make_dataloader('train', config.batch_size, shuffle=True)
    test_dataloader = _make_dataloader('test', config.batch_size, shuffle=False)
    model = PhaseTwoModel.PhaseTwoModel()

    return model, train_dataloader, test_dataloader, device

In [5]:
# model, train_dataloader, test_dataloader, device = create_model("PhaseTwo",
#                                                                     config=Config(wandb_init=True))

In [6]:
# criterion = torch.nn.CTCLoss(blank=PreProcessing.BLANK_IDX).to(device)
# PhaseThreeModel.train_model_phase_three(model, train_dataloader, criterion, device, test_dataloader, config=Config(
#     wandb_init=True))

In [7]:
def main_train(config=None):
    """Run one tracked training job and save the resulting weights.

    Opens a W&B run (seeded with ``config``), builds the model and data loaders
    from the run's resolved config, trains with a CTC loss via
    ``PhaseThreeModel.train_model_phase_three`` and writes the state dict to
    ``PhaseFourModel_<run name>.pt``.

    NOTE(review): the "PhaseFour" label and file prefix are used here, but
    ``create_model`` does not select the architecture from that label --
    confirm which model is actually meant to be trained.

    Args:
        config: Optional initial configuration forwarded to ``wandb.init``.
    """
    with wandb.init(config=config):
        # Inside the run, wandb.config is the resolved configuration.
        run_config = wandb.config
        print(run_config)

        model, train_dataloader, test_dataloader, device = create_model(
            "PhaseFour", config=run_config
        )
        ctc_loss = torch.nn.CTCLoss(blank=PreProcessing.BLANK_IDX).to(device)

        PhaseThreeModel.train_model_phase_three(
            model,
            train_dataloader,
            ctc_loss,
            device,
            test_dataloader,
            config=run_config,
        )

        checkpoint_path = f"PhaseFourModel_{wandb.run.name}.pt"
        torch.save(model.state_dict(), checkpoint_path)

In [None]:
def train_sweep():
    """Entry point invoked by the W&B sweep agent for each trial.

    ``main_train`` opens and closes the W&B run itself, so no run is started
    here; wrapping the call in another ``wandb.init()`` would nest two inits
    for a single trial.
    """
    main_train()

# Running the sweep
# Register the sweep described by `sweep_config` in the 'speechRecProj' project;
# the returned id is what the agent uses to fetch trial configurations.
sweep_id = wandb.sweep(sweep_config, project='speechRecProj')  # Create the sweep
# Start an agent in this process that calls `train_sweep` once per trial.
# NOTE(review): no `count` is passed -- confirm how the agent is expected to
# stop, since the sweep method is 'random'.
wandb.agent(sweep_id, train_sweep)


Create sweep with ID: 0a6qigdr
Sweep URL: https://wandb.ai/rons-team/speechRecProj/sweeps/0a6qigdr


[34m[1mwandb[0m: Agent Starting Run: znzxjdo1 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	epochs: 400
[34m[1mwandb[0m: 	learning_rate: 0.008844704238529656
[34m[1mwandb[0m: 	lm_weight: 0.1
[34m[1mwandb[0m: 	n_class: 28
[34m[1mwandb[0m: 	n_cnn_layers: 1
[34m[1mwandb[0m: 	n_feats: 128
[34m[1mwandb[0m: 	n_rnn_layers: 1
[34m[1mwandb[0m: 	rnn_dim: 64
[34m[1mwandb[0m: 	stride: 2
[34m[1mwandb[0m: 	wandb_init: True
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.186922…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

{'batch_size': 64, 'dropout': 0.1, 'epochs': 400, 'learning_rate': 0.008844704238529656, 'lm_weight': 0.1, 'n_class': 28, 'n_cnn_layers': 1, 'n_feats': 128, 'n_rnn_layers': 1, 'rnn_dim': 64, 'stride': 2, 'wandb_init': True}
Using cpu device


  input_sequence = torch.tensor(sequence)
  0%|          | 0/400 [00:00<?, ?it/s]

First Input:  torch.Size([1, 128, 1051]) First Label:  tensor([13, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]) First Label Length:  2
Input txt: noaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
 
 
model preds:  torch.Size([1, 131, 28])
Model Output:  vvvvvveaajevvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
Epoch:  0 / 400  ( 0.0 %)
