In [1]:
import os
import sys
import torch
import collections
from itertools import product

import speechbrain as sb
from torch.cuda.amp import autocast
from hyperpyyaml import load_hyperpyyaml

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing at the first `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Environment flag for tokenizer parallelism.
# NOTE(review): presumably read by the HuggingFace `tokenizers` library — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# HyperPyYAML file describing the model, data and checkpointer used below.
HPARAM_FILE = 'hparams/convtasnet_llama2_lora/run_llama2_lora.yaml'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings

# Command-line style arguments handed to SpeechBrain's argument parser:
# the hparams file followed by the overrides for this run.
argv = [
    HPARAM_FILE,
    '--save_folder', 'save/convtasnet_llama2_lora',
    # Zero-shot
    '--case', '2Speech2FSD',
    '--n_test', '5',
]

hparam_file, run_opts, overrides = sb.parse_arguments(argv)

# Init model
with open(hparam_file) as f:
    hparams = load_hyperpyyaml(f, overrides)

# Init data
test_loader = torch.utils.data.DataLoader(
    hparams['test_set'],
    **hparams['test_loader_opts']
)

# Load model weights
loaded = hparams['checkpointer'].recover_if_possible()
print(loaded)
if loaded is None:
    # `recover_if_possible` gives back None when it found nothing to restore.
    # Make that visible instead of silently continuing with whatever weights
    # the modules were constructed with.
    warnings.warn(
        'No checkpoint was recovered by the checkpointer; the modules keep '
        'the weights they were initialised with. Check --save_folder.'
    )

# Put model on the selected device and switch to inference mode
for name, mod in hparams['modules'].items():
    mod.to(device)
    mod.eval()
    print(f'Load {name} to {device}.')

Initialized ShortTemplate: 
shuffle: True random: True
Fetched 5 manifest files.
Actions supported:  ['0', '1', 'D', 'U']  with volume_scale = 2
Tasks supported:  ['HE', 'HVC', 'OVC', 'RHVC', 'SE', 'SR', 'S↑', 'S↓', 'TAE', 'TAR', 'TA↑', 'TA↓', 'TSE', 'TSR', 'TS↑', 'TS↓']
Use GPT prompts with prob 1.0 and handcrafted prompts with prob 0.0.


Loading checkpoint shards: 100%|██████████| 2/2 [00:29<00:00, 14.58s/it]


None
Load lora_llm to cuda.


In [3]:
# Give the tokenizer a padding token.
# NOTE(review): presumably needed because `read_prompt` tokenizes with
# `padding=True` — confirm the tokenizer has no pad token by default.
hparams['tokenizer'].pad_token = '[PAD]'

# Optional mixed precision: replace the stored LLM with a copy of the
# reference cast to the configured dtype (bf16 per the original comment).
if hparams['llm_mix_prec']:
    llm_cast = hparams['llm'].to(hparams['mix_dtype'])
    hparams['llm'] = llm_cast

In [4]:
def read_prompt(llm, tokenizer, prompt, device='cpu'):
    """Embed a text prompt as the last-position hidden state of the LLM.

    Args:
        llm: Callable model, invoked as ``llm(ids, output_hidden_states=True)``;
            its result must expose a ``hidden_states`` sequence.
        tokenizer: Callable invoked as
            ``tokenizer(prompt, padding=True, return_tensors='pt')``; its
            result must be indexable by ``'input_ids'``.
        prompt: Text handed to the tokenizer unchanged.
        device: Device the token ids are moved to. The model itself is NOT
            moved here, so the caller must keep ``llm`` on the same device.

    Returns:
        The slice ``[:, -1, :]`` of the final entry of ``hidden_states``,
        i.e. one vector per prompt taken at the last sequence position.
    """
    # Tokenize and place the ids on the requested device
    encoded = tokenizer(prompt, padding=True, return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)

    # Encode and keep the hidden states of the last layer
    outputs = llm(input_ids, output_hidden_states=True)
    last_layer = outputs.hidden_states[-1]

    # Keep only the final sequence position (last or EOS token).
    # NOTE(review): for a batch of prompts with different lengths this is
    # the last real token only if the tokenizer pads on the left — confirm.
    return last_layer[:, -1, :]

In [None]:
import statistics
import time

import torch

# Fixed instruction whose embedding is computed (and timed) on every pass.
prompt = "Remove all people talking."

# Move the model to the CPU once, up front. `read_prompt` is called with its
# default device ('cpu'), so the model has to live there too. Doing the move
# outside the timed region keeps the one-off transfer cost out of the
# first measurement.
cpu_llm = hparams['lora_llm'].to('cpu')

read_prompt_times = []

with torch.no_grad():
    # The loader only determines how many repetitions are timed; the
    # batches themselves are not needed for the prompt embedding.
    for _ in test_loader:
        start_time = time.perf_counter()
        text_embed = read_prompt(cpu_llm, hparams['tokenizer'], prompt)
        duration = time.perf_counter() - start_time
        read_prompt_times.append(duration)

        # Check that the embedding shape is as expected
        assert text_embed.shape == (1, 4096), f"Unexpected shape: {text_embed.shape}"

if not read_prompt_times:
    # Fail with a clear message rather than a ZeroDivisionError below.
    raise RuntimeError("test_loader yielded no batches; nothing was timed.")

# Mean and population standard deviation of the per-call wall-clock times
average_time = statistics.fmean(read_prompt_times)
std_dev_time = statistics.pstdev(read_prompt_times, mu=average_time)

print(f"Average processing time for read_prompt: {average_time:.4f} seconds")
print(f"Standard deviation of processing time: {std_dev_time:.4f} seconds")

In [15]:
import pickle
import torch

# Destination file for the serialized prompt embedding
file_path = 'embedding.pkl'

# Serialize the most recent `text_embed` tensor with pickle
with open(file_path, mode='wb') as out_file:
    pickle.dump(text_embed, out_file)

print(f"Tensor saved to {file_path}")


Tensor saved to embedding.pkl
