In [None]:
import os
from pathlib import Path

import torch
import nemo.collections.asr as nemo_asr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset, Audio
from openvoice.api import ToneColorConverter
from melo.api import TTS

# Root directory under which this notebook looks for local model checkpoints
# and stores the cached demo audio clip.
# Defaults to the original location; set the LLM_BASE_PATH environment variable
# to point the notebook at another directory without editing the code.
# An empty value is treated as "unset" so it cannot silently resolve to ".".
BASE_PATH = Path(os.environ.get("LLM_BASE_PATH") or "/mnt/sdg/tzhu/llm")

In [None]:

# Fetch a tiny demo clip once and cache it under BASE_PATH. Later runs reuse
# the cached file, so the dataset is only touched when the file is missing.
wav_path = BASE_PATH / "sample.flac"

if not wav_path.exists():
    print("Downloading tiny demo clip (no decoding)…")
    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    ds = ds.cast_column("audio", Audio(decode=False))  # <-- critical: returns file path/bytes, no torchcodec
    rec = ds[0]["audio"]  # {'bytes': b'...', 'path': '1272-128104-0000.flac'}

    # Resolve the payload BEFORE touching the destination. The record may carry
    # only a path, in which case read the bytes from that file instead.
    audio_bytes = rec.get("bytes")
    if audio_bytes is None and rec.get("path") and Path(rec["path"]).is_file():
        audio_bytes = Path(rec["path"]).read_bytes()
    if not audio_bytes:
        raise RuntimeError(
            f"Demo clip has no audio payload (path={rec.get('path')!r}); nothing written to {wav_path}"
        )

    # Write to a temporary sibling and rename into place, so a failed or
    # interrupted write never leaves an empty/partial file that would satisfy
    # the exists() guard above on the next run.
    wav_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = wav_path.with_name(wav_path.name + ".part")
    tmp_path.write_bytes(audio_bytes)
    tmp_path.replace(wav_path)
    print(f"✅ Saved sample audio to {wav_path}")

# ---------- run NeMo ASR ----------
# Load the pretrained multitask ASR model and transcribe the cached demo clip.
print("📥 Loading nvidia/canary-1b-flash…")
asr = nemo_asr.models.EncDecMultiTaskModel.from_pretrained("nvidia/canary-1b-flash")

print("🗣️ Transcribing…")
out = asr.transcribe([str(wav_path)], batch_size=1)

# Fail with a clear message when nothing came back; the previous one-liner
# guarded against an empty result but still indexed out[0] in its fallback
# branch, which surfaced as a bare IndexError.
if not out:
    raise RuntimeError(f"ASR returned no result for {wav_path}")

# The first result is either an object exposing `.text` or the transcript
# itself; take `.text` when present, otherwise use the item as-is.
first_result = out[0]
transcript = getattr(first_result, "text", first_result)
print("✏️ Transcript:", transcript)

In [None]:
# -------- LLaMA 4 Scout 17B-16E --------
# Load the tokenizer and model from the local copy under BASE_PATH
# (local_files_only=True on both calls), then run one short generation as a
# smoke test.
llama_model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
llama_local_path = f"{BASE_PATH}/{llama_model_id}"

llama_tokenizer = AutoTokenizer.from_pretrained(llama_local_path, local_files_only=True)

# Model-loading options collected in one place so they are easy to review.
# NOTE(review): float16 and trust_remote_code=True are kept exactly as before;
# confirm both are actually wanted for this checkpoint.
llama_load_kwargs = dict(
    device_map='auto',
    trust_remote_code=True,
    torch_dtype=torch.float16,
    local_files_only=True,
)
llama_model = AutoModelForCausalLM.from_pretrained(llama_local_path, **llama_load_kwargs)

# Test LLaMA
llama_pipe = pipeline("text-generation", model=llama_model, tokenizer=llama_tokenizer)
llama_prompt = "Summarise this candidate's strengths based on the interview notes: "
print("LLaMA test:")
llama_outputs = llama_pipe(llama_prompt, max_new_tokens=50)
print(llama_outputs[0]["generated_text"])

In [None]:
# NLTK resource downloads, left disabled.
# NOTE(review): presumably a one-time setup step for the TTS text front-end —
# confirm before removing.
# import nltk
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('punkt')

# ---- Settings: converter checkpoint location, compute device, output folder ----
ckpt_converter = f"{BASE_PATH}/OpenVoice/checkpoints_v2/converter"
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
output_dir = 'outputs_v2'

# ---- Tone colour converter: build from its config, then load the weights ----
converter_config_path = f'{ckpt_converter}/config.json'
converter_ckpt_path = f'{ckpt_converter}/checkpoint.pth'
tone_color_converter = ToneColorConverter(converter_config_path, device=device)
tone_color_converter.load_ckpt(converter_ckpt_path)

# ---- Output folder (relative to the current working directory) ----
Path(output_dir).mkdir(exist_ok=True)

# ---- TTS model used to synthesise a test utterance ----
tts_model = TTS(language='EN', device=device)

# ---- Synthesise one sentence to a wav file and report where it was written ----
text = "Hello, this is a test of OpenVoice version two to generate audio."
src_path = f"{output_dir}/test_tts.wav"
tts_model.tts_to_file(text, speaker_id=0, output_path=src_path)
print(f"Generated TTS: {src_path}")
