In [4]:
import torch
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
processor = WhisperProcessor.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
model = WhisperForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bia

In [5]:
!nvidia-smi

Sat May 25 06:19:27 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A6000               Off |   00000000:01:00.0 Off |                  Off |
| 30%   45C    P2             70W /  300W |   37397MiB /  49140MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX A6000               Off |   00

In [8]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=False,
    torch_dtype=torch_dtype,
    device=device,
)

In [9]:
pipe("xtts.wav", generate_kwargs={"language": "chinese", "task": "transcribe"})

{'text': '我喜欢古琪'}

In [26]:
import librosa

In [27]:
y, sr = librosa.load("nlp_voice/690640_file.m4a", sr=16000)
pipe(y, generate_kwargs={"language": "chinese", "task": "transcribe"})

{'text': '喜欢午后之勇喜欢艾域喜欢看完喜欢相机包喜欢冰的那黑色喜欢白色喜欢sweety二五'}

In [29]:
from IPython.display import Audio

Audio(y, rate=16_000)

In [2]:
import pandas as pd
df = pd.read_excel("2k_url_id.xlsx")
df

Unnamed: 0,id,caid
0,728505,0050H00000Bift5QAB
1,728532,0056S00000IBu8AQAT
2,728553,0056S00000H9HXEQA3
3,728555,0056S00000H9HXEQA3
4,728556,0056S00000H9HXEQA3
...,...,...
1995,692682,0050H00000DiRV0QAN
1996,692683,0050H00000DiRV0QAN
1997,692684,0056S00000IUYnyQAH
1998,692688,0050H00000DiRV0QAN


In [33]:
!ls -l

total 360024
-rw-------  1 root  root      41740 May 27 05:28 2k_url_id.xlsx
drwxr-xr-x  8 sijie sijie      4096 May 24 11:36 GPT-SoVITS
-rw-r--r--  1 sijie sijie     13921 May 23 08:43 gztan.ipynb
drwxr-xr-x 19 sijie sijie      4096 May 20 09:12 miniconda3
drwxr-xr-x  2 root  root      69632 May 20 02:34 nlp_voice
-rw-------  1 root  root  368109566 May 20 03:37 nlp_voice.zip
-rw-r--r--  1 sijie sijie    407944 May 27 09:40 Untitled.ipynb
