In [None]:
# Install the libraries this notebook relies on.
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel, and a single invocation lets pip resolve all four packages together.
# NOTE(review): versions are not pinned; pin them once a known-good set is chosen.
%pip install transformers diffusers accelerate datasets

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# `notebook_login()` opens an interactive prompt for a Hub access token, so the
# token never has to be written into the notebook itself.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Quick start: let `pipeline` assemble everything needed for speech recognition
# from the checkpoint name alone. `pipe` is rebound further down by the cell
# that builds the pipeline from an explicitly loaded model and processor.
from transformers import pipeline

pipe = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-large-v3",
)

In [None]:
# Load the processor and the speech seq2seq model straight from the checkpoint,
# without going through the high-level `pipeline` helper.
# Both `processor` and `model` are assigned again by the later cell that
# selects a device and dtype before building the pipeline.
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("distil-whisper/distil-large-v3")
model = AutoModelForSpeechSeq2Seq.from_pretrained("distil-whisper/distil-large-v3")

In [None]:
# Upgrade pip first, then the speech-recognition stack.
# `%pip` targets the running kernel's environment (unlike `!pip`, which runs in
# whatever `pip` the subshell finds first).
%pip install --upgrade pip
# The extras spec is quoted so the shell never tries to expand the square
# brackets as a glob pattern.
%pip install --upgrade transformers accelerate "datasets[audio]"

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Pick the execution target once: first CUDA device in half precision when a
# GPU is visible, otherwise the CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "distil-whisper/distil-large-v3"

# Load the checkpoint in the chosen dtype and move it to the chosen device.
# `.to()` returns the module itself, so `model` is the same object either way.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor bundles the tokenizer and the feature extractor that the
# pipeline below is given explicitly.
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)

# Smoke test: transcribe the first clip of a small dummy LibriSpeech split.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
)
sample = dataset[0]["audio"]

result = pipe(sample)
print(result["text"])


In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import gdown

# Google Drive share link of the audio clip (a ".../file/d/<id>/view" URL,
# i.e. the viewer page rather than a direct download link).
url = 'https://drive.google.com/file/d/1Zfm5HYCSA7Sz8jdq0zJQTrQ_khmRvlD0/view?usp=sharing'

# Local path the downloaded file is written to.
output = '/content/Alone-30sec.MP3'

# Download with gdown. `fuzzy=True` makes gdown extract the file id from the
# share link and fetch the file itself; without it, a "/view" link is
# requested as-is and the HTML viewer page is saved instead of the audio.
gdown.download(url, output, quiet=False, fuzzy=True)

In [None]:
import io

from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload

# Authorize this Colab session so the Drive API calls below run as the user.
auth.authenticate_user()

# Build a Drive v3 API client.
drive_service = build('drive', 'v3')

# Request the raw content of the file (same file id as the share link used by
# the gdown cell) and stream it chunk by chunk into an in-memory buffer.
file_id = '1Zfm5HYCSA7Sz8jdq0zJQTrQ_khmRvlD0'
request = drive_service.files().get_media(fileId=file_id)
fh = io.BytesIO()
downloader = MediaIoBaseDownload(fh, request)

done = False
while not done:
    status, done = downloader.next_chunk()
    # Progress message (percentage of the download completed so far).
    print("下載進度：{0}".format(status.progress() * 100))

# Persist the buffered bytes to a local file in the current working directory.
# NOTE(review): the file is saved with a .wav name while the gdown cell saves
# the same file id as .MP3 — confirm the actual audio format.
with open('Alone-30sec.wav', 'wb') as f:
    f.write(fh.getvalue())


In [28]:
# Transcribe the locally saved audio file with the speech-recognition pipeline.
# `result` is reassigned by the next cell, which repeats the call with timestamps.
result = pipe("Alone-30sec.wav")

In [30]:
# Transcribe the same file again, this time requesting timestamps.
# The printed result["chunks"] is a list of dicts, each holding a
# 'timestamp' (start, end) pair and the 'text' recognised for that span
# (see the recorded output below).
result = pipe("Alone-30sec.wav", return_timestamps=True)
print(result["chunks"])


[{'timestamp': (0.0, 6.14), 'text': ' If this night is not forever, at least we are together.'}, {'timestamp': (6.74, 8.66), 'text': " I know I'm not alone."}, {'timestamp': (9.26, 11.24), 'text': " I know I'm not alone."}, {'timestamp': (12.12, 16.04), 'text': " Anywhere forever, apart, we're still together."}, {'timestamp': (16.64, 18.54), 'text': " I know I'm not alone."}, {'timestamp': (19.08, 21.96), 'text': " I know I'm not alone."}]


In [None]:
# Install the LangChain community integrations used by the next cell.
# `%pip` installs into the running kernel's environment (unlike `!pip`).
%pip install langchain_community


In [None]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# Create a LangChain LLM wrapper around a Hugging Face text-generation
# pipeline for the "gpt2" checkpoint.
# `pipeline_kwargs` presumably gets forwarded to the underlying pipeline, so
# max_new_tokens=10 would limit each generation to ten new tokens — TODO confirm.
hf = HuggingFacePipeline.from_model_id(
    model_id="gpt2",
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 10},
)

In [37]:
# `inputs` was never defined anywhere in this notebook, so this cell failed
# with a NameError on a fresh kernel. Build it from the transcription produced
# by the speech-recognition pipeline so the cell survives Restart & Run All.
# NOTE(review): the recorded output suggests the original prompt was the
# transcript text; confirm whether it was result["text"] or the chunk texts
# joined together.
inputs = {"prompts": result["text"]}

# Wrap the single prompt string in a list, which is what is passed as `prompts`.
prompts = [inputs["prompts"]]

# Generate a continuation of the transcript with the GPT-2 LangChain wrapper.
outputs = hf.generate(prompts=prompts)

print(outputs)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


generations=[[Generation(text=" If this night is not forever, at least we are together.  I know I'm not alone.  I know I'm not alone.  Anywhere forever, apart, we're still together.  I know I'm not alone.  I know I'm not alone.   All of that goes on. \n")]] llm_output=None run=[RunInfo(run_id=UUID('f40efea6-f6c6-4391-a3a4-021ab6bf8288'))]
