In [2]:
import io
import os
import numpy as np
import glob
import json
import tqdm

import torch
import pandas as pd

import whisper_timestamped as whisper
import torchaudio

from scipy.io import wavfile
import tqdm

pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 1000
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

from whisperX import whisperx


In [3]:
model = whisper.load_model("large-v2", device=DEVICE)
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

Model is multilingual and has 1,541,384,960 parameters.


In [4]:
for num_data in tqdm.tqdm(range(1,6)):
    floder = f"full-data/Collect_{num_data}"
    videos = f"{floder}/script"

    for file in glob.glob(f"{videos}/*"):
        if not os.path.exists(f"{file}/whisper"):
            os.mkdir(f"{file}/whisper")
        for video in glob.glob(f"{file}/video/*"):
            try:
                text = os.path.basename(video[:-4])
                audio = whisperx.load_audio(video)
                transcription = model.transcribe(audio, language="th")
                print(transcription)
                # load alignment model and metadata
                model_a, metadata = whisperx.load_align_model(language_code="th", device=DEVICE)

                # align whisper output
                result_aligned = whisperx.align(transcription["segments"], model_a, metadata, audio, device=DEVICE, return_char_alignments=False)
                with open(f"{file}/whisper/{text}.json", "w", encoding='utf8') as outfile:
                    json.dump(result_aligned["segments"], outfile, ensure_ascii=False)
                break
            except:
                pass
        break
    break
break

  0%|          | 0/5 [00:00<?, ?it/s]

{'text': 'ในเรื่องของการใช้ชีวิตของเรา', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 2.0, 'text': 'ในเรื่องของการใช้ชีวิตของเรา', 'tokens': [50364, 15412, 4131, 24185, 30121, 18970, 20512, 18970, 6223, 37230, 43880, 4750, 17080, 6033, 7643, 9596, 11526, 20512, 18970, 43063, 50464], 'temperature': 0.0, 'avg_logprob': -0.1749568744139238, 'compression_ratio': 1.2537313432835822, 'no_speech_prob': 0.9547194242477417}], 'language': 'th'}
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


Some weights of the model checkpoint at airesearch/wav2vec2-large-xlsr-53-th were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at airesearch/wav2vec2-large-xlsr-53-th and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably

SyntaxError: 'break' outside loop (706004565.py, line 26)

In [7]:
transcription

{'text': 'ในเรื่องของการใช้ชีวิตของเรา',
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 2.0,
   'text': 'ในเรื่องของการใช้ชีวิตของเรา',
   'tokens': [50364,
    15412,
    4131,
    24185,
    30121,
    18970,
    20512,
    18970,
    6223,
    37230,
    43880,
    4750,
    17080,
    6033,
    7643,
    9596,
    11526,
    20512,
    18970,
    43063,
    50464],
   'temperature': 0.0,
   'avg_logprob': -0.1749568744139238,
   'compression_ratio': 1.2537313432835822,
   'no_speech_prob': 0.9547194242477417,
   'clean_char': ['ใ',
    'น',
    'เ',
    'ร',
    'ื',
    '่',
    'อ',
    'ง',
    'ข',
    'อ',
    'ง',
    'ก',
    'า',
    'ร',
    'ใ',
    'ช',
    '้',
    'ช',
    'ี',
    'ว',
    'ิ',
    'ต',
    'ข',
    'อ',
    'ง',
    'เ',
    'ร',
    'า'],
   'clean_cdx': [0,
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16,
    17,
    18,
    19,
    20,
    21,
    22,
    23,
 

In [6]:
transcription["segments"]

[{'id': 0,
  'seek': 0,
  'start': 0.0,
  'end': 2.0,
  'text': 'ในเรื่องของการใช้ชีวิตของเรา',
  'tokens': [50364,
   15412,
   4131,
   24185,
   30121,
   18970,
   20512,
   18970,
   6223,
   37230,
   43880,
   4750,
   17080,
   6033,
   7643,
   9596,
   11526,
   20512,
   18970,
   43063,
   50464],
  'temperature': 0.0,
  'avg_logprob': -0.1749568744139238,
  'compression_ratio': 1.2537313432835822,
  'no_speech_prob': 0.9547194242477417,
  'clean_char': ['ใ',
   'น',
   'เ',
   'ร',
   'ื',
   '่',
   'อ',
   'ง',
   'ข',
   'อ',
   'ง',
   'ก',
   'า',
   'ร',
   'ใ',
   'ช',
   '้',
   'ช',
   'ี',
   'ว',
   'ิ',
   'ต',
   'ข',
   'อ',
   'ง',
   'เ',
   'ร',
   'า'],
  'clean_cdx': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27],
  'clean_wdx': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,