In [36]:
import datasets
import os
import glob
import tgt
import shutil
import parselmouth
from copy import deepcopy

In [64]:
def split_audio(sound, grid):
    """Cut a recording into speech stretches based on the "acoustic" tier.

    Every interval of the "acoustic" tier whose label is not one of the
    non-speech markers is relabelled as speech on a copy of the tier; the
    copy is then passed through ``get_copy_with_same_intervals_merged`` and
    each resulting speech interval with positive duration yields one segment.

    Args:
        sound: Object exposing ``extract_part(start_time, end_time)``
            (the caller in this notebook passes a ``parselmouth.Sound``).
        grid: TextGrid object exposing ``get_tier_by_name`` and
            ``get_tier_names`` (the caller passes the result of ``to_tgt()``).
            It must contain an "acoustic" tier; an "ow" tier is optional.

    Returns:
        A list of ``(sound_part, text, ortho_text)`` tuples where ``text`` is
        the space-joined labels of the original "acoustic" annotations inside
        the stretch and ``ortho_text`` is the space-joined "ow" labels for the
        same time span (``""`` when the grid has no "ow" tier).
    """
    non_speech = {" ", "", "pause", "Pause", "nouse", "glot"}
    acoustic = grid.get_tier_by_name("acoustic")

    # Relabel on a copy so the original labels stay available for the
    # transcription text collected below.
    marked = deepcopy(acoustic)
    for interval in marked:
        if interval.text not in non_speech:
            interval.text = "text"
    marked = marked.get_copy_with_same_intervals_merged()

    ortho_tier = None
    if "ow" in grid.get_tier_names():
        ortho_tier = grid.get_tier_by_name("ow")

    res = []
    for stretch in marked:
        # Skip non-speech stretches and degenerate (zero/negative length) ones.
        if stretch.text != "text" or stretch.end_time <= stretch.start_time:
            continue
        start, end = stretch.start_time, stretch.end_time
        labels = acoustic.get_annotations_between_timepoints(start, end)
        text = " ".join(label.text for label in labels)
        sound_part = sound.extract_part(start, end)
        ortho_text = ""
        if ortho_tier is not None:
            ortho_labels = ortho_tier.get_annotations_between_timepoints(start, end)
            ortho_text = " ".join(label.text for label in ortho_labels)
            try:
                # Repair cp1251 text that was read as ISO-8859-1.
                ortho_text = ortho_text.encode("iso-8859-1").decode("cp1251")
            except UnicodeError:
                # The text is not Latin-1 mojibake (for instance it is already
                # proper Unicode); keep it as read instead of aborting the run.
                pass
        res.append((sound_part, text, ortho_text))
    return res

In [65]:
# Walk the INTAS recordings, cut each annotated file into speech segments with
# split_audio, save every segment as its own WAV file and collect one metadata
# record per segment in `data` (and its end time in `times`).
times = []
intas_files = glob.glob("m:/INTAS/complete/**/*.wav", recursive=True)
data = []
# Segments are saved into this folder; create it up front so saving does not
# fail when the folder is missing.
os.makedirs("m:/intas_dataset2/data", exist_ok=True)
for file in intas_files:
    # Skip any path containing "words" (presumably word-level material — TODO confirm).
    if "words" in file:
        continue
    print(file)
    # Swap only the real extension; str.replace would also touch ".wav"
    # elsewhere in the path and would miss a differently-cased extension.
    tg_file = os.path.splitext(file)[0] + ".TextGrid"
    if not os.path.exists(tg_file):
        print("No matching TextGrid")
        continue
    grid = parselmouth.read(tg_file)
    parselmouth.praat.call(grid, "Convert to Unicode")
    grid = grid.to_tgt()
    if "acoustic" not in grid.get_tier_names():
        print("No acoustic level in TextGrid")
        continue

    sound = parselmouth.Sound(file)
    parts = split_audio(sound, grid)
    stem = os.path.splitext(os.path.split(file)[-1])[0]
    for i, (part, text, ortho) in enumerate(parts):
        print(i)
        # Replace Greek epsilon/gamma with the corresponding IPA letters.
        text = text.replace("ε", "ɛ")
        text = text.replace("γ", "ɣ")
        file_name = stem + f"_{i}.wav"
        data.append({"file_name": f"data/{file_name}", "transcription": text.strip(), "text": ortho.strip()})
        part.save(os.path.join("m:/intas_dataset2/data", file_name), "WAV")
        times.append(part.end_time)

m:/INTAS/complete\andre\fpt1\av10fpt1.wav
0
1
m:/INTAS/complete\andre\fpt1\av11fpt1.wav
0
1
2
3
m:/INTAS/complete\andre\fpt1\av12fpt1.wav
0
1
m:/INTAS/complete\andre\fpt1\av13fpt1.wav
0
1
m:/INTAS/complete\andre\fpt1\av14fpt1.wav
0
1
m:/INTAS/complete\andre\fpt1\av15fpt1.wav
0
1
m:/INTAS/complete\andre\fpt1\av16fpt1.wav
0
1
2
3
m:/INTAS/complete\andre\fpt1\av17fpt1.wav
0
1
2
m:/INTAS/complete\andre\fpt1\av18fpt1.wav
0
1
m:/INTAS/complete\andre\fpt1\av19fpt1.wav
0
m:/INTAS/complete\andre\fpt1\av1fpt1.wav
0
m:/INTAS/complete\andre\fpt1\av20fpt1.wav
0
1
m:/INTAS/complete\andre\fpt1\av21fpt1.wav
0
1
2
3
m:/INTAS/complete\andre\fpt1\av22fpt1.wav
0
m:/INTAS/complete\andre\fpt1\av23fpt1.wav
0
m:/INTAS/complete\andre\fpt1\av24fpt1.wav
0
1
2
3
m:/INTAS/complete\andre\fpt1\av25fpt1.wav
0
1
2
m:/INTAS/complete\andre\fpt1\av26fpt1.wav
0
1
2
m:/INTAS/complete\andre\fpt1\av27fpt1.wav
0
1
m:/INTAS/complete\andre\fpt1\av28fpt1.wav
0
1
m:/INTAS/complete\andre\fpt1\av29fpt1.wav
0
m:/INTAS/complete\andre

In [32]:
import numpy as np

In [57]:
# Sanity check of the text repair applied in split_audio: re-encode the string
# as ISO-8859-1 bytes and decode those bytes as cp1251.
"êîíå÷íî".encode("iso-8859-1").decode("cp1251")

'конечно'

In [46]:
# Mean of the per-segment `end_time` values collected in `times`
# (presumably seconds — TODO confirm).
np.mean(times)

1.76254084001713

In [49]:
# Largest per-segment `end_time` value collected in `times`
# (presumably seconds — TODO confirm).
np.max(times)

9.998720674818838

In [48]:
# Number of segment metadata records accumulated in `data`.
len(data)

3524

In [17]:
import csv

In [66]:
# Write one CSV row per segment record in `data` (columns: file_name,
# transcription, text). The encoding is pinned to UTF-8 because the rows hold
# IPA and Cyrillic text that a platform-default code page may not represent;
# newline="" lets the csv writer control line endings itself.
with open("m:/intas_dataset2/metadata.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, ["file_name", "transcription", "text"], lineterminator="\n")
    writer.writeheader()
    writer.writerows(data)

In [29]:
# Load the dataset folder; without a `split` argument the call yields a
# DatasetDict, as shown in the recorded output of the following cell.
# NOTE(review): this reads "m:/intas_dataset" while the cells above write to
# "m:/intas_dataset2" — confirm which folder is intended.
ds = datasets.load_dataset("m:/intas_dataset")

Resolving data files:   0%|          | 0/1524 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/1524 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [30]:
# Display the loaded dataset object (splits, features, row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 1523
    })
})