In [14]:
import os
import pandas as pd
import librosa
import tensorflow as tf
import numpy as np
import string

class TextTransform:
    """Mapira slova alfabeta u brojeve i obrunto"""
    def __init__(self):
        alphabet = string.ascii_lowercase + " ,"
        self.char_map = {char: idx + 1 for idx, char in enumerate(alphabet)}
        self.index_map = {idx: char for char, idx in self.char_map.items()}

    def text_to_int(self, text):
        return [self.char_map[char] for char in text if char in self.char_map]

    def int_to_text(self, labels):
        return ''.join([self.index_map[label] for label in labels if label in self.index_map])


text_transform = TextTransform()



In [5]:
class MelSpectrogramDataset:
    def __init__(self, tsv_files, audio_dir, text_transform):
        self.metadata = self._load_and_filter_metadata(tsv_files, audio_dir)
        self.audio_dir = audio_dir
        self.text_transform = text_transform

    def _load_and_filter_metadata(self, tsv_files, audio_dir):
        """Ucitava i filtrira samo fajlove koji postoje"""
        all_metadata = []
        for tsv_file in tsv_files:
            df = pd.read_csv(tsv_file, delimiter='\t')
            df = df[df['path'].apply(lambda x: os.path.exists(os.path.join(audio_dir, x)))]
            all_metadata.append(df)
        return pd.concat(all_metadata, ignore_index=True)

    def generator(self):
        for _, row in self.metadata.iterrows():
            audio_path = os.path.join(self.audio_dir, row['path'])
            label_text = row['sentence'].lower()

            label = tf.convert_to_tensor(self.text_transform.text_to_int(label_text), dtype=tf.int32)

            y, sr = librosa.load(audio_path, sr=None)
            S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
            S_dB = librosa.power_to_db(S, ref=np.max)
            S_dB = (S_dB - S_dB.min()) / (S_dB.max() - S_dB.min())
            spectrogram = tf.convert_to_tensor(S_dB, dtype=tf.float32)
            spectrogram = tf.expand_dims(spectrogram, axis=-1)

            input_length = tf.shape(spectrogram)[1]
            label_length = tf.size(label)

            yield spectrogram, label, input_length, label_length


In [None]:
tsv_valid = '/content/drive/MyDrive/corpus/validated.tsv'
tsv_other = '/content/drive/MyDrive/corpus/other.tsv'
tsv_invalid = '/content/drive/MyDrive/corpus/invalidated.tsv'
audio_dir = '/content/drive/MyDrive/corpus/clips/'


train_dataset = MelSpectrogramDataset([tsv_valid, tsv_other], audio_dir, text_transform)
test_dataset = MelSpectrogramDataset([tsv_invalid], audio_dir, text_transform)

In [None]:
def serialize_example(spectrogram, label, input_length, label_length):
    spec_serialized = tf.io.serialize_tensor(spectrogram).numpy()
    label_serialized = tf.io.serialize_tensor(label).numpy()
    feature = {
        'spectrogram': tf.train.Feature(bytes_list=tf.train.BytesList(value=[spec_serialized])),
        'label': tf.train.Feature(bytes_list=tf.train.BytesList(value=[label_serialized])),
        'input_length': tf.train.Feature(int64_list=tf.train.Int64List(value=[input_length])),
        'label_length': tf.train.Feature(int64_list=tf.train.Int64List(value=[label_length])),
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()


def write_tfrecord(dataset, tfrecord_path):
    """Upisuje Dataset u TFRecord fajl za brzo ucitavanje"""
    with tf.io.TFRecordWriter(tfrecord_path) as writer:
        for spectrogram, label, input_length, label_length in dataset.generator():
            spec_np = spectrogram.numpy()
            label_np = label.numpy()
            input_length_np = int(input_length.numpy())
            label_length_np = int(label_length.numpy())

            example = serialize_example(spec_np, label_np, input_length_np, label_length_np)
            writer.write(example)
    print(f"TFRecord written to {tfrecord_path}")


write_tfrecord(train_dataset, '/content/drive/MyDrive/corpus/train.tfrecord')

write_tfrecord(test_dataset, '/content/drive/MyDrive/corpus/test.tfrecord')

  S_dB = (S_dB - S_dB.min()) / (S_dB.max() - S_dB.min())


TFRecord written to /content/drive/MyDrive/corpus/train.tfrecord
TFRecord written to /content/drive/MyDrive/corpus/test.tfrecord


In [1]:
def parse_example(serialized_example, max_width=800, max_label_length=200):
    feature_description = {
        'spectrogram': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.string),
        'input_length': tf.io.FixedLenFeature([], tf.int64),
        'label_length': tf.io.FixedLenFeature([], tf.int64)
    }

    example = tf.io.parse_single_example(serialized_example, feature_description)

    spectrogram = tf.io.parse_tensor(example['spectrogram'], out_type=tf.float32)
    label = tf.io.parse_tensor(example['label'], out_type=tf.int32)
    input_length = example['input_length']
    label_length = example['label_length']

    spectrogram = tf.image.resize_with_pad(spectrogram, 128, max_width)
    label = tf.pad(label, [[0, max_label_length - tf.shape(label)[0]]], constant_values=0)

    return spectrogram, label, input_length, label_length

In [2]:
def load_tfrecord_dataset(tfrecord_path, batch_size=32, shuffle=True):
    dataset = tf.data.TFRecordDataset(tfrecord_path)

    dataset = dataset.map(parse_example, num_parallel_calls=tf.data.AUTOTUNE)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=1000)
    dataset = dataset.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)

    return dataset

In [6]:
train_dataset = load_tfrecord_dataset('/content/drive/MyDrive/corpus/train.tfrecord')
test_dataset = load_tfrecord_dataset('/content/drive/MyDrive/corpus/test.tfrecord', shuffle=False)

In [7]:
for spectrogram, label, input_len, label_len in train_dataset.take(1):
    print(f"Spectrogram shape: {spectrogram.shape}")
    print(f"Label shape: {label.shape}")
    print(f"Input lengths: {input_len.numpy()}")
    print(f"Label lengths: {label_len.numpy()}")

Spectrogram shape: (32, 128, 800, 1)
Label shape: (32, 200)
Input lengths: [311 289 403 718 266 210 352 446 284 194 210 298 599 345 394 244 318 412
 453 322 385 338 453 401 462 489 599 451 419 563 417 633]
Label lengths: [20 71 60 88 69 27 85 45 24 25 40 52 81 53 85 42 60 65 77 50 54 44 70 44
 88 91 74 55 51 71 88 73]
