In [33]:
!pip install -q jiwer
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
from jiwer import wer
print(f"TensorFlow Version: {tf.__version__}")

TensorFlow Version: 2.19.0


In [34]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print(f"Використовується GPU: {gpus[0].name}")
    except RuntimeError as e:
        print(e)
else:
    print("GPU не підключено! Рекомендується змінити Runtime на GPU.")

BATCH_SIZE = 32
EPOCHS = 50
EARLY_STOP_PATIENCE = 10
MODEL_SAVE_PATH = "deepspeech_model.keras"

FRAME_STEP = 256
FFT_LENGTH = 256
FRAME_LENGTH = 256

Використовується GPU: /physical_device:GPU:0


In [35]:
# Завантаження датасету LJSpeech
ljspeech_ds = tfds.load("ljspeech", split="train", as_supervised=False)

# Створення словника
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
char_to_num = layers.StringLookup(vocabulary=characters, oov_token="")
num_to_char = layers.StringLookup(vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True)

print(f"Словник створено. Розмір: {char_to_num.vocabulary_size()}")

# Функція для перетворення аудіо в спектрограму
def encode_single_sample(sample):
    audio = sample['speech']
    label = sample['text']

    # Обробка аудіо
    audio = tf.cast(audio, tf.float32)
    audio = tf.reshape(audio, [-1])

    # Створення STFT спектрограми
    spectrogram = tf.signal.stft(audio, frame_length=FRAME_LENGTH, frame_step=FRAME_STEP, fft_length=FFT_LENGTH)
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)

    # Нормалізація
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)

    # Обробка тексту
    label = tf.strings.lower(label)
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    label = char_to_num(label)

    return spectrogram, label

# Допоміжна функція для фільтрації та пакетування
def get_spec_len(spec, label):
    return tf.shape(spec)[0]

# Створення pipeline для навчання
train_dataset = (
    ljspeech_ds
    .map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .shuffle(buffer_size=1000)
    .bucket_by_sequence_length(
        element_length_func=get_spec_len,
        bucket_boundaries=[200, 300, 400, 500, 600, 700, 800],
        bucket_batch_sizes=[BATCH_SIZE] * 8,
        pad_to_bucket_boundary=False
    )
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

# Розділення на train/val
val_dataset = train_dataset.take(10)
train_dataset = train_dataset.skip(10)

print("Дані підготовлено та пайплайн налаштовано.")

Словник створено. Розмір: 31
Дані підготовлено та пайплайн налаштовано.


In [4]:
# Функція втрат CTC (Connectionist Temporal Classification)
def CTCLoss(y_true, y_pred):
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss

def build_model(input_dim, output_dim, rnn_layers=2, rnn_units=128):
    input_spectrogram = layers.Input((None, input_dim), name="input")

    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    x = layers.Conv2D(32, kernel_size=[11, 41], strides=[2, 2], padding="same", activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(32, kernel_size=[11, 21], strides=[1, 2], padding="same", activation="relu")(x)
    x = layers.BatchNormalization()(x)
    new_shape = (-1, x.shape[-2] * x.shape[-1])
    x = layers.Reshape(target_shape=new_shape, name="reshape_rnn")(x)
    x = layers.Dense(rnn_units, activation="relu")(x)
    x = layers.Dropout(0.2)(x)

    for i in range(rnn_layers):
        recurrent = layers.Bidirectional(
            layers.LSTM(rnn_units, return_sequences=True), name=f"bi_lstm_{i+1}"
        )(x)
        x = layers.Dropout(0.2)(recurrent)

    output = layers.Dense(output_dim + 1, activation="softmax", name="output")(x)

    model = keras.Model(inputs=input_spectrogram, outputs=output, name="DeepSpeech_Lite")

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), loss=CTCLoss)

    return model

# Побудова моделі
input_dim = FFT_LENGTH // 2 + 1
model = build_model(input_dim=input_dim, output_dim=char_to_num.vocabulary_size(), rnn_units=256)
model.summary()


3. Архітектура моделі


In [5]:
print("Навчання моделі")

early_stopper = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=EARLY_STOP_PATIENCE,
    restore_best_weights=True,
    verbose=1
)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    callbacks=[early_stopper],
    verbose=1
)

model.save(MODEL_SAVE_PATH)
print(f"Модель збережено у {MODEL_SAVE_PATH}")

4. Навчання моделі
Epoch 1/50
    403/Unknown [1m500s[0m 1s/step - loss: 332.7082



[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m504s[0m 1s/step - loss: 332.6522 - val_loss: 307.8203
Epoch 2/50
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 691ms/step - loss: 296.8080 - val_loss: 348.7922
Epoch 3/50
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 558ms/step - loss: 296.3033 - val_loss: 315.7209
Epoch 4/50
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m226s[0m 561ms/step - loss: 292.5270 - val_loss: 306.3141
Epoch 5/50
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 535ms/step - loss: 275.4167 - val_loss: 279.5395
Epoch 6/50
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 569ms/step - loss: 245.9366 - val_loss: 232.5275
Epoch 7/50
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 509ms/step - loss: 216.7764 - val_loss: 221.7677
Epoch 8/50
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19

In [32]:
print("Тестування моделі")

# Функція декодування
def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]

    output_text = []
    for result in results:
        text = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(text)
    return output_text

TARGET_COUNT = 250
collected_results = []
processed_count = 0

# Проходимо по валідаційному датасету
for batch in val_dataset:
    if processed_count >= TARGET_COUNT:
        break

    spectrograms = batch[0]
    labels = batch[1]

    # Передбачення для пакету
    preds = model.predict(spectrograms, verbose=0)
    pred_texts = decode_batch_predictions(preds)

    # Обробка кожного елемента в пакеті
    for i in range(len(pred_texts)):
        if processed_count >= TARGET_COUNT:
            break

        true_label = tf.strings.reduce_join(num_to_char(labels[i])).numpy().decode("utf-8")
        pred_label = pred_texts[i]

        # Обчислення помилки
        error_rate = wer(true_label, pred_label)

        # Зберігаємо результат: (WER, Справжній текст, Передбачений текст)
        collected_results.append({
            "wer": error_rate,
            "true": true_label,
            "pred": pred_label
        })

        processed_count += 1

collected_results.sort(key=lambda x: x["wer"])

for i in range(10):
    item = collected_results[i]
    print(f"WER: {item['wer']:.4f}")
    print("-" * 40)
    print(f"Справжній:   {item['true']}")
    print(f"Передбачено: {item['pred']}")
    print("\n")

Тестування моделі
WER: 0.1765
----------------------------------------
Справжній:   in concluding that oswald was carrying a rifle in the paper bag on the morning of november  
Передбачено: in conclunding that oswald was carying a rifle in the paper bag on the morning of nevember 


WER: 0.2857
----------------------------------------
Справжній:   and kept altogether separate from the other prisoners until the day of his death
Передбачено: and kept altogether seprtd from the othe prisoners until the da of his d


WER: 0.2857
----------------------------------------
Справжній:   as it was occupied and appropriated in 
Передбачено: as it was ocupied and apropriated in 


WER: 0.2903
----------------------------------------
Справжній:   in his evidence before the inspectors he declared that for years he gave his whole time to his duties from an early hour in the morning till late in the afternoon
Передбачено: in is evidence before the inspectors he declared that fr years he gave his hold 