# In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, GRU, Bidirectional, TimeDistributed, Conv1D, MaxPooling1D, BatchNormalization, Attention, LayerNormalization
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import mixed_precision
from sklearn.model_selection import train_test_split
import subprocess
import os

# Enable Keras mixed precision globally (must happen before any layer is built)
mixed_precision.set_global_policy('mixed_float16')

# Load saved features.
# NOTE(review): unpickling executes code stored in the file; only load pickles
# produced by a trusted feature-extraction run.
train_df = pd.read_pickle('d:/data/train_features.pkl')
adapt_df = pd.read_pickle('d:/data/adapt_features.pkl')

# Remove any rows with NaN values in 'transcript'
train_df.dropna(subset=['transcript'], inplace=True)
adapt_df.dropna(subset=['transcript'], inplace=True)

# Fit one character-level tokenizer on both corpora so that training and
# adaptation data share the same character-to-index mapping
all_transcriptions = train_df['transcript'].tolist() + adapt_df['transcript'].tolist()
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(all_transcriptions)

train_df['text_seq'] = tokenizer.texts_to_sequences(train_df['transcript'])
adapt_df['text_seq'] = tokenizer.texts_to_sequences(adapt_df['transcript'])

# Determine the maximum length (number of time steps) of the features
max_feature_length = max(train_df['features'].apply(len).max(), adapt_df['features'].apply(len).max())

# Pad every feature sequence to that common length
X_train = pad_sequences(train_df['features'].tolist(), maxlen=max_feature_length, padding='post', dtype='float32')
X_adapt = pad_sequences(adapt_df['features'].tolist(), maxlen=max_feature_length, padding='post', dtype='float32')

# The label length must equal the model's output sequence length. build_model
# applies MaxPooling1D(pool_size=2) to the time axis and nothing else changes
# it, so the output has max_feature_length // 2 steps. Deriving the value keeps
# the labels aligned if the feature files change (it was hard-coded to 589).
fixed_sequence_length = int(max_feature_length) // 2

# Pad the text sequences to the fixed sequence length.
# pad_sequences already returns NumPy arrays, so no further conversion is needed.
y_train = pad_sequences(train_df['text_seq'].tolist(), maxlen=fixed_sequence_length, padding='post')
y_adapt = pad_sequences(adapt_df['text_seq'].tolist(), maxlen=fixed_sequence_length, padding='post')

# Get the vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 for the padding token (index 0)

# Build the enhanced model
def build_model(input_dim, output_dim, rnn_units=256, use_attention=True):
    """Build the Conv1D + stacked bidirectional-GRU sequence model.

    Args:
        input_dim: Number of features per input time step.
        output_dim: Number of output classes per time step.
        rnn_units: Units in each GRU direction.
        use_attention: When True, apply a self-attention layer (query and
            value are the same tensor) before the output layer.

    Returns:
        An uncompiled ``Model`` that maps ``(batch, time, input_dim)`` inputs
        to per-step softmax distributions over ``output_dim`` classes. The
        pooling layer halves the time axis, so the output has ``time // 2``
        steps.
    """
    input_data = Input(name='input', shape=(None, input_dim))

    # Convolutional front end; pooling halves the temporal resolution
    x = Conv1D(filters=256, kernel_size=13, strides=1, padding='same', activation='relu')(input_data)
    x = MaxPooling1D(pool_size=2)(x)
    x = BatchNormalization()(x)

    # Two bidirectional GRU layers, each followed by layer normalization and dropout
    for _ in range(2):
        x = Bidirectional(GRU(rnn_units, return_sequences=True))(x)
        x = LayerNormalization()(x)
        x = Dropout(0.3)(x)

    # Apply attention if enabled
    if use_attention:
        x = Attention()([x, x])

    # Per-step classifier. The dtype is pinned to float32 so the softmax (and
    # the loss computed from it) stays numerically stable when a mixed
    # precision policy such as 'mixed_float16' is active; under a float32
    # policy this is a no-op.
    y_pred = TimeDistributed(
        Dense(output_dim, activation='softmax', dtype='float32'),
        dtype='float32',
    )(x)

    return Model(inputs=input_data, outputs=y_pred)

# Model dimensions: features per time step in, one class per vocabulary entry out
input_dim = X_train.shape[-1]
output_dim = vocab_size

# Build and compile the model
model = build_model(input_dim, output_dim)
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss=SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
model.summary()

# Hold out 20% of the training data for validation (fixed seed for repeatability)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Wrap the arrays in batched, prefetching tf.data pipelines;
# only the training pipeline is shuffled
batch_size = 16
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train_split, y_train_split))
    .shuffle(buffer_size=1024)
    .batch(batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
val_dataset = (
    tf.data.Dataset.from_tensor_slices((X_val_split, y_val_split))
    .batch(batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Define the WER calculation function
def wer(reference, hypothesis):
    """Return the word-level edit distance between two strings.

    Both strings are split on whitespace and compared word by word. The result
    is the minimum number of word insertions, deletions and substitutions that
    turn ``reference`` into ``hypothesis``. It is a raw count, not normalized
    by the reference length.

    Args:
        reference: Ground-truth text.
        hypothesis: Predicted text.

    Returns:
        The edit distance as a plain Python ``int``.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()

    # Rolling-row dynamic programme over Python ints. A fixed-width uint8
    # table would silently wrap around for distances above 255.
    # previous[j] = distance between the processed reference prefix and the
    # first j hypothesis words.
    previous = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        current = [i]  # deleting all i reference words matches an empty hypothesis
        for j, hyp_word in enumerate(hyp_words, start=1):
            current.append(min(
                previous[j] + 1,                          # deletion
                current[j - 1] + 1,                       # insertion
                previous[j - 1] + (ref_word != hyp_word)  # substitution or match
            ))
        previous = current

    return previous[-1]

# Decode predictions function
def decode_predictions(predictions, tokenizer, max_texts=10):
    """Greedily decode per-step class scores into text.

    Args:
        predictions: Sequence of score arrays, one per sample, whose last axis
            holds the class scores for each step.
        tokenizer: Object exposing ``word_index`` (token -> integer index).
        max_texts: Only the first ``max_texts`` predictions are decoded.

    Returns:
        A list of decoded strings with surrounding whitespace stripped.
        Index 0 (padding) and indices missing from the vocabulary decode to
        the empty string.
    """
    # Invert the vocabulary; index 0 is reserved for padding
    vocab = tokenizer.word_index
    id_to_token = dict(zip(vocab.values(), vocab.keys()))
    id_to_token[0] = ''

    def _to_text(step_scores):
        # Take the highest-scoring class at every step, then concatenate the tokens
        best_ids = np.argmax(step_scores, axis=-1)
        tokens = (id_to_token.get(idx, '') for idx in best_ids)
        return ''.join(tokens).strip()

    return [_to_text(sample) for sample in predictions[:max_texts]]

# Create the custom callback for WER
class WERCallback(Callback):
    """Keras callback that reports the average word-level edit distance on a
    held-out set at the end of every epoch.

    Args:
        val_data: Tuple ``(val_x, val_y)`` of model inputs and padded integer
            label sequences.
        tokenizer: The tokenizer used to encode the labels; its ``word_index``
            is used to turn label sequences back into text.
    """

    def __init__(self, val_data, tokenizer):
        super().__init__()
        self.val_data = val_data
        self.tokenizer = tokenizer

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out set, score it with ``wer`` and print a summary."""
        val_x, val_y = self.val_data
        predictions = self.model.predict(val_x)

        # Decode every prediction: decode_predictions only decodes the first 10
        # by default, which would leave most references unscored.
        decoded_predictions = decode_predictions(
            predictions, self.tokenizer, max_texts=len(predictions)
        )

        # Rebuild the reference text the same way the hypotheses are built
        # (tokens concatenated without a separator). Tokenizer.sequences_to_texts
        # joins tokens with spaces, which for a character-level tokenizer turns
        # every character into a separate "word".
        index_word = {idx: token for token, idx in self.tokenizer.word_index.items()}
        references = [
            ''.join(index_word.get(int(idx), '') for idx in seq).strip()
            for seq in val_y
        ]

        # Average over the pairs that were actually scored
        num_samples = min(len(references), len(decoded_predictions))
        if num_samples == 0:
            print(f"\nEpoch {epoch + 1}: Validation WER: n/a (no validation samples)")
            return

        # int() keeps the running total a Python int whatever scalar type wer returns
        total_wer = sum(int(wer(ref, hyp)) for ref, hyp in zip(references, decoded_predictions))
        avg_wer = total_wer / num_samples
        print(f"\nEpoch {epoch + 1}: Validation WER: {avg_wer:.4f}")

        # Print the first 10 decoded texts
        print("First 10 decoded texts:")
        for text in decoded_predictions[:10]:
            print(text)

# Callbacks for the initial training run
wer_callback = WERCallback((X_val_split, y_val_split), tokenizer)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=0.00001, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
checkpoint = ModelCheckpoint('best_model.weights.h5', monitor='val_loss', save_best_only=True, save_weights_only=True, verbose=1)

# Train the model on the main training set
history = model.fit(
    train_dataset,
    epochs=9,  # With patience=10, early stopping cannot trigger within 9 epochs
    validation_data=val_dataset,
    callbacks=[reduce_lr, early_stopping, checkpoint, wer_callback]
)

# Fine-tune on adaptation data
X_adapt_split, X_adapt_val_split, y_adapt_split, y_adapt_val_split = train_test_split(X_adapt, y_adapt, test_size=0.2, random_state=42)
adapt_dataset = tf.data.Dataset.from_tensor_slices((X_adapt_split, y_adapt_split)).shuffle(buffer_size=1024).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
adapt_val_dataset = tf.data.Dataset.from_tensor_slices((X_adapt_val_split, y_adapt_val_split)).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

# Fresh callbacks for fine-tuning:
# - the WER callback must score the adaptation validation split, not the
#   training validation split used above;
# - a reused ModelCheckpoint would keep comparing against the best val_loss of
#   the first run, which was measured on a different validation set.
# The checkpoint path is unchanged, so the file ends up holding the best
# fine-tuned weights.
adapt_wer_callback = WERCallback((X_adapt_val_split, y_adapt_val_split), tokenizer)
adapt_reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=0.00001, verbose=1)
adapt_early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
adapt_checkpoint = ModelCheckpoint('best_model.weights.h5', monitor='val_loss', save_best_only=True, save_weights_only=True, verbose=1)

# Fine-tune the model on the adaptation data
fine_tune_history = model.fit(
    adapt_dataset,
    epochs=9,  # You can adjust the number of epochs for fine-tuning
    validation_data=adapt_val_dataset,
    callbacks=[adapt_reduce_lr, adapt_early_stopping, adapt_checkpoint, adapt_wer_callback]
)

# Save the final (fine-tuned) model.
# NOTE(review): Keras 3 requires a '.keras' or '.h5' extension for model.save;
# TF 2.x Keras writes a SavedModel directory at this path. Confirm which
# version this runs on.
model.save('d:/data/model_checkpoint')
