In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load your CSV file containing Trini slang and their English meanings
data = pd.read_csv('/Users/aidenramgoolam/utschack/caribe_tec_to_eng_dataset.csv')

# Remove the "tec:" prefix from the 'input' column
data['input'] = data['input'].str.replace('tec:', '', regex=False)

# Add <start> and <end> tokens
data['input'] = data['input'].apply(lambda x: 'sa' + x + 'en')
data['target'] = data['target'].apply(lambda x: 'sa' + x + 'en')

# Combine all input and target texts
all_texts = list(data['input']) + list(data['target'])

# Initialize the tokenizer
tokenizer = Tokenizer()

# Fit the tokenizer on the texts
tokenizer.fit_on_texts(all_texts)

# Print the tokenizer word index to check if <start> and <end> tokens are included
print("Tokenizer word index:", tokenizer.word_index)

# Create vocabulary size variables
input_vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token
target_vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token

# Check if <start> and <end> tokens are in tokenizer.word_index
if 'sa' not in tokenizer.word_index or 'en' not in tokenizer.word_index:
    print("Error: '<start>' or '<end>' token not found in the tokenizer word index.")

# Tokenize the input and target texts
input_sequences = tokenizer.texts_to_sequences(data['input'])
target_sequences = tokenizer.texts_to_sequences(data['target'])

# Pad the sequences to ensure they are of the same length
max_sequence_length = max(max(len(seq) for seq in input_sequences), max(len(seq) for seq in target_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')

# Prepare target data for decoder input and output
target_data_input = np.array(target_sequences)  # Decoder input (target sequences)
target_data_output = np.zeros((len(target_sequences), max_sequence_length, target_vocab_size))

# One-hot encode the target data output
for i, sequence in enumerate(target_sequences):
    for t, word in enumerate(sequence):
        target_data_output[i, t, word] = 1  # One-hot encoding


Tokenizer word index: {'en': 1, 'the': 2, 'd': 3, 'and': 4, 'of': 5, 'a': 6, 'in': 7, 'to': 8, 'tuh': 9, 'are': 10, 'is': 11, 'it': 12, 'will': 13, 'iz': 14, 'people': 15, 'not': 16, 'that': 17, 'be': 18, 'dat': 19, 'do': 20, 'on': 21, 'can': 22, 'ah': 23, 'more': 24, 'we': 25, 'dey': 26, 'they': 27, 'their': 28, 'i': 29, 'or': 30, 'he': 31, 'have': 32, 'you': 33, 'as': 34, 'sai': 35, 'for': 36, 'yuh': 37, 'hv': 38, 'fuh': 39, 'time': 40, 'saah': 41, 'one': 42, 'if': 43, 'like': 44, 'other': 45, 'than': 46, 'many': 47, 'cars': 48, 'dis': 49, 'sain': 50, 'with': 51, 'this': 52, 'all': 53, 'wit': 54, 'my': 55, 'so': 56, 'life': 57, 'example': 58, 'think': 59, 'has': 60, 'by': 61, 'also': 62, 'know': 63, 'car': 64, "'s": 65, 'way': 66, 'things': 67, 'his': 68, "n't": 69, 'sait': 70, 'who': 71, 'some': 72, 'at': 73, 'new': 74, 'which': 75, 'important': 76, 'sathe': 77, 'from': 78, 'use': 79, 'mi': 80, 'only': 81, 'an': 82, 'need': 83, 'sahe': 84, 'rel': 85, 'sad': 86, 'get': 87, 'make': 88

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow debug logs

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

# Model parameters
latent_dim = 256

# Define Encoder
encoder_inputs = Input(shape=(None, input_vocab_size))  # input_vocab_size is the size of your Trini slang vocabulary
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

# Define Decoder
decoder_inputs = Input(shape=(None, target_vocab_size))  # target_vocab_size is the size of your English vocabulary
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(target_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the final model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# Summarize the model
model.summary()



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 4361)  0           []                               
                                ]                                                                 
                                                                                                  
 input_2 (InputLayer)           [(None, None, 4361)  0           []                               
                                ]                                                                 
                                                                                                  
 lstm (LSTM)                    [(None, 256),        4728832     ['input_1[0][0]']                
                                 (None, 256),                                                 

In [3]:
import numpy as np
import tensorflow as tf

# One-hot encode the input and target sequences to match (batch_size, sequence_length, vocab_size)
input_data = tf.keras.utils.to_categorical(input_sequences, num_classes=input_vocab_size)
target_data_input = tf.keras.utils.to_categorical(target_sequences[:, :-1], num_classes=target_vocab_size)  # Decoder input
target_data_output = tf.keras.utils.to_categorical(target_sequences[:, 1:], num_classes=target_vocab_size)  # Decoder output

# Pad target_data_input and target_data_output to ensure they have the same sequence length as input_data
target_data_input = pad_sequences(target_data_input, maxlen=max_sequence_length, padding='post')
target_data_output = pad_sequences(target_data_output, maxlen=max_sequence_length, padding='post')

# Print the new shapes to confirm they are 3D
print(f"Input data shape: {input_data.shape}")  # Should be (batch_size, sequence_length, input_vocab_size)
print(f"Target data input shape: {target_data_input.shape}")  # Should be (batch_size, sequence_length, target_vocab_size)
print(f"Target data output shape: {target_data_output.shape}")  # Should be (batch_size, sequence_length, target_vocab_size)

# # Fit the model
# model.fit([input_data, target_data_input], target_data_output,  # Use input_sequences and target_data_input
#           batch_size=64,
#           epochs=10,
#           validation_split=0.2)


Input data shape: (3890, 77, 4361)
Target data input shape: (3890, 77, 4361)
Target data output shape: (3890, 77, 4361)


In [4]:
# Fit the model
model.fit([input_data, target_data_input], target_data_output,  # Use input_sequences and target_data_input
          batch_size=64,
          epochs=10,
          validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x332129790>

In [5]:
model.save_weights('path_to_save_weights.h5')

In [6]:
model.save('path_to_save_model.h5')


In [7]:
from tensorflow.keras.models import load_model
model = load_model('path_to_save_model.h5')


In [19]:
import numpy as np
import tensorflow as tf
import pickle

# Load the saved model
model = tf.keras.models.load_model('path_to_save_model.h5')

# Example input sentence
input_sentence = "sa Ah doh know wat ah doing en"

# Preprocess the input sentence
input_sequence = tokenizer.texts_to_sequences([input_sentence])
input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length, padding='post')

# One-hot encode the input sequence
input_data = tf.keras.utils.to_categorical(input_sequence, num_classes=input_vocab_size)

# Prepare the initial decoder input (starting with the <start> token)
decoder_input = np.zeros((1, max_sequence_length))
decoder_input[0, 0] = tokenizer.word_index['sa']

# One-hot encode the initial decoder input
decoder_input_data = tf.keras.utils.to_categorical(decoder_input, num_classes=target_vocab_size)

# Function to generate the sequence
def generate_sequence(model, input_data, decoder_input_data, max_sequence_length):
    for i in range(1, max_sequence_length):
        # Predict the next token
        predictions = model.predict([input_data, decoder_input_data])
        
        # Debug: Print the predictions for each step
        print(f"Step {i}: Predictions - {predictions[0, i-1, :]}")
        
        predicted_token = np.argmax(predictions[0, i-1, :])
        
        # Update the decoder input
        decoder_input_data[0, i] = predicted_token
        
        # Debug: Print the predicted token
        print(f"Predicted token at step {i}: {predicted_token}")
        
        # Stop if the <end> token is generated
        if predicted_token == tokenizer.word_index['en']:
            break
    
    return decoder_input_data

# Generate the predicted sequence
predicted_sequence = generate_sequence(model, input_data, decoder_input_data, max_sequence_length)

# Convert predictions to text
predicted_indices = np.argmax(predicted_sequence, axis=-1).flatten()  # Flatten to ensure it's a 1D array
predicted_sentence = ' '.join([tokenizer.index_word.get(idx, '') for idx in predicted_indices if idx != 0])

print(f"Input: {input_sentence}")
print(f"Output: {predicted_sentence}")

# Generate the predicted sequence
predicted_sequence = generate_sequence(model, input_data, decoder_input_data, max_sequence_length)

# Convert predictions to text
predicted_indices = np.argmax(predicted_sequence, axis=-1).flatten()  # Flatten to ensure it's a 1D array
predicted_sentence = ' '.join([tokenizer.index_word[idx] for idx in predicted_indices if idx != 0])

print(f"Input: {input_sentence}")
print(f"Output: {predicted_sentence}")

# Save the tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


Step 1: Predictions - [4.94738435e-03 2.11515278e-03 6.28430098e-02 ... 1.33640498e-07
 1.24370999e-07 1.15353274e-07]
Predicted token at step 1: 2
Step 2: Predictions - [7.9237286e-04 2.3050422e-03 3.7595253e-03 ... 4.3877120e-05 5.3523847e-05
 3.8788890e-05]
Predicted token at step 2: 2
Step 3: Predictions - [8.1019860e-04 2.6377852e-03 3.8026697e-03 ... 4.2542433e-05 5.2725703e-05
 3.7676582e-05]
Predicted token at step 3: 2
Step 4: Predictions - [8.1095699e-04 2.6823899e-03 3.8415415e-03 ... 4.2155258e-05 5.2289794e-05
 3.7244754e-05]
Predicted token at step 4: 2
Step 5: Predictions - [8.0636161e-04 2.7373061e-03 3.8768232e-03 ... 4.1979823e-05 5.1921947e-05
 3.7096717e-05]
Predicted token at step 5: 2
Step 6: Predictions - [7.7971147e-04 3.0338154e-03 4.0461053e-03 ... 4.1296156e-05 5.0207382e-05
 3.6607849e-05]
Predicted token at step 6: 2
Step 7: Predictions - [7.2138302e-04 3.8096576e-03 4.4412962e-03 ... 3.9772734e-05 4.6521949e-05
 3.5516292e-05]
Predicted token at step 7: 2
