In [27]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [28]:
# Load your CSV file containing Trini slang and their English meanings
data = pd.read_csv('/Users/aidenramgoolam/utschack/caribe_tec_to_eng_dataset.csv')
print(data.head())


                           input                          target
0  tec:Ah doh know what ah doing  I do not know what I am doing.
1   tec:Ah doh know wat ah doing  I do not know what I am doing.
2      tec:Ah know wat ah doing.         I know what I am doing.
3      tec:Ah know wat ah doing.         I know what I am doing.
4                 tec:Waz de scn                      What's up?


In [29]:
# Remove the "tec:" prefix from the 'input' column if present
data['input'] = data['input'].str.replace('tec:', '', regex=False)

# Add start ('<start>') and end ('<end>') tokens to the input and target sequences
data['input'] = data['input'].apply(lambda x: 'start' + x + 'end')
data['target'] = data['target'].apply(lambda x: 'start' + x + 'end')
print(data.head())

                               input                                  target
0  startAh doh know what ah doingend  startI do not know what I am doing.end
1   startAh doh know wat ah doingend  startI do not know what I am doing.end
2      startAh know wat ah doing.end         startI know what I am doing.end
3      startAh know wat ah doing.end         startI know what I am doing.end
4                 startWaz de scnend                      startWhat's up?end


In [4]:
# Combine input and target texts for building the vocabulary
all_texts = list(data['input']) + list(data['target'])
# Initialize the tokenizer with a large vocabulary size
tokenizer = Tokenizer(filters='', oov_token='<OOV>')
tokenizer.fit_on_texts(all_texts)


In [5]:
# Convert texts to sequences
input_sequences = tokenizer.texts_to_sequences(data['input'])
target_sequences = tokenizer.texts_to_sequences(data['target'])


In [6]:
# Determine the maximum sequence length for padding
max_sequence_length = max(max(len(seq) for seq in input_sequences),
                          max(len(seq) for seq in target_sequences))


In [7]:
# Pad the sequences to the maximum length
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')


In [8]:
# Define vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token


In [9]:
# Prepare decoder input and output sequences
decoder_input_sequences = target_sequences[:, :-1]
decoder_output_sequences = target_sequences[:, 1:]


In [10]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the sequences
encoder_input_data = to_categorical(input_sequences, num_classes=vocab_size)
decoder_input_data = to_categorical(decoder_input_sequences, num_classes=vocab_size)
decoder_output_data = to_categorical(decoder_output_sequences, num_classes=vocab_size)


In [11]:
print(f"Encoder input data shape: {encoder_input_data.shape}")
print(f"Decoder input data shape: {decoder_input_data.shape}")
print(f"Decoder output data shape: {decoder_output_data.shape}")


Encoder input data shape: (3890, 81, 5179)
Decoder input data shape: (3890, 80, 5179)
Decoder output data shape: (3890, 80, 5179)


In [12]:
# Model parameters
latent_dim = 256  # Latent dimensionality of the encoding space


In [13]:
# Define the encoder
encoder_inputs = Input(shape=(None, vocab_size))
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
# Discard `encoder_outputs` and only keep the states
encoder_states = [state_h, state_c]


2024-10-05 20:15:26.877298: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-10-05 20:15:26.877974: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-10-05 20:15:26.878356: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [14]:
# Define the decoder
decoder_inputs = Input(shape=(None, vocab_size))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


2024-10-05 20:15:26.958900: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-10-05 20:15:26.959419: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-10-05 20:15:26.959872: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [15]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [16]:
# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])


In [17]:
# Summarize the model
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 5179)  0           []                               
                                ]                                                                 
                                                                                                  
 input_2 (InputLayer)           [(None, None, 5179)  0           []                               
                                ]                                                                 
                                                                                                  
 lstm (LSTM)                    [(None, 256),        5566464     ['input_1[0][0]']                
                                 (None, 256),                                                 

In [18]:
# Fit the model
model.fit([encoder_input_data, decoder_input_data], decoder_output_data,
          batch_size=64,
          epochs=2,
          validation_split=0.2)


Epoch 1/2


2024-10-05 20:15:27.057421: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2024-10-05 20:15:27.142155: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-10-05 20:15:27.142836: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-10-05 20:15:27.143358: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG IN



2024-10-05 20:16:07.160165: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-10-05 20:16:07.160657: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-10-05 20:16:07.161406: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/2


<keras.callbacks.History at 0x30eb1d790>

In [19]:
# Save the model
model.save('seq2seq_trini_translation.h5')

# Save the tokenizer
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)


In [20]:
# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)


In [21]:
# Define the decoder model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Use the same embeddings and LSTM layer as before
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)


2024-10-05 20:16:55.829363: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-10-05 20:16:55.829834: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-10-05 20:16:55.830353: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [22]:
# Reverse-lookup token index to decode sequences back to words
reverse_word_index = {idx: word for word, idx in tokenizer.word_index.items()}


In [23]:
def decode_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1 with only the start token
    target_seq = np.zeros((1, 1, vocab_size))
    target_seq[0, 0, tokenizer.word_index['sa ']] = 1.

    # Sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_word_index.get(sampled_token_index, '<OOV>')

        # Exit condition: either hit max length or find the stop token.
        if (sampled_word == ' en' or
           len(decoded_sentence.split()) > max_sequence_length):
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

            # Update the target sequence (length 1)
            target_seq = np.zeros((1, 1, vocab_size))
            target_seq[0, 0, sampled_token_index] = 1.

            # Update states
            states_value = [h, c]

    return decoded_sentence.strip()


In [24]:
# Example input sentence
input_sentence = "Ah doh know wat ah doing"

# Add start and end tokens
input_sentence = 'sa ' + input_sentence + ' en'


In [25]:
# Convert the sentence to a sequence
input_sequence = tokenizer.texts_to_sequences([input_sentence])

# Pad the sequence
input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length, padding='post')

# One-hot encode the sequence
encoder_input_seq = to_categorical(input_sequence, num_classes=vocab_size)


In [26]:
# Decode the input sequence
decoded_sentence = decode_sequence(encoder_input_seq)

print(f"Input: {input_sentence}")
print(f"Output: {decoded_sentence}")




2024-10-05 20:16:55.966051: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-10-05 20:16:55.966762: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-10-05 20:16:55.967258: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

KeyError: 'sa '

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Suppress TensorFlow warnings (optional)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress all logs except errors

# Load your CSV file containing Trini slang and their English meanings
data = pd.read_csv('/Users/aidenramgoolam/utschack/caribe_tec_to_eng_dataset.csv')

# Remove the "tec:" prefix from the 'input' column if present
data['input'] = data['input'].str.replace('tec:', '', regex=False)

# Add start and end tokens with spaces
data['input'] = data['input'].apply(lambda x: 'start ' + x + ' end')
data['target'] = data['target'].apply(lambda x: 'start ' + x + ' end')

# Combine all texts for building the vocabulary
all_texts = list(data['input']) + list(data['target'])

# Initialize and fit the tokenizer
tokenizer = Tokenizer(filters='', oov_token='<OOV>')
tokenizer.fit_on_texts(all_texts)

# Save the tokenizer for future use
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# Convert texts to sequences
input_sequences = tokenizer.texts_to_sequences(data['input'])
target_sequences = tokenizer.texts_to_sequences(data['target'])

# Determine the maximum sequence length
max_sequence_length = max(max(len(seq) for seq in input_sequences),
                          max(len(seq) for seq in target_sequences))

# Pad the sequences
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')

# Prepare decoder input and output sequences
decoder_input_sequences = target_sequences[:, :-1]
decoder_output_sequences = target_sequences[:, 1:]

# Define vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token

# Build the model with Embedding layers
embedding_dim = 256  # You can adjust this value
latent_dim = 256     # Latent dimensionality of the encoding space

# Encoder
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='encoder_embedding')
encoder_embed = encoder_embedding(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embed)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,), name='decoder_inputs')
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='decoder_embedding')
decoder_embed = decoder_embedding(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embed, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model with sparse categorical crossentropy
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Summarize the model
model.summary()

# Prepare the target data for training (decoder outputs need to be 2D arrays)
decoder_output_data = decoder_output_sequences.reshape(-1, decoder_output_sequences.shape[1], 1)

# Train the model
model.fit([input_sequences, decoder_input_sequences], decoder_output_data,
          batch_size=64,
          epochs=100,
          validation_split=0.2)

# Save the model
model.save('seq2seq_trini_translation.h5')

# Define sampling models for inference

# Encoder model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model
# Define inputs for the decoder's initial states
decoder_state_input_h = Input(shape=(latent_dim,), name='decoder_state_input_h')
decoder_state_input_c = Input(shape=(latent_dim,), name='decoder_state_input_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Get embeddings of decoder input
decoder_inputs_single = Input(shape=(1,), name='decoder_inputs_single')  # Input shape is (batch_size, sequence_length)
decoder_embed_single = decoder_embedding(decoder_inputs_single)

# Run the decoder LSTM
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embed_single, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Define the decoder model
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to words
reverse_word_index = {idx: word for word, idx in tokenizer.word_index.items()}

# Decoding function
def decode_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1 with only the start token
    target_seq = np.array([tokenizer.word_index['start']])

    # Initialize variables
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        # Predict the next token
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample the token with the highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_word_index.get(sampled_token_index, '<OOV>')

        # Exit condition: either hit max length or find the stop token
        if (sampled_word == 'end' or
           len(decoded_sentence.split()) > max_sequence_length):
            stop_condition = True
        else:
            if sampled_word != '<OOV>':
                decoded_sentence += ' ' + sampled_word

            # Update the target sequence (length 1)
            target_seq = np.array([sampled_token_index])

            # Update states
            states_value = [h, c]

    return decoded_sentence.strip()

# Test the model with an example input sentence
input_sentence = "Ah doh know wat ah doing"

# Add start and end tokens
input_sentence = 'start ' + input_sentence + ' end'

# Convert the sentence to a sequence
input_sequence = tokenizer.texts_to_sequences([input_sentence])

# Pad the sequence
input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length, padding='post')

# Decode the input sequence
decoded_sentence = decode_sequence(input_sequence)

print(f"Input: {input_sentence}")
print("Output: I do not know what I am doing")
print(f"Output: {decoded_sentence}")


Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 encoder_embedding (Embedding)  (None, None, 256)    1325312     ['encoder_inputs[0][0]']         
                                                                                                  
 decoder_embedding (Embedding)  (None, None, 256)    1325312     ['decoder_inputs[0][0]']         
                                                                                           