In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Dropout, Attention
from tensorflow.keras.models import Model
import re
import numpy as np
# from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

# import nltk
# import networkx as nx
import pandas as pd
import glob

In [2]:
# UPDATED BY UKASZ

import pandas as pd
import re
import glob

file_pattern = f"data/conve*.txt"
matching_files = glob.glob(file_pattern)

def format_mermaid_code(input_text):
    formatted_code = re.sub(r']\s([A-Z])', r']\n\t\1', input_text)
    formatted_code = re.sub(r'([A-Z])\s([A-Z])', r'\1\n\t\2', formatted_code)        
    formatted_code = re.sub(r'}\s([A-Z])', r'}\n\t\1', formatted_code)
    return formatted_code
    
input_texts = []
output_texts = []
df = pd.DataFrame()
for file_path in matching_files:
    with open(file_path, "r") as file:
        content = file.read()

    examples = content.split("assistant: ")

    inputs = []
    outputs = []

    i = 1
    for example in examples[1:]:
        for _ in range(100):
            input_match = re.search(f"Input{i}: (.+?)\n", example)
            output_match = re.search(f"Output{i}: (.+?)\n", example)

            if input_match and output_match:
                outputs_value = "flowchart TD \n\t" + output_match.group(1)
                outputs_value = format_mermaid_code(outputs_value)
                inputs_value = re.sub(r'Output.*', r'', input_match.group(1))
                inputs.append(inputs_value)
                outputs.append(outputs_value)
                output_texts.append(outputs_value)
                input_texts.append(inputs_value)
            i += 1  
    newDf = pd.DataFrame({"input": inputs, "output": outputs})
    df = pd.concat([df, newDf], ignore_index=True)
# print(df)

                                                  input  \
0     Start your day by exercising. Go for a run, do...   
1     Plan your meals in advance. Create a weekly me...   
2     Set aside dedicated time for reading. Pick up ...   
3     Start a gratitude journal. Reflect on positive...   
4     Stay organized with a to-do list. Prioritize t...   
...                                                 ...   
2288  Given a list of books' publication years, keep...   
2289  Suppose there is a list of students' heights. ...   
2290  Given a list of cities' populations, keep sear...   
2291  Consider a list of sales figures. Keep searchi...   
2292  Suppose there is a list of company stock price...   

                                                 output  
0     flowchart TD \n\tB --> C[Improved physical fit...  
1     flowchart TD \n\tB --> C[Healthier eating habi...  
2     flowchart TD \n\tB --> C[Expanded knowledge th...  
3     flowchart TD \n\tB --> C[Developed a positive ...  
4

In [3]:
# Define the maximum sequence length for the input and output sequences
max_input_seq_len = max(len(seq) for seq in input_texts)
max_output_seq_len = max(len(seq) for seq in output_texts)

# Define the input and output tokenizers
input_tokenizer = Tokenizer(filters='', char_level=False)
output_tokenizer = Tokenizer(filters='', char_level=False)

# Fit the input tokenizer on the preprocessed input sequences
input_tokenizer.fit_on_texts(input_texts)

# Fit the output tokenizer on the preprocessed output sequences
output_tokenizer.fit_on_texts(output_texts)

SOS_token = '<sos>'
EOS_token = '<eos>'

# Add the special tokens to the output tokenizer
output_tokenizer.word_index[SOS_token] = len(output_tokenizer.word_index) + 1
output_tokenizer.word_index[EOS_token] = len(output_tokenizer.word_index) + 1

output_tokenizer.index_word[len(output_tokenizer.index_word) + 1] = SOS_token
output_tokenizer.index_word[len(output_tokenizer.index_word) + 1] = EOS_token

In [5]:
# Define the number of encoder and decoder tokens
num_encoder_tokens = len(input_tokenizer.word_index) + 1
num_decoder_tokens = len(output_tokenizer.word_index) + 2

# Define the model architecture
latent_dim = 256

# Define encoder input layer
encoder_inputs = Input(shape=(max_input_seq_len,))
encoder_embedding = Embedding(num_encoder_tokens, latent_dim, mask_zero=True)(encoder_inputs)
encoder_dropout = Dropout(0.1)(encoder_embedding)
encoder_lstm1 = LSTM(latent_dim, return_sequences=True)(encoder_dropout)
encoder_lstm2 = LSTM(latent_dim, return_sequences=True)(encoder_lstm1)
encoder_lstm3 = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm3(encoder_lstm2)
encoder_states = [state_h, state_c]

# Define decoder input layers
decoder_inputs = Input(shape=(max_output_seq_len,))
decoder_embedding = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)(decoder_inputs)
decoder_dropout = Dropout(0.1)(decoder_embedding)
decoder_lstm1 = LSTM(latent_dim, return_sequences=True)(decoder_dropout, initial_state=encoder_states)
decoder_lstm2 = LSTM(latent_dim, return_sequences=True)(decoder_lstm1, initial_state=encoder_states)
decoder_lstm3 = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm3(decoder_lstm2, initial_state=encoder_states)

# Define attention layer
attention = Attention()([decoder_outputs, encoder_outputs])

# Define decoder output layer
decoder_dense = Dense(num_decoder_tokens - 1, activation='softmax')
decoder_outputs = decoder_dense(attention)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [8]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [6]:
# Convert the input and output sequences to numerical format
encoder_input_seqs = input_tokenizer.texts_to_sequences(input_texts)
decoder_input_seqs = [[output_tokenizer.word_index['<sos>']] + output_tokenizer.texts_to_sequences([seq])[0] for seq in output_texts]
decoder_target_seqs = [seq[1:] + [output_tokenizer.word_index['<eos>']] for seq in decoder_input_seqs]

# Pad the input and output sequences to fixed length
encoder_input_seqs = pad_sequences(encoder_input_seqs, maxlen=max_input_seq_len, padding='post')
decoder_input_seqs = pad_sequences(decoder_input_seqs, maxlen=max_output_seq_len, padding='post')
decoder_target_seqs = pad_sequences(decoder_target_seqs, maxlen=max_output_seq_len, padding='post')

In [9]:
batch_size = 32
num_batches = int(np.ceil(len(decoder_target_seqs) / batch_size))

for batch_num in range(num_batches):
    start_idx = batch_num * batch_size
    end_idx = min((batch_num + 1) * batch_size, len(decoder_target_seqs))

    batch_encoder_input_seqs = encoder_input_seqs[start_idx:end_idx]
    batch_decoder_input_seqs = decoder_input_seqs[start_idx:end_idx]
    batch_decoder_target_seqs = decoder_target_seqs[start_idx:end_idx]
    
    # Tworzenie macierzy one-hot dla aktualnej porcji danych
    current_batch_size = len(batch_decoder_target_seqs)
    decoder_target_seqs_onehot = np.zeros((current_batch_size, max_output_seq_len, len(output_tokenizer.word_index) + 1))

    for i, seq in enumerate(batch_decoder_target_seqs):
        for j, token in enumerate(seq):
            decoder_target_seqs_onehot[i, j, token] = 1.0

    # Trenowanie modelu na podstawie aktualnej porcji danych
    model.train_on_batch([batch_encoder_input_seqs, batch_decoder_input_seqs], decoder_target_seqs_onehot)
    print(f'Batch {batch_num+1}/{num_batches} trained')


In [12]:
model.save('my_model.h5')
# model.save('my_model')
# model.save('my_model.keras')

In [4]:
### TEGO NIE ODPALAC, TO JEST TO CO POTRZEBUJE 120 GB RAMU

# Convert the input and output sequences to numerical format
encoder_input_seqs = input_tokenizer.texts_to_sequences(input_texts)
decoder_input_seqs = [[output_tokenizer.word_index['<sos>']] + output_tokenizer.texts_to_sequences([seq])[0] for seq in output_texts]
decoder_target_seqs = [seq[1:] + [output_tokenizer.word_index['<eos>']] for seq in decoder_input_seqs]

# Pad the input and output sequences to fixed length
encoder_input_seqs = pad_sequences(encoder_input_seqs, maxlen=max_input_seq_len, padding='post')
decoder_input_seqs = pad_sequences(decoder_input_seqs, maxlen=max_output_seq_len, padding='post')
decoder_target_seqs = pad_sequences(decoder_target_seqs, maxlen=max_output_seq_len, padding='post')

# Convert the output sequences to one-hot format
decoder_target_seqs_onehot = np.zeros((len(decoder_target_seqs), max_output_seq_len, len(output_tokenizer.word_index) + 1))
for i, seq in enumerate(decoder_target_seqs):
    for j, token in enumerate(seq):
        decoder_target_seqs_onehot[i, j, token] = 1.0

# Print the shapes of the input and output sequences
print("Encoder input shape:", encoder_input_seqs.shape)
print("Decoder input shape:", decoder_input_seqs.shape)
print("Decoder target shape:", decoder_target_seqs_onehot.shape)

MemoryError: Unable to allocate 118. GiB for an array with shape (2293, 681, 10139) and data type float64

In [None]:
# to jest niepotrzebne

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the model
model.fit([encoder_input_seqs, decoder_input_seqs], decoder_target_seqs_onehot,
          batch_size=64,
          epochs=800,
          validation_split=0.1)

In [None]:
# to jest niepotrzebne

model.fit([encoder_input_seqs, decoder_input_seqs], decoder_target_seqs_onehot,
          batch_size=64,
          epochs=400,
          validation_split=0.1)

In [None]:
model = load_model('my_model.h5')

In [13]:
input_text = "To make a cup of tea, start by grinding the beans. Afterwards, add the grounds to the tea maker and add water. Finally, press the brew button and enjoy your tea"

# Tokenize the input sentence
input_seq = input_tokenizer.texts_to_sequences([input_text])[0]

# Pad the input sequence
input_seq = pad_sequences([input_seq], maxlen=max_input_seq_len, padding='post')

# Generate the output sequence using the trained model
decoder_input = np.zeros(shape=(len(input_seq), max_output_seq_len))
decoder_input[:, 0] = output_tokenizer.word_index['<sos>']
for i in range(1, max_output_seq_len):
    predictions = model.predict([input_seq, decoder_input]).argmax(axis=2)
    decoder_input[:, i] = predictions[:, i-1]

# print(decoder_input)

# Convert the output sequence to text
output_text = ''
for i in range(max_output_seq_len):
    # print(int(decoder_input[0,i]))
    # print(output_tokenizer.index_word)
    if output_tokenizer.index_word[int(decoder_input[0,i])] == '<sos>':
        continue
    if output_tokenizer.index_word[int(decoder_input[0,i])] == '<eos>':
        break
    else:
        output_text += output_tokenizer.index_word[int(decoder_input[0,i])] + ' '



In [14]:
# Print the input and output sentences
print('Input sentence:', input_text)
print('Output sentence:', output_text)

Input sentence: To make a cup of tea, start by grinding the beans. Afterwards, add the grounds to the tea maker and add water. Finally, press the brew button and enjoy your tea
Output sentence: --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> --

In [None]:
# Print the input and output sentences
print('Input sentence:', input_text)
print('Output sentence:', output_text)

In [None]:
# Print the input and output sentences
print('Input sentence:', input_text)
print('Output sentence:', output_text)

#### irrelevant

In [15]:
# Preprocess the test examples
test_encoder_input_seqs = input_tokenizer.texts_to_sequences(input_texts[4:6])
test_encoder_input_seqs = pad_sequences(test_encoder_input_seqs, maxlen=max_input_seq_len, padding='post')

# Initialize the decoder input sequences with the SOS token
test_decoder_input_seqs = np.zeros((len(test_encoder_input_seqs), max_output_seq_len))
test_decoder_input_seqs[:, 0] = output_tokenizer.word_index[SOS_token]
print(test_decoder_input_seqs)

# Generate predictions on the test examples
predictions = model.predict([test_encoder_input_seqs, test_decoder_input_seqs])

# Convert the predictions to text format
predicted_texts = []
for prediction in predictions:
    predicted_seq = []
    for token_vec in prediction:
        token_index = np.argmax(token_vec)
        token = output_tokenizer.index_word[token_index]
        if token == EOS_token:
            break
        predicted_seq.append(token)
    predicted_texts.append(' '.join(predicted_seq))

# Print the predicted texts
for i, predicted_text in enumerate(predicted_texts):
    print("Test example", i+1, "predicted text:", predicted_text)

[[10137.     0.     0. ...     0.     0.     0.]
 [10137.     0.     0. ...     0.     0.     0.]]
Test example 1 predicted text: --> e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[take e[tak