In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

#### Load the data from a CSV file

In [None]:
# Read the dataset, then echo the feature values of every row
# (all columns except the last one).
df = pd.read_csv('../nlp_data.csv')
feature_rows = df.values[:, :-1]
for feature_row in feature_rows:
    print(feature_row)

In [None]:
# Length limits, measured as the number of ' '-separated pieces in a single
# string cell; non-string cells are skipped.
# Inputs: every column except the last one. Outputs: the 'report' column.
max_input_len = max(
    len(cell.split(' '))
    for cell in df.iloc[:, :-1].values.ravel()
    if isinstance(cell, str)
)
max_output_len = max(
    len(report.split(' '))
    for report in df['report'].values
    if isinstance(report, str)
)

# Show both limits
print('Max input length:', max_input_len)
print('Max output length:', max_output_len)

#### Preprocess the data

In [None]:
# One input string per row: every column but the last, rendered by
# Series.to_string without the index labels.
input_texts = [
    df.iloc[row_pos][:-1].to_string(index=False)
    for row_pos in range(len(df))
]
# One target per row: the value of the 'report' column.
target_texts = [
    df.iloc[row_pos]['report']
    for row_pos in range(len(df))
]

#### Tokenize the input and output sequences

In [None]:
# --- Encoder side: one token sequence per row of feature text ---------------
input_tokenizer = Tokenizer(filters='', lower=False)
input_tokenizer.fit_on_texts(input_texts)
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
# Pad to the longest tokenised row. The earlier max_input_len is measured on
# single cells, while each input text joins several columns of a row, so using
# it as maxlen silently truncates rows (pad_sequences defaults to
# truncating='pre', i.e. it drops the leading tokens).
input_sequences = pad_sequences(input_sequences, padding='post')
# Re-derive the length from the padded array so the model and inference code
# agree with the data that is actually fed in.
max_input_len = input_sequences.shape[1]
num_input_tokens = len(input_tokenizer.word_index) + 1  # +1: id 0 is the padding id

# --- Decoder side: reports wrapped in explicit start/end markers ------------
# generate_report() looks up '<start>' and '<end>' in output_tokenizer.word_index,
# so both markers must be part of the fitted vocabulary. Reports that already
# begin with '<start>' are left untouched.
decoder_texts = [
    text if text.startswith('<start>') else '<start> ' + text + ' <end>'
    for text in target_texts
]
output_tokenizer = Tokenizer(filters='', lower=False)
output_tokenizer.fit_on_texts(decoder_texts)
target_sequences = output_tokenizer.texts_to_sequences(decoder_texts)
target_sequences = pad_sequences(target_sequences, padding='post')
# The markers lengthen every target, so the padded width replaces the earlier
# word-count estimate.
max_output_len = target_sequences.shape[1]
num_output_tokens = len(output_tokenizer.word_index) + 1  # +1: id 0 is the padding id

In [None]:
# Define the model architecture: an LSTM encoder-decoder.
# latent_dim is used both as the embedding width and as the LSTM unit count.
latent_dim = 256
# Encoder: padded input token ids -> embeddings -> LSTM.
encoder_inputs = Input(shape=(max_input_len,))
encoder_embedding = Embedding(num_input_tokens, latent_dim)(encoder_inputs)
# return_state=True makes the layer also return its final hidden and cell state.
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
# Only the two states are passed on to the decoder; encoder_outputs is not used below.
encoder_states = [state_h, state_c]

# Decoder: one step shorter than max_output_len because training feeds
# target_sequences[:, :-1] and predicts target_sequences[:, 1:].
decoder_inputs = Input(shape=(max_output_len-1,))
decoder_embedding = Embedding(num_output_tokens, latent_dim)(decoder_inputs)
# return_sequences=True yields one output vector per decoder time step.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final states; its own final states are discarded.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Per-step softmax over the output vocabulary (num_output_tokens classes).
decoder_dense = Dense(num_output_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Display the target length limit; the decoder Input is declared with max_output_len - 1 steps.
max_output_len

In [None]:
# Assemble the trainable graph: two inputs (encoder ids, decoder ids) and the
# per-step token distribution as output.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')
model.summary()

# Training hyper-parameters
batch_size = 64
epochs = 100

# Teacher forcing: the decoder is fed the target without its last position and
# is trained to predict the target shifted left by one position.
decoder_feed = target_sequences[:, :-1]
decoder_labels = target_sequences[:, 1:]
history = model.fit(
    x=[input_sequences, decoder_feed],
    y=decoder_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

In [None]:
# Generate reports
def generate_report(input_sequence):
    """Greedily decode a report for one input text.

    The decoder buffer starts with the '<start>' id followed by padding zeros.
    At every step the whole model is re-run and the most probable token at the
    current position is appended to the buffer, until '<end>' is predicted or
    the decoder length (max_output_len - 1, the width of the model's decoder
    input) is exhausted.

    Args:
        input_sequence: Raw input text, tokenised with ``input_tokenizer``.

    Returns:
        The generated words joined by single spaces. The '<start>' / '<end>'
        markers and padding ids (0) are not included.

    Raises:
        KeyError: If '<start>' or '<end>' is missing from
            ``output_tokenizer.word_index``.
    """
    start_id = output_tokenizer.word_index['<start>']
    end_id = output_tokenizer.word_index['<end>']

    encoder_input = input_tokenizer.texts_to_sequences([input_sequence])
    encoder_input = pad_sequences(encoder_input, maxlen=max_input_len, padding='post')

    # The model's decoder Input is declared with max_output_len - 1 time steps,
    # so the buffer must have exactly that width.
    decoder_len = max_output_len - 1
    decoder_input = np.zeros(shape=(1, decoder_len))
    decoder_input[0, 0] = start_id

    generated_ids = []
    for step in range(decoder_len):
        # verbose=0: avoid one progress bar per decoding step.
        predictions = model.predict([encoder_input, decoder_input], verbose=0)
        # Position `step` of the output is the prediction for the token that
        # follows decoder_input[0, step].
        next_id = int(np.argmax(predictions[0, step]))
        if next_id == end_id:
            break
        generated_ids.append(next_id)
        if step + 1 < decoder_len:
            decoder_input[0, step + 1] = next_id

    # Id 0 is padding and has no entry in index_word.
    return ' '.join(output_tokenizer.index_word[idx] for idx in generated_ids if idx > 0)
