In [1]:
import tensorflow as tf
import pandas as pd
import math
import numpy as np

2025-01-22 16:20:45.624161: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-22 16:20:45.633715: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737559245.644981  258727 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737559245.648357  258727 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-22 16:20:45.659846: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Read the three ';'-separated CSV splits (train, test, evaluation) in one pass.
df_train, df_test, df_eval = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (
        "dataset/train.csv",
        "dataset/test.csv",
        "dataset/evaluation.csv",
    )
)

In [3]:
# Pool every split into a single list of titles.
# NOTE(review): dropna() runs before the column is selected, so a row with a missing
# value in ANY column is discarded, not only rows whose title is missing.
all_titles = (
    pd.concat([df_train, df_eval, df_test])
    .dropna()
    .loc[:, "title"]
    .tolist()
)

In [4]:
# Characters that survive title cleaning, plus the multi-character "<pad>" token.
# Each string below is unpacked into its individual characters.
filter_vocab = {
    *" !\"#$%&'()*+,-./",
    *"0123456789",
    *":;=?@",
    *"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
    *"[]_",
    *"abcdefghijklmnopqrstuvwxyz",
    *"{}“”",
    "<pad>",
}

In [5]:
# Number of entries in filter_vocab (the "<pad>" token is one of them); used as the
# feature width of the model input and output layers.
VOCAB_SIZE: int = len(filter_vocab)
# Fixed number of characters every title is truncated or padded to before encoding.
SEQ_SIZE: int = 125

In [6]:
def filter_chars(text: str) -> str:
    """Return ``text`` with every character that is not in ``filter_vocab`` removed.

    Membership is tested one character at a time, so multi-character vocabulary
    entries never match anything here.
    """
    kept = (char for char in text if char in filter_vocab)
    return "".join(kept)

In [7]:
# Strip out-of-vocabulary characters from every title.
filtered_titles = list(map(filter_chars, all_titles))

In [8]:
# Load the per-character encoding vectors stored in 'encoder.npz' into a plain dict.
# np.load on an .npz archive returns a lazily-read NpzFile that keeps the archive open,
# so every array is read inside the `with` block and the handle is closed afterwards.
with np.load('encoder.npz') as loaded:
    one_hot_encoder = {key: loaded[key] for key in loaded}

In [9]:
def split_encode(text: str) -> list[list[int]]:
    """Encode ``text`` as exactly ``SEQ_SIZE`` vectors from ``one_hot_encoder``.

    Only the first ``SEQ_SIZE`` characters are used; shorter texts are filled up at
    the end with the ``"<pad>"`` vector.

    Raises:
        KeyError: if a character (or ``"<pad>"``) has no entry in ``one_hot_encoder``.
    """
    rows = [one_hot_encoder[char] for char in text[:SEQ_SIZE]]
    pad_vector = one_hot_encoder["<pad>"]
    missing = SEQ_SIZE - len(rows)  # never negative: the text was truncated above
    rows.extend(pad_vector for _ in range(missing))
    return rows

In [10]:
# Encode every cleaned title; each one contributes SEQ_SIZE vectors, giving a 3-D array
# (titles, SEQ_SIZE, vector width).
data = np.array(list(map(split_encode, filtered_titles)))

In [11]:
# Number of sequences per training batch (passed to Dataset.batch below).
BATCH_SIZE = 8

In [12]:
# Next-character setup: inputs drop the last time step and targets drop the first,
# so y[:, t] is the vector that follows X[:, t] in the same title.
X, y = data[:, :-1, :], data[:, 1:, :]

In [None]:
# Input pipeline: (input, target) pairs -> shuffle with a 512-element buffer ->
# batches of BATCH_SIZE -> prefetch so data preparation overlaps with training.
dataset = (
    tf.data.Dataset.from_tensor_slices((X, y))
    .shuffle(512)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

W0000 00:00:1737559251.028120  258727 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [None]:
# Two stacked LSTMs followed by a per-time-step softmax over the vocabulary.
model = tf.keras.Sequential()
# One input row per time step; X has SEQ_SIZE - 1 steps because the last one is dropped.
model.add(tf.keras.Input(shape=(SEQ_SIZE - 1, VOCAB_SIZE)))
# return_sequences=True keeps an output for every time step, not just the final one.
model.add(tf.keras.layers.LSTM(256, return_sequences=True, dropout=0.3))
model.add(tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.2))
# Probability distribution over the vocabulary at each time step.
model.add(tf.keras.layers.Dense(VOCAB_SIZE, activation='softmax'))

In [None]:
# Targets are full vectors per time step, hence categorical (not sparse) cross-entropy.
# No metrics are requested, so the training history only records the loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
)

In [None]:
# Train for 20 passes over the batched dataset; the returned History object is kept
# for the loss plot further down.
history = model.fit(
    x=dataset,
    epochs=20,
)

In [None]:
def encode(src: str) -> np.ndarray:
    """Stack the ``one_hot_encoder`` vector of each character of ``src`` into one array.

    Unlike ``split_encode`` this neither truncates nor pads the text.

    Raises:
        KeyError: if a character has no entry in ``one_hot_encoder``.
    """
    return np.array([one_hot_encoder[char] for char in src])

In [None]:
def decode(src: np.ndarray) -> str:
    """Translate each row of ``src`` back to the key of the matching encoder vector.

    For every row the first key in ``one_hot_encoder`` whose vector equals the row
    element-wise is appended to the result; rows that match no vector contribute
    nothing.
    """
    pieces = []
    for row in src:
        match = next(
            (key for key, vector in one_hot_encoder.items() if (row == vector).all()),
            None,
        )
        if match is not None:
            pieces.append(match)
    return "".join(pieces)

In [None]:
# Seed text for generation, encoded one vector per character (not padded).
input_seq = encode(src="Breaking news: ")

In [None]:
def _fixed_time_steps(model):
    """Return the fixed number of time steps ``model`` expects, or ``None`` if unknown."""
    try:
        shape = model.input_shape
    except (AttributeError, ValueError):
        return None
    # Only a single (batch, time, features) shape with a concrete time dimension counts.
    if (
        isinstance(shape, (tuple, list))
        and len(shape) == 3
        and isinstance(shape[1], int)
        and shape[1] > 0
    ):
        return shape[1]
    return None


def generate_sequence(model, start_sequence, seq_length, decode):
    """Greedily extend ``start_sequence`` by ``seq_length`` characters.

    At every step the whole text produced so far is used as context, the model is
    asked for per-time-step predictions, and the arg-max of the prediction at the
    last real (non-padded) position becomes the next character.

    If ``model.input_shape`` reports a fixed number of time steps, the context is
    limited to its most recent steps and right-padded with all-zero rows up to that
    length, so the model always receives the shape it was built for.
    NOTE(review): reading the prediction at the last real position assumes the model
    is causal (position t ignores positions > t) — true for stacked unidirectional
    LSTMs; confirm before using another architecture.

    Args:
        model: object with ``predict(batch, verbose=0)`` returning an array indexed
            as ``[batch, time, vocab]``; ``input_shape`` is consulted if present.
        start_sequence: 2-D array with one encoded row per seed character; it is
            copied, never modified.
        seq_length: number of characters to generate after the seed.
        decode: callable turning a 2-D array of encoded rows into a string.

    Returns:
        The decoded seed followed by the decoded generated characters.

    Raises:
        ValueError: if ``start_sequence`` is not a non-empty 2-D array.
    """
    context = np.array(start_sequence)  # np.array copies, so the caller's seed is untouched
    if context.ndim != 2 or context.shape[0] == 0:
        raise ValueError(
            "start_sequence must be a non-empty 2-D array of encoded characters"
        )

    generated_sequence = decode(context)
    time_steps = _fixed_time_steps(model)

    for _ in range(seq_length):
        # Keep only what fits into the model's window (everything if it has no fixed size).
        recent = context if time_steps is None else context[-time_steps:]
        last_real = len(recent) - 1

        if time_steps is not None and len(recent) < time_steps:
            # Right-pad with zero rows up to the fixed input length.
            model_input = np.zeros((time_steps, context.shape[1]), dtype=context.dtype)
            model_input[: len(recent)] = recent
        else:
            model_input = recent

        predictions = model.predict(model_input[np.newaxis, ...], verbose=0)

        # Greedy decoding: most probable character after the last real position.
        next_char_idx = int(np.argmax(predictions[0, last_real, :]))

        next_char_onehot = np.zeros_like(context[0])
        next_char_onehot[next_char_idx] = 1

        # Grow the context instead of rolling it, so earlier characters are not lost.
        context = np.concatenate([context, next_char_onehot[np.newaxis, ...]], axis=0)

        generated_sequence += decode(next_char_onehot[np.newaxis, ...])

    return generated_sequence

In [None]:
# Generate SEQ_SIZE characters after the seed and show the resulting text.
generated_text = generate_sequence(
    model=model,
    start_sequence=input_seq,
    seq_length=SEQ_SIZE,
    decode=decode,
)
print(generated_text)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch values recorded by model.fit; every key except 'loss' is looked up
# with .get because it may be absent.
history_dict = history.history

loss = history_dict['loss']
val_loss = history_dict.get('val_loss')
accuracy = history_dict.get('accuracy')  # read here but not plotted in this cell
val_accuracy = history_dict.get('val_accuracy')  # read here but not plotted in this cell
epochs = range(1, 1 + len(loss))  # 1-based epoch numbers for the x axis

plt.figure(figsize=(12, 5))

plt.plot(epochs, loss, 'bo-', label='Training Loss')
if val_loss:
    # Drawn only when validation losses were recorded.
    plt.plot(epochs, val_loss, 'ro-', label='Validation Loss')

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

In [None]:
# Persist the trained generator in the native Keras format.
model.save(filepath="generator_double_LSTM.keras")