# Functools

In [3]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import pandas as pd
import math
import numpy as np
import time
import random

2025-01-23 15:49:07.041108: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-23 15:49:07.226262: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737643747.290717  232051 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737643747.310982  232051 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-23 15:49:07.481324: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [7]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [8]:
loaded_encoder = np.load('encoder.npz')
one_hot_encoder = {key: loaded_encoder[key] for key in loaded_encoder}

In [9]:
def encode(src: str) -> np.ndarray:
    dst = []
    for letter in [*src]:
        dst.append(one_hot_encoder[letter])
    return np.array(dst)

In [10]:
def decode(src: np.ndarray) -> str:
    dst = ""
    for coded in src:
        for key, val in one_hot_encoder.items():
            if (coded == val).all():
                dst += key
                break
    return dst

In [11]:
def encode_generate_sequence(model, start_sequence):
    s = time.time()
    encoded = encode(start_sequence)
    current_input = np.copy(encoded)
    generated_sequence = start_sequence
    for _ in range(SEQ_SIZE):
        predictions = model.predict(current_input[np.newaxis, ...], verbose=0) 
        
        next_char_idx = np.argmax(predictions[0, -1, :])
        
        next_char_onehot = np.zeros_like(current_input[0])
        next_char_onehot[next_char_idx] = 1

        current_input = np.roll(current_input, shift=-1, axis=0)
        current_input[-1] = next_char_onehot  
        
        generated_sequence += decode(next_char_onehot[np.newaxis, ...])
    e = time.time()
    print(e-s)
    return generated_sequence

# Prepare data

In [12]:
df_train = pd.read_csv("dataset/train.csv", sep = ";")
df_test = pd.read_csv("dataset/test.csv", sep = ";")
df_eval = pd.read_csv("dataset/evaluation.csv", sep = ";")
all_titles = pd.concat([df_train, df_eval, df_test]).dropna()["title"].tolist()

In [13]:
filter_vocab = set(sorted([
    ' ', '!', '"', '#', '$',
    '%','&', "'", '(', ')',
    '*', '+', ',', '-','.',
    '/', '0', '1', '2', '3',
    '4', '5', '6', '7', '8',
    '9', ':', ';', '=', '?',
    '@', 'A', 'B', 'C', 'D',
    'E', 'F', 'G', 'H', 'I',
    'J', 'K', 'L', 'M', 'N',
    'O', 'P', 'Q', 'R', 'S',
    'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', '[', ']', '_',
    'a', 'b', 'c', 'd', 'e',
    'f', 'g', 'h', 'i', 'j',
    'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't',
    'u', 'v', 'w', 'x', 'y',
    'z', '{', '}', '“', '”',
    "<pad>"
]))

In [14]:
VOCAB_SIZE: int = len(filter_vocab)
SEQ_SIZE: int = 100
NUM_PER_CLASS: int = 5000

In [15]:
def filter_chars(text: str) -> str:
    global filter_vocab
    return "".join([c if c in filter_vocab else "" for c in text])

In [16]:
filtered_titles = random.sample([filter_chars(title) for title in all_titles], NUM_PER_CLASS)
np.array(filtered_titles).shape

(5000,)

In [17]:
def get_first_word(text: str) -> str:
    return text.split()[0] + " "

In [18]:
first_words = np.vectorize(get_first_word)(filtered_titles)

# Generators

## Basic generator

In [14]:
model_basic = load_model("generator.keras")
model_basic.summary()

W0000 00:00:1737637954.892922   31306 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
  saveable.load_own_variables(weights_store.get(inner_path))


In [15]:
np.vectorize(lambda x: encode_generate_sequence(model_basic, x))(first_words[:10])

3.8542144298553467
3.684568405151367
3.740508556365967
3.6862576007843018
3.712608814239502
3.7235515117645264
3.7014143466949463
3.71304988861084
3.8375775814056396
3.878814220428467
3.7282466888427734


array(['Munich threatens on the president threatens on the president threatens on the president threatens on the pr',
       'Factbox: Trump says the media and the Syria to resign as any to start of the presidenting the meeting with th',
       "Catalan line of the U.S. conservative in the Syrian PM says 'resigning a state status start of the U.S. cons",
       'newsbud brexit breaks that would discusses about the man and the military consideration can be deplaying the',
       "BUSTED: Trump says what the presidents and a strong U.S. conservative in the Syrian PM says 'resigning a sta",
       'JAMES CONSERVI WERK CAUGHT READS THE RAPE THE GOOD BANKADE SHAMP REPORTELE HATE OF THE New York Time Trump',
       'Japan to accuser tractons and any military to start to meeting is a Trump says Trump says Trump says Trump',
       'What where asks to defenserselectiving that in the Middle state de in the Middle state de in the Middle s',
       'War art to Trump says on Trump says on Trump sa

In [16]:
np.mean([
3.8758692741394043,
3.79659104347229,
3.84424090385437,
3.8041341304779053,
4.0363450050354,
3.944906711578369,
3.87833309173584,
3.9055445194244385,
3.9333903789520264,
3.9527390003204346,
3.84543514251709
])

np.float64(3.8925026546825063)

# Generator with dropout

In [17]:
model_dropout = load_model("generator_dropout.keras")
model_dropout.summary()

In [18]:
np.vectorize(lambda x: encode_generate_sequence(model_dropout, x))(first_words[:10])

3.9579155445098877
3.843723773956299
4.148249626159668
3.9399075508117676
4.045276165008545
4.342906951904297
4.11477255821228
3.906012773513794
3.9440577030181885
4.393810033798218
4.4377121925354


array(['Munich state Democration of the refugee to state Democration of the refugee to state Democration of the ref',
       'Factbox: Trump says he will not to state to state to state to state to state to state to state to state to st',
       'Catalan says he will not to Trump says he will not to Trump says he will not to Trump says he will not to Tr',
       'newsbud to state party says he will not to Trump says he will not to Trump says he will not to Trump says he',
       'BUSTED: Trump says he will not to Trump says he will not to Trump says he will not to Trump says he will not',
       'JAMES SHOCKING STATE THE PRESIDENT THE BREAKING STATE THE PRESIDENT THE BREAKING STATE THE PRESIDENT THE B',
       'Japan says Trump says Trump says Trump says Trump says Trump says Trump says Trump says Trump says Trump s',
       'What House to state to state to state to state to state to state to state to state to state to state to s',
       'War will state trave to Senate trave to Senate 

In [19]:
np.mean([
4.105676651000977,
3.906489133834839,
3.9925618171691895,
3.935142993927002,
4.240764379501343,
3.926100015640259,
4.349726676940918,
4.393689393997192,
3.9914186000823975,
4.023019075393677,
4.024837493896484
])

np.float64(4.080856930125844)

## Generator with dropout more complex 

In [20]:
model_dropout_complex = load_model("generator_dropout_complex.keras")
model_dropout_complex.summary()

In [21]:
np.vectorize(lambda x: encode_generate_sequence(model_dropout_complex, x))(first_words[:10])

4.149566888809204
3.9595963954925537
4.109914541244507
3.983436107635498
3.9817748069763184
4.228386640548706
4.270430564880371
4.262691497802734
4.03720760345459
4.1197381019592285
4.112939119338989


array(['Munich tour state Department to stand of the protest the protest the protest the protest the protest the pr',
       'Factbox: Trump says he will not in the state of the president to state and trump to state and trump to state ',
       'Catalan leader Sanders say the pro-top congress to state of the Senate Republicans to stop the trump to stat',
       'newsbud are to Senate Republicans to stop the trump to state of the Senate Republicans to stop the trump to ',
       'BUSTED: Trump says he will not the media says he will not the media says he will not the media says he will ',
       'JAMES CAMPANGER CALIST AND SHOCKING COMMUNIT TO THE SECRET PRESIDENT STATE DONT STATE DONT STATE DONT STAT',
       'Japan says Trump says Trump says Trump says Trump says Trump says Trump says Trump says Trump says Trump s',
       'What Trump state of the U.S. state of the U.S. state of the U.S. state of the U.S. state of the U.S. stat',
       'War of Trump at Trump at Trump at Trump at Trum

In [22]:
np.mean([
3.907470703125,
3.993860960006714,
4.064735412597656,
4.2148449420928955,
4.136392831802368,
4.046360731124878,
3.999328851699829,
4.1442999839782715,
4.346022367477417,
4.159832239151001,
4.3394365310668945
])

np.float64(4.122962323102084)

## Generator bidirectional

In [23]:
model_bidirect = load_model("generator_bidirect.keras")
model_bidirect.summary()

In [24]:
np.vectorize(lambda x: encode_generate_sequence(model_bidirect, x))(first_words[:10])

4.20767068862915
3.9973371028900146
4.1164960861206055
3.9410059452056885
4.027256965637207
3.9092938899993896
3.9169347286224365
3.929922580718994
4.295206308364868
4.070997476577759
4.06782078742981


array(['Munich back to be to readers to be to readers to be to readers to be to readers to be to readers to be to r',
       'Factbox: Senial State to be to be to be to be to be to be to be to be to be to be to be to be to be to be to ',
       'Catalan of to readers to be to readers to be to readers to be to readers to be to readers to be to readers t',
       'newsbud of to be to readers to be to readers to be to readers to be to readers to be to readers to be to rea',
       'BUSTED: Belie Saria to be to readers to be to readers to be to readers to be to readers to be to readers to ',
       'JAMES BOLDER Comey to be to be to be to be to be to be to be to be to be to be to be to be to be to be to ',
       'Japan to be to be to be to be to be to be to be to be to be to be to be to be to be to be to be to be to b',
       'What walthere to be to be to be to be to be to be to be to be to be to be to be to be to be to be to be t',
       'War How Seniale to  to  to  to  to  to  to  to 

In [25]:
np.mean([
4.013622760772705,
3.885849714279175,
4.026196718215942,
3.85374116897583,
3.8427443504333496,
3.981524705886841,
4.048984527587891,
4.043240785598755,
4.0228071212768555,
3.9970905780792236,
4.017746925354004,
])

np.float64(3.9757772142236885)

## Generator Double LSTM

In [26]:
model_double_LSTM = load_model("generator_double_LSTM.keras")
model_double_LSTM.summary()

In [27]:
np.vectorize(lambda x: encode_generate_sequence(model_double_LSTM, x))(first_words[:10])

4.025583982467651
4.039228916168213
4.200617551803589
3.9037890434265137
3.917276620864868
4.038550138473511
4.0751953125
4.085265874862671
4.156601667404175
3.972583770751953
3.9660518169403076


array(['Munich says will not the president to stop to star and the president to stop to star and the president to s',
       'Factbox: Trump to stop Trump says will not the U.S. says will not the U.S. says will not the U.S. says will n',
       'Catalan president of the New York to star and the president of the New York to star and the president of the',
       'newsbud and the president of the New York to star and the president of the New York to star and the presiden',
       'BUSTED: Trump to star and the president of the New York to star and the president of the New York to star an',
       'JAMES AND Obama says Trump says Trump says Trump says Trump says Trump says Trump says Trump says Trump sa',
       'Japan says Trump says Trump says Trump says Trump says Trump says Trump says Trump says Trump says Trump s',
       'What the New York to are Trump says with the New York to are Trump says with the New York to are Trump sa',
       'War the preside the preside the preside the pre

## Generator Attention

In [19]:
model_attention = load_model("generator_attention.keras")
model_attention.summary()

In [38]:
def generate_sequence_attention(model, start_sequence, seq_length, decode, encode):
    encoded = encode(start_sequence)
    if encoded.shape[0] < 124:
        padded_input = np.zeros((124, encoded.shape[1])) 
        padded_input[-encoded.shape[0]:] = encoded  
        current_input = np.copy(padded_input)
    else:
        current_input = np.copy(encoded)
    generated_sequence = start_sequence
    for _ in range(seq_length):
        predictions = model.predict(current_input[np.newaxis, ...], verbose=0)
        next_char_idx = np.argmax(predictions[0, -1, :])
        next_char_onehot = np.zeros_like(current_input[0])
        next_char_onehot[next_char_idx] = 1
        current_input = np.roll(current_input, shift=-1, axis=0)
        current_input[-1] = next_char_onehot  
        generated_sequence += decode(next_char_onehot[np.newaxis, ...])
    return generated_sequence

In [39]:
generated_text = generate_sequence_attention(model_attention, first_words[2], SEQ_SIZE, decode, encode)
print(generated_text)

Did <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
