In [1]:
import pandas as pd
import numpy as np

# Read the cleaned poems and discard the 'Unnamed: 0' and 'labels' columns.
df = pd.read_csv('/kaggle/input/cleaned/cleaned_classification.csv').drop(
    columns=['Unnamed: 0', 'labels']
)
total_rows = len(df)
# Keep only the second half of the rows, then remove rows with missing values.
df = df.iloc[total_rows // 2:].dropna()
df

Unnamed: 0,poem
7146,lonely market dawn sun begins rise slowly make...
7147,street every town owns house called lonely pas...
7148,ran towards face huge glee looked mummy plead ...
7149,lonely heart sets table milly betty mabel tea ...
7150,im field roaming vast prairie waiting somebody...
...,...
14287,may roam world like child feast sips sweet fli...
14288,born world everyone laughed cried conduct mann...
14289,cast look around world marvel see hear ponder ...
14290,world take good notice silver stars fading mil...


In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer 

def tokenizer_sequence(df, training_length):
    """Tokenize the poems and slice them into fixed-length next-word examples.

    Only poems containing at least ``training_length * 2 - 1`` spaces are
    kept. Each kept poem is converted to a list of integer word indices, and
    every window of ``training_length`` consecutive indices becomes one
    feature row whose label is the index that immediately follows the window.

    Args:
        df: DataFrame with a string column named ``"poem"``.
        training_length: Number of word indices in each feature window.

    Returns:
        Tuple ``(training_seq, labels, num_words, sequences, idx_word)``:
        the feature windows, their labels, the vocabulary size plus one,
        the full index sequence of every kept poem, and the tokenizer's
        index-to-word mapping.
    """
    min_spaces = training_length * 2 - 1
    long_poems = df[df["poem"].str.count(' ') >= min_spaces]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(long_poems["poem"])
    idx_word = tokenizer.index_word
    # One extra slot beyond the entries of index_word.
    num_words = len(idx_word) + 1

    sequences = tokenizer.texts_to_sequences(long_poems["poem"])

    training_seq, labels = [], []
    for tokens in sequences:
        for target_pos in range(training_length, len(tokens)):
            # Features: the training_length indices before target_pos; label: the index at target_pos.
            training_seq.append(tokens[target_pos - training_length:target_pos])
            labels.append(tokens[target_pos])

    print(f'There are {len(training_seq)} training sequences.')

    return training_seq, labels, num_words, sequences, idx_word

2024-05-15 08:52:14.025143: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-15 08:52:14.025287: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-15 08:52:14.168477: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Each training example is a window of 50 word indices; its label is the index that follows.
training_seq, labels, num_words, sequences, idx_word = tokenizer_sequence(
    df, training_length=50
)

There are 346440 training sequences.


In [4]:
from sklearn.utils import shuffle

def create_train_valid(X, y, num_words, factor):
    """Shuffle the examples, split them in two, and one-hot encode the labels.

    Args:
        X: Sequence of feature rows.
        y: Sequence of integer labels aligned with ``X``.
        num_words: Number of columns in the one-hot label matrices.
        factor: Fraction of the shuffled examples placed in the first split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays. The
        label arrays are ``int8`` matrices with one row per example and a
        single 1 in the column given by that example's label.
    """
    import gc

    # Fixed random_state keeps the shuffle (and therefore the split) repeatable.
    X, y = shuffle(X, y, random_state=42)
    split_at = int(factor * len(y))

    X_train, X_test = np.array(X[:split_at]), np.array(X[split_at:])

    def one_hot(word_indices):
        # Dense int8 matrix: one row per label, a 1 in the label's column.
        encoded = np.zeros((len(word_indices), num_words), dtype=np.int8)
        for row, column in enumerate(word_indices):
            encoded[row, column] = 1
        return encoded

    y_train = one_hot(y[:split_at])
    y_test = one_hot(y[split_at:])

    # Memory management: release the local references to the shuffled inputs.
    gc.enable()
    del X, y
    gc.collect()

    return X_train, X_test, y_train, y_test

In [5]:
# The first 60% of the shuffled examples form the training split; the rest are held out.
X_train, X_test, y_train, y_test = create_train_valid(
    training_seq, labels, num_words, factor=0.6
)
print(X_train.shape, y_train.shape, sep='\n')

(207864, 50)
(207864, 41629)


In [6]:
import sys
def check_sizes(gb_min=1):
    """Print every module-level object whose reported size exceeds ``gb_min`` GB.

    Sizes are whatever ``sys.getsizeof`` reports for the object itself,
    divided by 1e9.

    Args:
        gb_min: Threshold in gigabytes; only larger objects are printed.
    """
    # Iterate over a snapshot of (name, value) pairs instead of eval()-ing each
    # key: eval raises on non-string keys and would resolve names such as
    # ``size`` or ``gb_min`` to this function's locals rather than the globals.
    for name, value in list(globals().items()):
        if not isinstance(name, str):
            # Not a variable name, so there is nothing meaningful to report.
            continue
        size = sys.getsizeof(value) / 1e9
        if size > gb_min:
            print(f'Object: {name:10}\tSize: {size} GB.')


# List every notebook-level object whose reported size is above 1 GB.
check_sizes(gb_min=1)

Object: y_train   	Size: 8.653170584 GB.
Object: y_test    	Size: 5.768780432 GB.


In [7]:
import pickle

def save_intermediate_results(data):
    """Pickle module-level variables to ``<name>.pkl`` files in the working directory.

    Args:
        data: Iterable of variable names; each name is looked up in
            ``globals()`` and its value is written to its own file.
    """
    namespace = globals()
    for name in data:
        with open(f'{name}.pkl', 'wb') as handle:
            pickle.dump(namespace[name], handle)
            
# Names of the notebook variables to pickle, one file per name.
data = [
    'num_words',
    'sequences',
    'idx_word',
    'training_seq',
    'labels',
    'X_test',
    'y_test',
]
save_intermediate_results(data)

In [8]:
import gc
gc.enable()
# Drop the notebook's references to these objects before training.
del sequences, labels, X_test, y_test, idx_word
gc.collect()

0

In [9]:
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from keras.optimizers import Adam

from keras.utils import plot_model

In [10]:
def make_word_level_model(num_words, lstm_cells=128, trainable=True, lstm_layers=1,
                          bi_direc=False, training_length=50):
    """Build and compile a next-word prediction network.

    The network is an embedding layer, ``lstm_layers`` recurrent layers and a
    softmax output over the vocabulary. ``trainable``, ``lstm_layers`` and
    ``bi_direc`` were previously accepted but ignored (the model was always a
    single bidirectional LSTM with a trainable embedding); they now take
    effect, so pass ``bi_direc=True`` to keep the bidirectional architecture.

    Args:
        num_words: Vocabulary size; used as the embedding input dimension and
            as the number of output units.
        lstm_cells: Number of units in each LSTM layer.
        trainable: Whether the embedding weights are updated during training.
        lstm_layers: Number of stacked recurrent layers (at least 1).
        bi_direc: Wrap every LSTM layer in ``Bidirectional`` when true.
        training_length: Length of each input sequence, passed to the
            embedding layer as ``input_length``.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``lstm_layers`` is smaller than 1.
    """
    if lstm_layers < 1:
        raise ValueError('lstm_layers must be at least 1')

    model = Sequential()
    model.add(Embedding(input_dim=num_words, output_dim=100,
                        input_length=training_length, trainable=trainable))

    for layer_no in range(lstm_layers):
        # Every layer except the last must emit the full sequence so that the
        # next recurrent layer receives one vector per timestep.
        is_last = layer_no == lstm_layers - 1
        recurrent = LSTM(lstm_cells, return_sequences=not is_last,
                         dropout=0.1, recurrent_dropout=0.1)
        model.add(Bidirectional(recurrent) if bi_direc else recurrent)

    model.add(Dense(num_words, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [11]:
# Size the model from the fitted vocabulary (`num_words`, which also sets the
# width of y_train) rather than a hard-coded count that goes stale when the
# data changes.
rnn = make_word_level_model(num_words, trainable=True, lstm_layers=1, bi_direc=True)
# Leave the batch dimension unspecified; only the sequence length is fixed.
rnn.build((None, X_train.shape[1]))
rnn.summary()



In [13]:
# Fit on the training split only; no validation data is passed to fit().
history = rnn.fit(x=X_train, y=y_train, batch_size=2048, epochs=150, verbose=1)

Epoch 1/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 290ms/step - accuracy: 0.0193 - loss: 9.7315
Epoch 2/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 290ms/step - accuracy: 0.0198 - loss: 8.5508
Epoch 3/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 287ms/step - accuracy: 0.0358 - loss: 8.4209
Epoch 4/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 288ms/step - accuracy: 0.0424 - loss: 8.3235
Epoch 5/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 288ms/step - accuracy: 0.0484 - loss: 8.2136
Epoch 6/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 289ms/step - accuracy: 0.0509 - loss: 8.0850
Epoch 7/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 288ms/step - accuracy: 0.0519 - loss: 7.9550
Epoch 8/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 288ms/step - accuracy: 0.0540 - loss: 7.8072
Epoch 9/

In [14]:
from IPython.display import HTML
import re

def header(text, color='black'):
    """Return ``text`` as a centered ``<h1>`` HTML string drawn in ``color``."""
    return f'<h1 style="color: {color};"><center>{text!s}</center></h1>'


def box(text):
    """Return ``text`` wrapped in a bordered, padded ``<div>`` HTML string."""
    opening = '<div style="border:1px inset black;padding:1em;font-size: 20px;">'
    return ''.join((opening, str(text), '</div>'))


def addContent(old_html, raw_html):
    """Append ``raw_html`` to ``old_html`` with ``+=`` and return the result."""
    combined = old_html
    combined += raw_html
    return combined

def remove_spaces(patent):
    """Remove any whitespace that directly precedes ``.``, ``,``, ``;`` or ``?``."""
    return re.sub(r'\s+([.,;?])', r'\1', patent)

def load_values(data):
    """Unpickle ``<name>.pkl`` for each name, rebind it as a global, and return the values.

    Args:
        data: Iterable of variable names; each one identifies the pickle
            file to read and the module-level name to assign.

    Returns:
        List of the loaded objects, in the same order as ``data``.
    """
    namespace = globals()
    loaded = []
    for name in data:
        key = f'{name}'
        # NOTE(review): pickle.load can execute arbitrary code; only read
        # files that this notebook wrote itself.
        with open(f'{name}.pkl', 'rb') as handle:
            namespace[key] = pickle.load(handle)
            print(type(namespace[key]))
            loaded.append(namespace[key])
    return loaded

In [15]:
# load_values returns the objects in the same order as the names in `data`, so
# unpack them directly. Subscripting globals() with a tuple of names would only
# create a single tuple-keyed entry, not seven variables.
num_words, sequences, idx_word, training_seq, labels, X_test, y_test = load_values(data)

<class 'int'>
<class 'list'>
<class 'dict'>
<class 'list'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [16]:
import random


def generate_output(model, sequences, training_length=50, new_words=50, diversity=1,
                    return_output=False, n_gen=1, idx_word=None):
    """Sample continuations of a random seed window and pair them with the real text.

    A sequence is drawn at random from those long enough to hold a seed
    window, a random window of ``training_length`` indices is taken from it,
    and the model is asked ``new_words`` times for a next-word distribution
    from which one index is sampled. The window slides forward by one index
    after every sample.

    Args:
        model: Object with a ``predict(array, verbose=0)`` method that returns
            one probability vector per input row.
        sequences: List of word-index lists to draw the seed from.
        training_length: Number of indices in the seed window.
        new_words: Number of indices to sample per generated continuation.
        diversity: Positive temperature; log-probabilities are divided by it
            before re-normalizing.
        return_output: When true, return word lists instead of HTML strings.
        n_gen: Number of independent continuations sampled from the same seed.
        idx_word: Mapping from word index to word. When omitted, the
            module-level ``idx_word`` variable is used.

    Returns:
        ``(original_sequence, gen_list, a)`` when ``return_output`` is true:
        the seed words, one word list per continuation, and the words that
        really follow the seed. Every continuation and ``a`` start with the
        ``'< --- >'`` placeholder produced by the ``'#'`` separator, and each
        continuation is cut to the length of ``a``.
        Otherwise ``(seed_html, gen_html, a_html)``, HTML strings for the
        seed, the first continuation and the actual text.

    Raises:
        ValueError: If ``diversity`` is not positive, ``n_gen`` is below 1, or
            no sequence has at least ``training_length + 10`` indices.
        NameError: If ``idx_word`` is omitted and no module-level ``idx_word``
            exists.
    """
    if idx_word is None:
        # Backward-compatible fallback to the notebook-level vocabulary mapping.
        try:
            idx_word = globals()['idx_word']
        except KeyError:
            raise NameError(
                "idx_word was not passed and no global 'idx_word' is defined"
            ) from None
    if diversity <= 0:
        raise ValueError('diversity must be positive')
    if n_gen < 1:
        raise ValueError('n_gen must be at least 1')

    # The random start index is drawn from [0, len(seq) - training_length - 10],
    # so shorter sequences would make that range empty; exclude them up front.
    min_length = training_length + 10
    candidates = [s for s in sequences if len(s) >= min_length]
    if not candidates:
        raise ValueError(
            f'no sequence has at least {min_length} tokens '
            f'(training_length={training_length} plus a margin of 10)'
        )

    seq = random.choice(candidates)
    seed_idx = random.randint(0, len(seq) - training_length - 10)
    end_idx = seed_idx + training_length

    seed_tokens = list(seq[seed_idx:end_idx])
    original_sequence = [idx_word[i] for i in seed_tokens]

    placeholder = '< --- >'
    # '#' separates seed from continuation; any key missing from idx_word
    # (this marker included) is rendered as the placeholder.
    separator = '#'

    actual_tokens = [separator] + list(seq[end_idx:end_idx + new_words])
    a = [idx_word.get(tok, placeholder) for tok in actual_tokens]

    gen_list = []
    for _generation in range(n_gen):
        window = seed_tokens[:]
        sampled = []
        for _step in range(new_words):
            preds = model.predict(
                np.array(window).reshape(1, -1), verbose=0)[0].astype(np.float64)
            # Zero probabilities map to -inf and back to 0; silence the warning only.
            with np.errstate(divide='ignore'):
                scaled = np.log(preds) / diversity
            exp_preds = np.exp(scaled)

            # Softmax
            preds = exp_preds / exp_preds.sum()

            probas = np.random.multinomial(1, preds, 1)[0]
            next_idx = int(np.argmax(probas))
            window = window[1:] + [next_idx]
            sampled.append(next_idx)

        words = [idx_word.get(tok, placeholder) for tok in [separator] + sampled]
        # Trim to the length of the actual continuation so the two line up.
        gen_list.append(words[:len(a)])

    if return_output:
        return original_sequence, gen_list, a

    # HTML formatting
    seed_html = header('Seed Sequence', color='darkblue') + \
        box(remove_spaces(' '.join(original_sequence)))
    gen_html = header('RNN Generated', color='darkred') + \
        box(remove_spaces(' '.join(gen_list[0])))
    a_html = header('Actual', color='darkgreen') + \
        box(remove_spaces(' '.join(a)))

    return seed_html, gen_html, a_html

In [17]:
# Seed window length of 50 indices.
seed_html, gen_html, a_html = generate_output(
    rnn, sequences, training_length=50
)

In [18]:
# Render the seed words that were fed to the model.
HTML(seed_html)

In [19]:
# Render the first continuation sampled from the model.
HTML(gen_html)

In [20]:
# Render the words that actually follow the seed in the source sequence.
HTML(a_html)