In [1]:
import pandas as pd
import os
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

2024-04-06 19:55:38.627231: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the joke corpus from the CSV in the working directory and preview the
# first rows.
JOKES_CSV = 'jokes.csv'
jokes_df = pd.read_csv(JOKES_CSV)
jokes_df.head()

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [3]:
# Report the DataFrame dimensions (rows, columns) from a single shape lookup.
record_count, field_count = jokes_df.shape
print("Number of records: ", record_count)
print("Number of fields: ", field_count)

Number of records:  100000
Number of fields:  2


In [4]:
# Display the raw joke text column (pandas truncates the long Series itself).
jokes_df.loc[:, 'Joke']


0        [me narrating a documentary about narrators] "...
1        Telling my daughter garlic is good for you. Go...
2        I've been going through a really rough period ...
3        If I could have dinner with anyone, dead or al...
4           Two guys walk into a bar. The third guy ducks.
                               ...                        
99995    Every time I walk into a singles bar I can hea...
99996    how wide is the universe? how long is a piece ...
99997    A man goes to a halloween party wearing nothin...
99998                           I don't Bolivia Peru-v it.
99999    What's the world's longest Ted Talk? How I Met...
Name: Joke, Length: 100000, dtype: object

In [5]:
# Replace non-breaking spaces (U+00A0) and hair spaces (U+200A) in every joke
# with ordinary spaces. The vectorised `.str` accessor handles both characters
# in a single pass instead of two Python-level `apply` loops, and it leaves
# missing values as NaN rather than raising AttributeError inside a lambda.
jokes_df['Joke'] = jokes_df['Joke'].str.replace('[\xa0\u200a]', ' ', regex=True)

![Tokenization diagram](Tokenization.png "Tokenization")


In [7]:
# Build the vocabulary over every joke. Words that are not found in the fitted
# word_index are represented by the '<oov>' token.
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(jokes_df['Joke'])

# NOTE(review): the +1 presumably leaves room for id 0, the value used for
# padding later on — confirm.
word_to_id = tokenizer.word_index
total_words = len(word_to_id) + 1

print("Total number of words: ", total_words)
print("Word: ID")
print("------------")
# Spot-check a few entries of the word -> id mapping.
for shown_as, vocab_word in (
    ("<oov>: ", '<oov>'),
    ("Strong: ", 'strong'),
    ("And: ", 'and'),
    ("Consumption: ", 'consumption'),
):
    print(shown_as, word_to_id[vocab_word])

Total number of words:  46924
Word: ID
------------
<oov>:  1
Strong:  1370
And:  7
Consumption:  10339


In [8]:
# Expand every joke into its growing token prefixes (first 2 tokens, first 3,
# ..., the whole joke). Jokes that tokenize to fewer than 2 tokens contribute
# nothing.
input_sequences = []
for joke_text in jokes_df['Joke']:
    token_ids = tokenizer.texts_to_sequences([joke_text])[0]
    input_sequences.extend(
        token_ids[:prefix_len] for prefix_len in range(2, len(token_ids) + 1)
    )

print("Total input sequences: ", len(input_sequences))

Total input sequences:  1660051


In [9]:
# Pad every n-gram prefix at the front ('pre') up to the length of the longest
# one, so all sequences share one length and the final token of each prefix
# stays in the last column.
# pad_sequences already returns a NumPy array, so the result is not wrapped in
# np.array(): that wrapper would make a second full copy of the padded data.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         15, 8702,    2], dtype=int32)

In [10]:
# Split each padded sequence: every column except the last is the model input,
# and the last column is the label.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]


In [14]:
# Build, compile and train the model. Relies on total_words, xs and labels
# being defined by the preceding cells.
model = Sequential()
# NOTE(review): an embedding output dimension of 2 is very small for a
# vocabulary of total_words entries — confirm this is intentional.
model.add(Embedding(total_words, 2))  # Removed input_length argument
model.add(Bidirectional(LSTM(64)))
# One softmax output per vocabulary id.
model.add(Dense(total_words, activation='softmax'))

# Hand the configured optimizer object to compile(). Passing the string 'adam'
# instead would build a default-configured Adam and silently ignore the
# learning rate chosen here.
adam = Adam(learning_rate=0.01)
# Labels are integer ids (not one-hot vectors), hence the sparse loss variant.
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(xs, labels, epochs=1, batch_size=128)  # Adjust epochs and batch_size as needed

# Print model summary
model.summary()

2024-04-06 17:21:50.769813: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 378491628 exceeds 10% of free system memory.


In [None]:
# Sanity-check the training arrays: xs is 2-D (one row per sample) and labels
# is 1-D with one entry per sample.
for training_array in (xs, labels):
    print(training_array.shape)


(1660051, 57)
(1660051,)


In [None]:
def data_generator(sequence_data, batch_size, drop_remainder=True):
    """Yield (inputs, labels) batches from padded sequences, cycling forever.

    Args:
        sequence_data: 2-D array-like supporting ``len()`` and NumPy-style
            slicing (``data[a:b]`` and ``batch[:, :-1]``). In each row the
            last column is the label and the preceding columns are the input.
        batch_size: Number of rows per batch; must be a positive integer.
        drop_remainder: When True (the default), trailing rows that do not
            fill a complete batch are skipped on every pass. When False they
            are yielded as a final, smaller batch.

    Yields:
        Tuples ``(xs, labels)`` where ``xs`` is the batch without its last
        column and ``labels`` is the last column. Labels are left as integer
        ids, suitable for a sparse categorical cross-entropy loss.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if the data cannot
            produce a single batch. Because this is a generator, the error
            surfaces on the first ``next()`` call, not at construction.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    num_samples = len(sequence_data)
    if drop_remainder:
        num_batches = num_samples // batch_size
    else:
        num_batches = -(-num_samples // batch_size)  # ceiling division

    # Without this guard the infinite loop below would spin forever without
    # ever yielding, hanging whatever is consuming the generator.
    if num_batches == 0:
        raise ValueError(
            f"cannot form a batch of size {batch_size} from {num_samples} sequences"
        )

    while True:  # Loop forever, so the generator never terminates
        for batch_idx in range(num_batches):
            start = batch_idx * batch_size
            # Slicing clips at the end of the data, which is what produces the
            # smaller final batch when drop_remainder is False.
            sequences = sequence_data[start:start + batch_size]
            yield sequences[:, :-1], sequences[:, -1]

In [None]:
# Recompile the existing model (note: no metrics are requested here).
# `model` must already be defined in the running kernel by the model-building
# cell; executing this cell on its own raises NameError.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
)


NameError: name 'model' is not defined