In [1]:
%pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import os
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

2024-04-12 17:17:14.168272: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
jokes_df = pd.read_csv('jokes.csv')
jokes_df.head()

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [4]:
print("Number of records: ", jokes_df.shape[0])
print("Number of fields: ", jokes_df.shape[1])

Number of records:  231657
Number of fields:  2


In [5]:
jokes_df['Joke']


0         [me narrating a documentary about narrators] "...
1         Telling my daughter garlic is good for you. Go...
2         I've been going through a really rough period ...
3         If I could have dinner with anyone, dead or al...
4            Two guys walk into a bar. The third guy ducks.
                                ...                        
231652                  The Spicy Sausage by Delia Katessen
231653    TIL That I Shouldn't have gone to law school, ...
231654    What did the RAM stick say to the politician? ...
231655    what do you call a play about victorian era me...
231656    Calculus should be taught in every high school...
Name: Joke, Length: 231657, dtype: object

In [6]:

jokes_df['Joke'] = jokes_df['Joke'].apply(lambda x: x.replace(u'\xa0',u' '))
jokes_df['Joke'] = jokes_df['Joke'].apply(lambda x: x.replace('\u200a',' '))

In [7]:
tokenizer = Tokenizer(oov_token='<oov>') # For those words which are not found in word_index
tokenizer.fit_on_texts(jokes_df['Joke'])
total_words = len(tokenizer.word_index) + 1

print("Total number of words: ", total_words)
print("Word: ID")
print("------------")
print("<oov>: ", tokenizer.word_index['<oov>'])
print("Strong: ", tokenizer.word_index['strong'])
print("And: ", tokenizer.word_index['and'])
print("Consumption: ", tokenizer.word_index['consumption'])

Total number of words:  70650
Word: ID
------------
<oov>:  1
Strong:  1479
And:  7
Consumption:  9242


In [8]:
input_sequences = []
for line in jokes_df['Joke']:
    token_list = tokenizer.texts_to_sequences([line])[0]
    #print(token_list)
    
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# print(input_sequences)
print("Total input sequences: ", len(input_sequences))

Total input sequences:  3850485


In [9]:
# pad sequences 
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
input_sequences[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,   15, 9606,    2], dtype=int32)

In [10]:
# create features and label
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]


In [None]:
# Assuming total_words, max_sequence_len, xs, and ys are already defined
model = Sequential()
model.add(Embedding(total_words, 100))  # Removed input_length argument
model.add(Bidirectional(LSTM(64)))
model.add(Dense(total_words, activation='softmax'))
adam = Adam(learning_rate=0.01)  # Updated learning rate setting
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(xs, labels, epochs=3, batch_size=128)  

Epoch 1/3


2024-04-12 17:18:05.455255: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1416978480 exceeds 10% of free system memory.


[1m30082/30082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12126s[0m 403ms/step - accuracy: 0.1281 - loss: 6.2899
Epoch 2/3
[1m30082/30082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11904s[0m 396ms/step - accuracy: 0.2034 - loss: 5.3186
Epoch 3/3
[1m30082/30082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11956s[0m 397ms/step - accuracy: 0.2245 - loss: 5.0806


<keras.src.callbacks.history.History at 0x7fd8c0cfcb50>

In [27]:
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Example: Starting with two words
seed_text = "Donald trump"

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)
    predicted_index = np.argmax(predicted_probs, axis=-1)[0]
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted_index:
            output_word = word
            break
    seed_text += " " + output_word
    if output_word in ['.', '!', '?']:
        break

print(seed_text)


Donald trump is the best way to get a joke about the new barbie doll on the market
