In [1]:
import pandas as pd

In [2]:
import csv
import pandas as pd

# Paths of the raw CSV and of the cleaned copy written next to it.
csv_file_path = 'shortjokes.csv'
cleaned_csv_file_path = 'cleaned_shortjokes.csv'

# 1-based number of the CSV record to drop (record 1 is the first row of the
# file). This is the single row the notebook treats as problematic.
BAD_ROW_NUMBER = 141577

# Stream records straight from the reader to the writer instead of collecting
# them all in a list first. Both files are opened with newline='' as the csv
# module requires, so line breaks embedded in quoted fields survive untouched.
with open(csv_file_path, 'r', encoding='utf-8', newline='') as source_file, \
        open(cleaned_csv_file_path, 'w', encoding='utf-8', newline='') as cleaned_file:
    reader = csv.reader(source_file)
    writer = csv.writer(cleaned_file)
    writer.writerows(
        row
        for row_num, row in enumerate(reader, start=1)
        if row_num != BAD_ROW_NUMBER
    )

# Now, read the cleaned CSV file using pandas
jokes_df = pd.read_csv(cleaned_csv_file_path)


In [3]:
# Preview the first rows to confirm the cleaned file parsed into the expected columns.
jokes_df.head()

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [4]:
# Train on the first 5000 jokes only, glued into one newline-separated corpus
# string (one joke per line).
first_jokes = jokes_df['Joke'].iloc[:5000]
text = '\n'.join(first_jokes)

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [6]:
# Keras text tokenizer created with its default settings.
tokenizer = Tokenizer()

In [7]:
# Build the word -> integer-id vocabulary (tokenizer.word_index) from the whole
# corpus, passed in as a single document.
tokenizer.fit_on_texts([text])

In [8]:
# Convert every joke (one per line of `text`) into its sequence of word ids,
# then emit every prefix of length >= 2 so that each training sample reads as
# "context tokens followed by the next token to predict".
tokenized_sentences = tokenizer.texts_to_sequences(text.split('\n'))
input_sequences = [
  token_ids[:end]
  for token_ids in tokenized_sentences
  for end in range(2, len(token_ids) + 1)
]
input_sequences

[[14, 4924],
 [14, 4924, 1],
 [14, 4924, 1, 2470],
 [14, 4924, 1, 2470, 46],
 [14, 4924, 1, 2470, 46, 4925],
 [14, 4924, 1, 2470, 46, 4925, 3],
 [14, 4924, 1, 2470, 46, 4925, 3, 83],
 [14, 4924, 1, 2470, 46, 4925, 3, 83, 118],
 [14, 4924, 1, 2470, 46, 4925, 3, 83, 118, 11],
 [14, 4924, 1, 2470, 46, 4925, 3, 83, 118, 11, 136],
 [14, 4924, 1, 2470, 46, 4925, 3, 83, 118, 11, 136, 335],
 [14, 4924, 1, 2470, 46, 4925, 3, 83, 118, 11, 136, 335, 1659],
 [14, 4924, 1, 2470, 46, 4925, 3, 83, 118, 11, 136, 335, 1659, 35],
 [14, 4924, 1, 2470, 46, 4925, 3, 83, 118, 11, 136, 335, 1659, 35, 348],
 [429, 8],
 [429, 8, 555],
 [429, 8, 555, 4926],
 [429, 8, 555, 4926, 10],
 [429, 8, 555, 4926, 10, 102],
 [429, 8, 555, 4926, 10, 102, 18],
 [429, 8, 555, 4926, 10, 102, 18, 5],
 [429, 8, 555, 4926, 10, 102, 18, 5, 102],
 [429, 8, 555, 4926, 10, 102, 18, 5, 102, 4927],
 [429, 8, 555, 4926, 10, 102, 18, 5, 102, 4927, 1448],
 [429, 8, 555, 4926, 10, 102, 18, 5, 102, 4927, 1448, 6],
 [429, 8, 555, 4926, 10, 

In [9]:
# Length of the longest prefix; every sequence is padded up to this size.
max_len_for_padding = max(map(len, input_sequences))
max_len_for_padding

43

In [10]:
from tensorflow.keras.utils import pad_sequences

In [11]:
# Left-pad with zeros so all rows share one length and the token to predict
# always sits in the last column.
padded_sequences = pad_sequences(
    input_sequences,
    maxlen=max_len_for_padding,
    padding='pre',
)
padded_sequences

array([[    0,     0,     0, ...,     0,    14,  4924],
       [    0,     0,     0, ...,    14,  4924,     1],
       [    0,     0,     0, ...,  4924,     1,  2470],
       ...,
       [    0,     0,     0, ...,  1307,  4300,   888],
       [    0,     0,     0, ...,  4300,   888,   918],
       [    0,     0,     0, ...,   888,   918, 10500]], dtype=int32)

In [12]:
# The last column is the next-word label; everything before it is the model input.
X, y = padded_sequences[:, :-1], padded_sequences[:, -1]

In [13]:
# Inspect the input matrix: each padded sequence without its final token.
X

array([[   0,    0,    0, ...,    0,    0,   14],
       [   0,    0,    0, ...,    0,   14, 4924],
       [   0,    0,    0, ...,   14, 4924,    1],
       ...,
       [   0,    0,    0, ...,    8, 1307, 4300],
       [   0,    0,    0, ..., 1307, 4300,  888],
       [   0,    0,    0, ..., 4300,  888,  918]], dtype=int32)

In [14]:
# Inspect the labels: the final token id of each padded sequence.
y

array([ 4924,     1,  2470, ...,   888,   918, 10500], dtype=int32)

In [15]:
# (number of samples, max_len_for_padding - 1)
X.shape

(82904, 42)

In [16]:
# One integer label per sample at this point (before one-hot encoding).
y.shape

(82904,)

In [17]:
# Number of distinct words in the fitted vocabulary. Later cells use
# `total_words + 1` as the class count because id 0 is taken by the padding.
total_words = len(tokenizer.word_index)

In [18]:
from keras.utils import to_categorical

In [19]:
# One-hot encode the integer labels to match the softmax output and the
# categorical_crossentropy loss used by the model cells.
#
# The cell overwrites `y`, so it is guarded on the array rank: the integer
# labels are 1-D, the one-hot matrix is 2-D. Re-running the cell is then a
# no-op instead of one-hot encoding an already one-hot array.
#
# NOTE(review): the result has one column per class (vocabulary size + 1), so
# it is large. Integer labels with sparse_categorical_crossentropy would avoid
# materialising it, but that needs the compile cell changed at the same time.
if y.ndim == 1:
    y = to_categorical(y, num_classes=total_words + 1)
y.shape

(82904, 10501)

In [20]:
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential

In [21]:
# Next-word model: 100-dimensional word embeddings -> a single 150-unit LSTM ->
# softmax over every word id (plus the padding id 0).
model = Sequential([
    Embedding(total_words + 1, 100, input_length=max_len_for_padding - 1),
    LSTM(150),
    Dense(total_words + 1, activation='softmax'),
])

In [22]:
# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [23]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 42, 100)           1050100   
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 10501)             1585651   
                                                                 
Total params: 2786351 (10.63 MB)
Trainable params: 2786351 (10.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# Train for up to 50 epochs. The recorded run below was stopped by hand
# (KeyboardInterrupt) before reaching the end.
model.fit(x=X, y=y, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50

KeyboardInterrupt: 

In [26]:
import numpy as np

In [27]:
# Greedy text generation: start from a seed, repeatedly ask the model for the
# most likely next word and append it, for at most `n` words.
n = 100
test_text = "I"
for i in range(n):
  tokenized_text = tokenizer.texts_to_sequences([test_text])[0]
  padded_tok_text = pad_sequences([tokenized_text], maxlen=max_len_for_padding-1, padding='pre')
  # verbose=0 keeps the per-call progress output of predict() out of the cell.
  word_index = int(np.argmax(model.predict(padded_tok_text, verbose=0)))
  # index_word is the tokenizer's own id -> word map, so no scan over
  # word_index.items() is needed for every generated word.
  word = tokenizer.index_word.get(word_index)
  if word is None:
    # The predicted id has no word (e.g. the padding id 0). The input would
    # stay unchanged, so every further iteration would predict the same id.
    break
  test_text = test_text + " " + word
print(test_text)

I like my women like i like my coffee i like i like my slaves free scared of my pants the other day i live for the other day they got me son nothing and i'm true i don't even get the same thing so i want to signal my roommate that i had no idea how to fold laundry and they call other you mean and i said i suppose you'll be buying them dinner or your doctor said i can do you want it in a hi fresh in the bed with my math family is not like dan


In [28]:
import pickle

# Persist the trained model to disk by pickling it.
# NOTE(review): the fitted tokenizer is not saved by this cell, yet the
# generation cell needs it to map words to ids and back.
with open("model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)

In [29]:
# Restore the model from the pickle written by the previous cell.
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load a model.pkl that you produced yourself or otherwise trust.
with open("model.pkl", mode="rb") as model_file:
    model = pickle.load(model_file)