In [102]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional


In [103]:
# opening eminem lyrics data set
def open_file(filename):
    with open(filename) as f:
        data = f.read()

    # Convert to lower case and save as a list
        corpus = data.lower().split("\n")

    return corpus

In [104]:
filename = "eminem_Lyrics.txt"
corpus = open_file(filename)

In [105]:
print(f"There are {len(corpus)} lines of lyrics\n")
print(f"The first 5 lines look like this:\n")
for i in range(5):
  print(corpus[i])

There are 507 lines of lyrics

The first 5 lines look like this:

now this shit's about to kick off this party looks wack
let's take it back to straight hip-hop and start it from scratch
i'm 'bout to bloody this track up, everybody get back
that's why my pen needs a pad cause my rhymes on the rage
just like i did with addiction i'm 'bout to kick it


In [106]:
# Initialize the Tokenizer class
tokenizer = Tokenizer()

# Generate the word index dictionary
tokenizer.fit_on_texts(corpus)

# Define the total words. You add 1 for the index `0` which is just the padding token.
total_words = len(tokenizer.word_index) + 1

print(f'word index dictionary: {tokenizer.word_index}')
print(f'total words: {total_words}')

word index dictionary: {'i': 1, 'a': 2, 'the': 3, 'you': 4, 'to': 5, 'and': 6, 'it': 7, "i'm": 8, 'my': 9, 'me': 10, 'in': 11, 'that': 12, 'like': 13, 'but': 14, 'your': 15, 'of': 16, 'doing': 17, 'get': 18, 'on': 19, 'so': 20, 'this': 21, 'with': 22, 'be': 23, "don't": 24, 'just': 25, 'go': 26, 'can': 27, 'pee': 28, 'as': 29, "you're": 30, 'back': 31, 'for': 32, 'make': 33, 'is': 34, 'got': 35, 'not': 36, 'know': 37, 'they': 38, "it's": 39, 'what': 40, 'way': 41, "ain't": 42, 'out': 43, 'say': 44, "'cause": 45, 'up': 46, 'rap': 47, 'was': 48, 'who': 49, 'never': 50, 'from': 51, 'when': 52, 'all': 53, 'at': 54, 'think': 55, 'now': 56, 'still': 57, 'down': 58, 'baby': 59, 'fuck': 60, 'if': 61, 'd': 62, 'are': 63, 'oh': 64, 'man': 65, 'see': 66, 'do': 67, 'off': 68, 'bitch': 69, 'girl': 70, 'have': 71, "lookin'": 72, 'boy': 73, 'everybody': 74, 'he': 75, 'about': 76, 'take': 77, "'em": 78, 'mean': 79, 'how': 80, 'feel': 81, 'am': 82, 'her': 83, 'really': 84, "i'll": 85, 'ass': 86, "that'

In [107]:
# Initialize the sequences list
input_sequences = []

# Loop over every line
for line in corpus:

	# Tokenize the current line
	token_list = tokenizer.texts_to_sequences([line])[0]

	# Loop over the line several times to generate the subphrases
	for i in range(1, len(token_list)):

		# Generate the subphrase
		n_gram_sequence = token_list[:i+1]

		# Append the subphrase to the sequences list
		input_sequences.append(n_gram_sequence)

# Get the length of the longest line
max_sequence_len = max([len(x) for x in input_sequences])

# Pad all sequences
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Create inputs and label by splitting the last token in the subphrases
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]

# Convert the label into one-hot arrays
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [108]:
# Get sample sentence
sentence = corpus[0].split()
print(f'sample sentence: {sentence}')

# Initialize token list
token_list = []

# Look up the indices of each word and append to the list
for word in sentence:
  token_list.append(tokenizer.word_index[word])

# Print the token list
print(token_list)

sample sentence: ['now', 'this', "shit's", 'about', 'to', 'kick', 'off', 'this', 'party', 'looks', 'wack']
[56, 21, 438, 76, 5, 152, 68, 21, 439, 261, 262]


In [115]:
from keras.layers import Dropout

# Hyperparameters
embedding_dim = 100
lstm_units = 20
learning_rate = 0.01

# Build the model
model = Sequential([
          Embedding(total_words, embedding_dim, input_length=max_sequence_len-1),
          Bidirectional(LSTM(lstm_units)),
		  Dropout(0.4),
		  Dense(728, activation='relu'),
          Dense(total_words, activation='softmax')
])

# Use categorical crossentropy because this is a multi-class problem
model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
    )

# Print the model summary
model.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 17, 100)           135300    
                                                                 
 bidirectional_11 (Bidirecti  (None, 40)               19360     
 onal)                                                           
                                                                 
 dropout_10 (Dropout)        (None, 40)                0         
                                                                 
 dense_21 (Dense)            (None, 728)               29848     
                                                                 
 dense_22 (Dense)            (None, 1353)              986337    
                                                                 
Total params: 1,170,845
Trainable params: 1,170,845
Non-trainable params: 0
___________________________________________

In [116]:
epochs = 80

# Train the model
history = model.fit(xs, ys, epochs=epochs)

Epoch 1/80


2022-05-13 03:01:27.845342: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-13 03:01:28.320124: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-13 03:01:28.331598: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-13 03:01:29.182004: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-13 03:01:29.202421: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [117]:
seed_text = "Girl i like you and you like me"
# Define total words to predict
next_words = 100


for _ in range(next_words):
	# Convert the text into sequences
	token_list = tokenizer.texts_to_sequences([seed_text])[0]
	# Pad the sequences
	token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
	# Get the probabilities of predicting a word
	predicted = model.predict(token_list, verbose=0)
	# Choose the next word based on the maximum probability
	predicted = np.argmax(predicted, axis=-1).item()
	# Get the actual word from the word index
	output_word = tokenizer.index_word[predicted]
	# Append to the current text
	seed_text += " " + output_word

print(seed_text)

2022-05-13 03:06:43.170959: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-13 03:06:43.352711: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-13 03:06:43.368602: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Girl i like you and you like me seen a ass like that i'm pee wee herman this movie's pg pupil sprayin' lead a rage and got slim but enough this music you make like that hard that hard that induct them that old's rage and pay them little attention yack stunts blew blew blew so make songs like my filthy mouth out my woo woo woo now break it was clear to me clear to me fuckin' the m m m m m m m mathers marshall mathers did that was so jennifer's in the rage and pay homage lyrical lyrical acrobat stunts while i was but
