In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/taylorswiftlyrics/taylor_swift_lyrics.csv


In [2]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

In [3]:
import keras
from keras import layers, ops, optimizers
from keras.layers import TextVectorization

In [4]:
import string
import random

import tensorflow as tf
import tensorflow.data as tf_data
import tensorflow.strings as tf_strings
import tensorflow_text as tf_text

In [5]:
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 1:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        # Use both GPUs
        tf.config.set_visible_devices(gpus[:2], 'GPU')
    except RuntimeError as e:
        print(e)
        
strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

Number of devices: 2


In [6]:
import sentencepiece as spm
import tempfile
import re

In [7]:
def casual_attention_mask(batch_size, n_dest, n_src, dtype):
    i = ops.arange(n_dest)[:, None]
    j = ops.arange(n_src)
    m = i >= j - n_src + n_dest
    mask = ops.cast(m, dtype)
    mask = ops.reshape(mask, [1, n_dest, n_src])
    mult = ops.concatenate(
        [ops.expand_dims(batch_size, -1), ops.convert_to_tensor([1, 1])], 0
    )
    return ops.tile(mask, mult)

In [8]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim)
        ])
        self.l1 = layers.LayerNormalization(epsilon=1e-6)
        self.l2 = layers.LayerNormalization(epsilon=1e-6)
        self.drop1 = layers.Dropout(rate)
        self.drop2 = layers.Dropout(rate)

    def call(self, inputs):
        input_shape = ops.shape(inputs)
        batch_size, seq_len = input_shape[0], input_shape[1]
        casual_mask = casual_attention_mask(batch_size, seq_len, seq_len, "bool")
        attn_output = self.att(inputs, inputs, attention_mask=casual_mask)
        attn_output = self.drop1(attn_output)
        out1 = self.l1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.drop1(ffn_output)
        return self.l2(out1 + ffn_output)

In [25]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, max_len, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim = embed_dim)
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, inputs):
        max_len = ops.shape(inputs)[-1]
        positions = ops.arange(0, max_len, 1)
        positions = self.pos_emb(positions)
        inputs = self.token_emb(inputs)
        return inputs + positions

In [26]:
vocab_size = 32000
max_len = 324 # Victoria Park
embed_dim = 256
num_heads = 3
ff_dim = 256

In [27]:
def create_model():
    inputs = layers.Input(shape=(None,), dtype="int32")
    embedding = TokenAndPositionEmbedding(max_len, vocab_size, embed_dim)
    transformer = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = embedding(inputs)
    x = transformer(x)
    outputs = layers.Dense(vocab_size)(x)
    model = keras.Model(inputs=inputs, outputs=[outputs, x])
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(
        optimizers.Adam(learning_rate=5e-3),
        loss=[loss_fn, None]
    )
    return model

In [10]:
df = pd.read_csv('/kaggle/input/taylorswiftlyrics/taylor_swift_lyrics.csv')

In [11]:
df.head()

Unnamed: 0,Song,Lyrics
0,"""Slut!"" (Taylor's Version) (From The Vault)","[Verse 1]\nFlamingo pink, Sunrise Boulevard\nC..."
1,22 (Taylor's Version),[Verse 1]\nIt feels like a perfect night\nTo d...
2,Afterglow,"[Verse 1]\nI blew things out of proportion, no..."
3,All Too Well (10 Minute Version) (Taylor's Ver...,"[Verse 1]\nI walked through the door with you,..."
4,All Too Well (Taylor's Version),"[Verse 1]\nI walked through the door with you,..."


In [12]:
lyrics_list = df["Lyrics"].dropna().tolist()
random.shuffle(lyrics_list)

In [13]:
contraction_pattern = re.compile(r"\b(?:[A-Za-z]+(?:'ll|'ve|'d|'re|'m|'s|n't))\b")
contractions_in_lyrics = set()
for line in lyrics_list:
    contractions_in_lyrics.update(contraction_pattern.findall(line))
contractions_in_lyrics = sorted(contractions_in_lyrics)

In [14]:
with tempfile.NamedTemporaryFile(delete=False, mode="w", encoding="utf-8") as temp_file:
    temp_file_name = temp_file.name
    for line in lyrics_list:
        temp_file.write(line + "\n")

In [15]:
spm.SentencePieceTrainer.train(
    input=temp_file_name,  
    model_prefix="lyrics_tokenizer",  
    vocab_size=3446,
    character_coverage=1.0,
    user_defined_symbols=contractions_in_lyrics
)

In [16]:
spm_model = spm.SentencePieceProcessor(model_file='lyrics_tokenizer.model')
VOCAB = [spm_model.id_to_piece(i) for i in range(spm_model.vocab_size())]

In [17]:
with open("lyrics_tokenizer.model", "rb") as f:
    model_data = f.read()

vectorizer = tf_text.SentencepieceTokenizer(model=model_data)

In [18]:
test_text = tf.constant(["You're on the phone with your girlfriend"])
sus_text = vectorizer.detokenize(vectorizer.tokenize(test_text))

# Output the tokenized result
print(sus_text.numpy().tolist())

[b"You're on the phone with your girlfriend"]


In [29]:
def prepare_gpt_inputs(text):
    text = tf_strings.as_string(text)
    tokenized = vectorizer.tokenize(text).to_tensor(
        default_value=3500, shape=[None, 768]
    )
    x = tokenized[:, :-1]
    y = tokenized[:, 1:]
    return x, y

In [30]:
text_ds = tf.data.Dataset.from_tensor_slices(lyrics_list)
text_ds = text_ds.shuffle(buffer_size=256)
text_ds = text_ds.batch(128) # batch_size
text_ds = text_ds.map(prepare_gpt_inputs, num_parallel_calls=tf_data.AUTOTUNE)
text_ds = text_ds.prefetch(tf_data.AUTOTUNE)

In [31]:
class SwiftGPT(keras.callbacks.Callback):
    def __init__(self, max_tokens, start_tokens, index_to_word, top_k=10, print_every=5):
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.top_k = top_k

    def sample_from(self, logits):
        logits, indices = ops.top_k(logits, k=self.top_k, sorted=True)
        indices = np.array(indices).astype("int32")
        preds = keras.activations.softmax(ops.expand_dims(logits, 0))[0]
        preds = np.array(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        return self.index_to_word[number]

    def on_epoch_end(self, epoch, logs=None):
        start_tokens = [_ for _ in self.start_tokens]
        if (epoch + 1) % self.print_every != 0: return
        num_tokens_generated, tokens_generated = 0, []
        
        while num_tokens_generated <= self.max_tokens:
            pad_len = max_len - len(start_tokens)
            sample_index = len(start_tokens) - 1
            if pad_len < 0:
                x = start_tokens[:maxlen]
                sample_index = maxlen - 1
            elif pad_len > 0: x = start_tokens + [0] * pad_len
            else: x = start_tokens
            x = np.array([x])
            y, _ = self.model.predict(x, verbose=0)
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
            num_tokens_generated = len(tokens_generated)

        stuff = " ".join([self.detokenize(_) for _ in self.start_tokens + tokens_generated])
        print(stuff)
        return

In [32]:
# Tokenizer the starting prompt
word_to_index = {}
for index, word in enumerate(VOCAB):
    word_to_index[word] = index

In [33]:
start_prompt = "You're on the phone with your girl friend she's upset"
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]
num_tokens_generated = 50
callback = SwiftGPT(num_tokens_generated, start_tokens, VOCAB)

In [34]:
with strategy.scope():
    MiniGPT1 = create_model()
    MiniGPT1.fit(text_ds, verbose=2, epochs=20, callbacks=[callback])



Epoch 1/20
1/1 - 5s - 5s/step - loss: 10.4322
Epoch 2/20
1/1 - 1s - 1s/step - loss: 9.4074
Epoch 3/20
1/1 - 1s - 1s/step - loss: 7.7502
Epoch 4/20
1/1 - 1s - 1s/step - loss: 6.6525
Epoch 5/20
You're on the <s> with your <s> <s> she's <s> ▁I ▁it ▁me ) ▁the ) ▁my ▁( ▁my ) ▁( ▁( ▁ ▁ ▁on ▁ ) ▁you ▁ ) ▁( ▁the ▁ ▁on ▁it ▁ ▁ ▁me ▁( ▁ ▁( ▁ ▁( ) ▁my ▁it ▁the ▁ ▁my ▁you ▁I ▁me ▁the ▁on ▁on ) ▁ ▁I ▁( ) ▁my
1/1 - 14s - 14s/step - loss: 5.6831
Epoch 6/20
1/1 - 2s - 2s/step - loss: 5.2002
Epoch 7/20
1/1 - 1s - 1s/step - loss: 4.6741
Epoch 8/20
1/1 - 1s - 1s/step - loss: 4.5158
Epoch 9/20
1/1 - 1s - 1s/step - loss: 4.4536
Epoch 10/20
You're on the <s> with your <s> <s> she's <s> ▁you ▁ , ▁to ▁you ▁you - ] ▁I ▁the ▁I ▁I ▁to ] ▁to ▁I ▁the ▁ ▁ ▁I ] ▁ ▁ Chorus , , , ▁[ ▁the ▁I ▁I ▁you ] , ▁I Chorus , ▁I ▁I ▁you , ▁I , ▁you ▁ ] ▁I ▁ Chorus ▁the ,
1/1 - 13s - 13s/step - loss: 4.4814
Epoch 11/20
1/1 - 2s - 2s/step - loss: 4.2899
Epoch 12/20
1/1 - 1s - 1s/step - loss: 4.4265
Epoch 13/20
1/1 - 1s - 1s/step - 