In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/taylorswiftlyrics/taylor_swift_lyrics.csv
/kaggle/input/gpt2/keras/gpt2_base_en/2/config.json
/kaggle/input/gpt2/keras/gpt2_base_en/2/tokenizer.json
/kaggle/input/gpt2/keras/gpt2_base_en/2/metadata.json
/kaggle/input/gpt2/keras/gpt2_base_en/2/model.weights.h5
/kaggle/input/gpt2/keras/gpt2_base_en/2/assets/tokenizer/merges.txt
/kaggle/input/gpt2/keras/gpt2_base_en/2/assets/tokenizer/vocabulary.json


In [2]:
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import keras_hub
import tensorflow as tf
import tensorflow.data as tf_data

In [3]:
from keras_hub.models import GPT2CausalLMPreprocessor, GPT2CausalLM
from keras_hub.samplers import TopPSampler

In [4]:
Preprocessor = GPT2CausalLMPreprocessor.from_preset(
    "gpt2_base_en",
    sequence_length = 512,
)

In [5]:
Sampler = TopPSampler(
    p = 0.9,
    k = None,
    seed = 960,
    temperature = 0.8
)

In [6]:
GPT2Model = GPT2CausalLM.from_preset(
    "gpt2_base_en",
    preprocessor=Preprocessor
)

We will fine-tune this LLM to generate Taylor Swift-esque lyrics

In [7]:
import re
remove_tags = lambda text: re.sub(r'\[.*?\]', '', text)

In [8]:
lyrics = pd.read_csv('/kaggle/input/taylorswiftlyrics/taylor_swift_lyrics.csv').dropna()

In [9]:
lyrics["Lyrics"] = lyrics["Lyrics"].apply(remove_tags)

In [10]:
dataset = tf_data.Dataset.from_tensor_slices(lyrics["Lyrics"].values)

In [11]:
dataset = dataset.batch(16).cache().prefetch(tf_data.AUTOTUNE)

In [12]:
from keras.optimizers import Adam
from keras.optimizers.schedules import PolynomialDecay
from keras.losses import SparseCategoricalCrossentropy

In [13]:
scheduler = PolynomialDecay(
    5e-5,
    decay_steps=dataset.cardinality() * 10,
    end_learning_rate=0.0,
)

In [14]:
loss = SparseCategoricalCrossentropy(from_logits=True)

In [15]:
GPT2Model.compile(
    optimizer=Adam(scheduler),
    loss=loss,
    weighted_metrics=["accuracy"],
    sampler=Sampler,
)

In [16]:
GPT2Model.summary()

In [17]:
GPT2Model.fit(dataset, epochs=20)

Epoch 1/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 6s/step - accuracy: 0.5018 - loss: 2.4256
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2s/step - accuracy: 0.5315 - loss: 2.1940
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 2s/step - accuracy: 0.5435 - loss: 2.1098
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 2s/step - accuracy: 0.5504 - loss: 2.0524
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2s/step - accuracy: 0.5560 - loss: 2.0135
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2s/step - accuracy: 0.5598 - loss: 1.9820
Epoch 7/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2s/step - accuracy: 0.5630 - loss: 1.9551
Epoch 8/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2s/step - accuracy: 0.5660 - loss: 1.9387
Epoch 9/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7ecddc21f790>

In [18]:
test = GPT2Model.generate(
    "we were both young when I first met you,",
    max_length = 116,
)

print(test)

we were both young when I first met you, we had a real good time, we had everything together
you know how I felt, you know what I wanted
it was only one of those times when we were supposed to be friends
I know you're right, I know I'm right, I'm right, I'm right

so, I know, you're right, I know I'm right, I know I'm right, I'm right

we met in a bar in Manhattan
we dated for the first time
we were the kind of girls
