# Download some text files

In [1]:
import os
import json
import kagglehub  # pip install kagglehub

# Fetch the song-lyrics dataset from Kaggle. `dataset_download` hands back the
# local folder containing the files; that folder is kept in `lyrics_path`.
dataset_handle = "paultimothymooney/poetry"
lyrics_path = kagglehub.dataset_download(dataset_handle)
print(f"Data downloaded to {lyrics_path}")

Data downloaded to /home/nick/.cache/kagglehub/datasets/paultimothymooney/poetry/versions/16


In [2]:
# `os.listdir` returns every entry in the folder, so keep only the `.txt`
# files: the count printed below then matches its message, and code that
# strips the extension from these names never sees another kind of entry.
lyrics_files = [name for name in os.listdir(lyrics_path) if name.endswith(".txt")]
print(f"{len(lyrics_files)} .txt files downloaded.")

49 .txt files downloaded.


In [3]:
# Peek at the first three file names.
lyrics_files[:3]

['r-kelly.txt', 'amy-winehouse.txt', 'adele.txt']

# Convert text files to JSON format

In [4]:
# Make sure the `json_files` output folder exists (no error if it already does).
os.makedirs(name="json_files", exist_ok=True)

In [5]:
# Wrap the content of every lyrics file in the one-record layout
# `[{"text": ...}]` and write it to `json_files/<name>.json`.
for lyrics_file in lyrics_files:
    # Split off the extension properly instead of assuming it is 4 characters
    # long, and skip anything that is not a `.txt` file (e.g. sub-directories
    # or metadata files in the download folder).
    stem, extension = os.path.splitext(lyrics_file)
    if extension != ".txt":
        continue

    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the lyrics files are UTF-8 — confirm for this dataset.
    with open(file=os.path.join(lyrics_path, lyrics_file), mode="r", encoding="utf-8") as f:
        text = f.read()

    records = [{"text": text}]
    json_path = os.path.join("json_files", stem + ".json")
    with open(json_path, mode="w", encoding="utf-8") as fp:
        json.dump(records, fp)

# Create Dataset from JSON files

In [6]:
from llm_trainer import create_dataset_from_json

# Build the dataset from the converted files with llm_trainer.
# NOTE(review): judging by the keyword names, input is read from `json_dir`
# and results go to `save_dir`; the exact meaning of `chunk_size` and
# `chunk_limit` is defined by the library — check its documentation.
dataset_options = {
    "save_dir": "data",
    "json_dir": "json_files",
    "chunk_size": 100_000,
    "chunk_limit": 1500,
}
create_dataset_from_json(**dataset_options)

Processing Chunks:   1%|          | 18/1500 [00:00<01:12, 20.41chunk/s]


# Train a GPT-2 model on this data

In [7]:
from transformers import GPT2Config, GPT2LMHeadModel

# GPT-2 language model built from a custom configuration: only `n_positions`,
# `n_embd` and `n_head` are overridden, everything else keeps its default.
gpt2_config = GPT2Config(n_positions=256, n_embd=512, n_head=8)
model = GPT2LMHeadModel(config=gpt2_config)

In [None]:
from llm_trainer import LLMTrainer

# Hand the model to the trainer and run a short training session.
trainer = LLMTrainer(model=model)
trainer.train(
    max_steps=200,
    generate_each_n_steps=100,
    prompt="Every time we say goodbye,\nI die a little",
)

step: 0 | Loss: 11.000000 | norm: 14.4398 | lr: 2.000000e-05 | dt: 9.04s | tok/sec: 7251.91
step: 1 | Loss: 10.125000 | norm: 6.9708 | lr: 2.666667e-05 | dt: 1.53s | tok/sec: 42769.15
step: 2 | Loss: 10.000000 | norm: 4.0215 | lr: 3.333333e-05 | dt: 1.53s | tok/sec: 42750.91
step: 3 | Loss: 9.937500 | norm: 3.1104 | lr: 4.000000e-05 | dt: 1.53s | tok/sec: 42806.98
step: 4 | Loss: 9.875000 | norm: 2.6609 | lr: 4.666667e-05 | dt: 1.53s | tok/sec: 42789.87
step: 5 | Loss: 9.937500 | norm: 3.0117 | lr: 5.333333e-05 | dt: 1.53s | tok/sec: 42813.61
step: 6 | Loss: 9.625000 | norm: 3.0445 | lr: 6.000000e-05 | dt: 1.53s | tok/sec: 42874.80
step: 7 | Loss: 9.687500 | norm: 2.3515 | lr: 6.666667e-05 | dt: 1.53s | tok/sec: 42789.31
step: 8 | Loss: 9.687500 | norm: 2.0178 | lr: 7.333333e-05 | dt: 1.53s | tok/sec: 42793.26
step: 9 | Loss: 9.687500 | norm: 5.9197 | lr: 8.000000e-05 | dt: 1.53s | tok/sec: 42825.83
step: 10 | Loss: 9.562500 | norm: 2.3708 | lr: 8.666667e-05 | dt: 1.53s | tok/sec: 4281

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Plot the loss stored in `logs_training.csv`: the raw values plus a
# rolling-mean curve that is easier to read.
data = pd.read_csv("logs_training.csv")

window_size = 10  # larger window -> smoother curve
smoothed_loss = data["Loss"].rolling(window=window_size).mean()

fig, ax = plt.subplots()
ax.plot(data["Step"], smoothed_loss, label="Smoothed Loss", color="pink")
ax.plot(data["Step"], data["Loss"], alpha=0.5, label="Original Loss", color="gray")

# Dashed horizontal guides at fixed loss levels.
for level, colour in ((6, 'r'), (5, 'gray'), (4, 'y'), (3, 'g')):
    ax.axhline(y=level, color=colour, linestyle='--', alpha=0.6)

ax.set_xlabel("Step")
ax.set_ylabel("Loss")
ax.legend()
plt.show()