# Settings

Mount Google Drive to `/content/drive`.

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; DRIVE_PATH in the Settings cell
# points at a folder underneath this mount point.
drive.mount('/content/drive')


   Set the `DRIVE_PATH` variable to the folder that contains the 'models', 'checkpoint', and 'samples' folders.

In [None]:
# Use me to explore your Google Drive (edit the quoted path to look elsewhere).

!ls -lha "/content/drive/MyDrive/RPGMaker2005 (Group 4)/training"

In [None]:
import os

# Change me to the location of 'models', 'checkpoint', and 'samples'.
DRIVE_PATH = "/content/drive/MyDrive/RPGMaker2005 (Group 4)/training"

# Change me to your desired dataset text file.
TRAINING_DATA = os.path.join(DRIVE_PATH, "new-dataset.txt")

# Every value the later cells read back through os.environ (or through
# "$NAME" in their shell commands), keyed by environment-variable name.
ENV_SETTINGS = {
  "DRIVE_PATH": DRIVE_PATH,
  "MODELS_PATH": os.path.join(DRIVE_PATH, "models"),
  "CHECKPOINT_PATH": os.path.join(DRIVE_PATH, "checkpoint"),
  "SAMPLES_PATH": os.path.join(DRIVE_PATH, "samples"),
  "TRAINING_DATA": TRAINING_DATA,
}

def setEnviron(envDict: dict):
  """Export each key/value pair of `envDict` as an environment variable.

  Every assignment is echoed so the cell output shows what was set.
  Keys and values must be strings, as os.environ requires.
  """
  for name, value in envDict.items():
    os.environ[name] = value
    print(f"Set environment variable: {name}={value}")

setEnviron(ENV_SETTINGS)

# GPT-2 Fine-tuning via Google Colab

All the required code to start training the AI model should be here and in-order.

You may need to change a few things to match your setup.

1. Clone the GitLab repository and change into its directory. (Please don't share my API key.)

In [None]:
!git clone --recursive "https://oauth2:nnsTeRq-kac_WezGE2zo@charon.cs.uni.edu/jmcclain/rpg-maker-2005.git"
%cd rpg-maker-2005

2. Install requirements via pip

In [None]:
# The repository's top-level requirements.txt is skipped on purpose; only
# the gpt_2 requirements are installed.
# !pip install -r requirements.txt # We only need stuff from gpt-2
# List the repository root so a wrong working directory is easy to spot.
!ls
!pip install -r gpt_2/requirements.txt
# Pin TensorFlow to 2.4.1.
!pip install tensorflow==2.4.1

3. Change directory to `gpt_2` and apply the fix for `<|endoftext|>`

In [None]:
# Every later step runs from inside the gpt_2 directory.
%cd gpt_2
# Download patch for '<|endoftext|>'
# (one commit from the nshepperd/gpt-2 repository, saved as endoftext.patch)
!wget https://github.com/nshepperd/gpt-2/commit/26e6d2b3c9f9aab743f283defcadbe025c3bb9c4.patch -O endoftext.patch
# Apply patch
# NOTE(review): -N should make patch skip a change that is already applied,
# which would make re-running this cell harmless; verify with `man patch`.
!patch -Np1 -i endoftext.patch
# Show changes
!git diff

4. Link 'checkpoint', 'samples' and 'models' folders (Double check your Settings above)

In [None]:
import os

# Get values from environment variables.
# They are exported by setEnviron() in the Settings cell; a KeyError here
# means that cell has not been run in this session.
modelsPath = os.environ["MODELS_PATH"]
checkpointPath = os.environ["CHECKPOINT_PATH"]
samplesPath = os.environ["SAMPLES_PATH"]

# Creates a symlink to a folder/file in the current directory (safe to re-run).
def createSymLink(path):
  """Link `path` into the current directory under its own base name.

  Re-running is harmless: a link that already points at `path` is left
  untouched, and a link that points somewhere else is replaced. Anything
  at that name that is NOT a symlink is never removed; os.symlink then
  raises FileExistsError, exactly as it did before.
  """
  # normpath drops a trailing slash, which would otherwise yield an empty
  # link name.
  target = os.path.normpath(path)
  linkName = os.path.basename(target)
  if os.path.islink(linkName):
    if os.path.normpath(os.readlink(linkName)) == target:
      return  # already linked correctly
    os.remove(linkName)  # stale link left over from an older DRIVE_PATH
  os.symlink(path, linkName)

# create symlinks in the current directory
# (each link is named after the last component of its path:
# 'models', 'checkpoint' and 'samples')
createSymLink(modelsPath)
createSymLink(checkpointPath)
createSymLink(samplesPath)

# List the contents to confirm things are linked correctly.
!ls -lha

6. Download model (If you've run this before, then you can skip it.)

In [None]:
# Run the repository's download_model.py for the 355M model, the same model
# name the encode and train steps use.
!python download_model.py 355M

7. Create 'encoded.npz'

In [None]:
# Show which dataset file is about to be encoded.
!echo "$TRAINING_DATA"
# Encode the dataset with encode.py for the 355M model and write the result
# to encoded.npz; PYTHONPATH=src puts the repository's src/ directory on
# Python's import path.
!PYTHONPATH=src python encode.py --model_name 355M "$TRAINING_DATA" encoded.npz
# List the directory to confirm encoded.npz exists.
!ls -lha

  pre8. Set up Tensor Rematerialization (Better Memory)

  The thing that `train.py` interfaces with for reduced memory usage is written in Haskell, so we have to jump through some hoops to get the Haskell compiler and package manager installed before compiling the program.

In [None]:
import os
import shutil

CACHED_CABAL = os.path.join(os.environ['DRIVE_PATH'], 'cabal-cache.zip')

# Restore '/root/.cabal' from cache
if os.path.isfile(CACHED_CABAL):
  shutil.unpack_archive(CACHED_CABAL, '/root/.cabal')

# Install a working ghc/cabal-install version (via haskell ppa)
!add-apt-repository ppa:hvr/ghc --yes
!apt-get update
!apt-get install ghc-9.0.1 cabal-install-3.4

# Compile twremat
%cd twremat
# For some reason we have too specify the PATH
!PATH="/opt/cabal/bin:/opt/ghc/bin:${PATH}" cabal update
!PATH="/opt/cabal/bin:/opt/ghc/bin:${PATH}" cabal v2-install --installdir=../bin
# make sure '../bin/twremat' is executable.
!chmod +x "../bin/twremat"
%cd ..

# Save '/root/.cabal'
shutil.make_archive(CACHED_CABAL.split('.')[0], 'zip', '/root/.cabal', verbose=1)

8. Start training

In [None]:
# Use this to show gpu info via wandb
# wandb is installed into the running session first, so the import below
# only works after the pip line has finished.
!pip install wandb
import wandb
# NOTE(review): presumably prompts for a Weights & Biases login on first
# use — confirm before relying on an unattended "Run all".
wandb.init()

In [None]:
# Fine-tune the 355M model on encoded.npz with train.py, using the SGD
# optimizer at a learning rate of 0.0006. PYTHONPATH=src puts the
# repository's src/ directory on Python's import path.
# NOTE(review): --sample_every 250 presumably writes a text sample every
# 250 steps — verify against train.py.
!PYTHONPATH=src python train.py \
  --sample_every 250 \
  --dataset encoded.npz \
  --model_name 355M \
  --optimizer sgd \
  --learning_rate 0.0006

9. Update model zip

Grabs the required GPT2 model files and creates a zip archive.

In [None]:
import os
import re
import shutil
import json
import hashlib

CHECKPOINT_RUN_PATH = os.path.join(os.environ['CHECKPOINT_PATH'], 'run1')
MODEL_PATH = os.path.join(os.environ['MODELS_PATH'], '355M')

OUTPUT_DIR = os.path.join(os.environ['DRIVE_PATH'], "rpg-model")
OUTPUT_HASH = os.path.join(os.environ['DRIVE_PATH'], "rpg-model_blake2b.json")
# shutil.make_archive appends '.zip' to this base name, so the archive ends
# up next to OUTPUT_DIR as 'rpg-model.zip'.
OUTPUT_ZIP = os.path.join(os.environ['DRIVE_PATH'], "rpg-model")
# Files that are always copied; the latest checkpoint's files are added below.
CHECKPOINT_FILE_LIST = ['checkpoint', 'counter']
MODEL_FILE_LIST = ['encoder.json', 'hparams.json', 'vocab.bpe']


def _belongsToCheckpoint(name, number):
  """Return True if `name` contains `number` as a whole number and is not a tfevents log.

  The digits on either side must not be digits themselves, so checkpoint
  100 does not match 'model-1000' and checkpoint 1 does not match a
  '...-of-00001' suffix the way a plain substring test does.
  """
  if "tfevents" in name:
    return False
  return re.search(r"(?<!\d)" + re.escape(str(number)) + r"(?!\d)", name) is not None


def _blake2bHex(path, chunkSize=8192):
  """Return the blake2b hex digest of the file at `path`, read in chunks."""
  digest = hashlib.blake2b()
  with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(chunkSize), b""):
      digest.update(chunk)
  return digest.hexdigest()


print("Will output to:", OUTPUT_DIR)

# Make a clean directory
print("Cleaning", OUTPUT_DIR)
if os.path.isdir(OUTPUT_DIR):
  # rmtree also copes with sub-directories, which os.remove() cannot delete.
  shutil.rmtree(OUTPUT_DIR)

print("Creating", OUTPUT_DIR)
os.mkdir(OUTPUT_DIR)

# Get current checkpoint
print("Getting latest checkpoint")
with open(os.path.join(CHECKPOINT_RUN_PATH, "counter"), 'r') as f:
  # int() ignores the surrounding whitespace/newline in the counter file.
  checkpoint_num = int(f.read())

# Copy checkpoint files
print("Copying checkpoint files")
latest_files = sorted(
  name for name in os.listdir(CHECKPOINT_RUN_PATH)
  if _belongsToCheckpoint(name, checkpoint_num)
)
if not latest_files:
  # Without this check the archive would silently be built with no
  # checkpoint files for the counter value in it.
  raise FileNotFoundError(
    "No files for checkpoint %d found in %s" % (checkpoint_num, CHECKPOINT_RUN_PATH)
  )
CHECKPOINT_FILE_LIST.extend(latest_files)

for file in CHECKPOINT_FILE_LIST:
  file_path = os.path.join(CHECKPOINT_RUN_PATH, file)
  shutil.copyfile(file_path, os.path.join(OUTPUT_DIR, file))

# Copy model files
print("Copying model files")
for file in MODEL_FILE_LIST:
  file_path = os.path.join(MODEL_PATH, file)
  shutil.copyfile(file_path, os.path.join(OUTPUT_DIR, file))

# Create a blake2b hash for each file and store them
# in a json file (sorted by file name so the json is stable between runs).
blake2b_hashes = {}
for file in sorted(os.listdir(OUTPUT_DIR)):
  blake2b_hashes[file] = _blake2bHex(os.path.join(OUTPUT_DIR, file))
with open(OUTPUT_HASH, "w") as f:
  json.dump(blake2b_hashes, f, indent=2)

# Zip up the OUTPUT_DIR
print("Zipping:", OUTPUT_DIR, "to", OUTPUT_ZIP + ".zip")
shutil.make_archive(OUTPUT_ZIP, "zip", OUTPUT_DIR)

# Utils

1. Display information about Google Colab's Nvidia GPU

   (Will fail if GPU is not selected in `Edit->Notebook Settings`)

In [None]:
# Query the GPU attached to this runtime; the command fails when the
# runtime has no GPU.
!nvidia-smi

2. Read samples

In [None]:
import os
import re

samplesPath = os.environ["SAMPLES_PATH"]
samplesRunPath = os.path.join(samplesPath, "run1")

def _naturalKey(name):
  """Sort key that compares runs of digits numerically ('x-250' before 'x-1000')."""
  # re.split with a capturing group alternates text, digits, text, ... so
  # every odd position is a run of digits.
  parts = re.split(r"(\d+)", name)
  return [int(part) if index % 2 else part for index, part in enumerate(parts)]

# os.listdir() returns names in arbitrary order, so sort them before taking
# the last 10; otherwise the slice is an arbitrary 10 files.
samples = [os.path.join(samplesRunPath, x) for x in sorted(os.listdir(samplesRunPath), key=_naturalKey)][-10:]

def printFile(filename: str):
  """Return the whole text content of `filename`.

  Despite the name nothing is printed here; the caller prints the result.
  """
  with open(filename, "r") as sample_file:
    contents = sample_file.read()
  return contents

# printFile() only returns the text, so the actual printing happens here.
for sample in samples:
  print(printFile(sample))

3. Clean checkpoints

In [None]:
import os
import re

CHECKPOINT_RUN_PATH = os.path.join(os.environ['CHECKPOINT_PATH'], 'run1')
# Any file whose name contains one of these is always kept.
DONT_DELETE = ('checkpoint', 'counter', 'tfevents')

# Get latest epoch
with open(os.path.join(CHECKPOINT_RUN_PATH, 'counter'), 'r') as f:
  # int() rejects an empty or corrupt counter file with a ValueError before
  # anything is deleted; str() gives the number back without padding.
  checkpoint = str(int(f.read()))

# Match the counter only as a whole number. A plain substring test would
# also match longer numbers that merely contain it ('100' in 'model-1000').
latest_pattern = re.compile(r"(?<!\d)" + re.escape(checkpoint) + r"(?!\d)")

def _isLatest(name):
  """Return True if `name` carries the latest checkpoint number."""
  return latest_pattern.search(name) is not None

def _isProtected(name):
  """Return True if `name` contains one of the DONT_DELETE markers."""
  return any(marker in name for marker in DONT_DELETE)

# Get list of file in CHECKPOINT_RUN_PATH
all_files = os.listdir(CHECKPOINT_RUN_PATH)

# Refuse to run when the latest checkpoint cannot be found: deleting
# "everything else" would then remove every checkpoint in the folder.
if not any(_isLatest(x) for x in all_files):
  raise FileNotFoundError(
    "No files for checkpoint " + checkpoint + " in " + CHECKPOINT_RUN_PATH + "; nothing was deleted."
  )

# Everything that is neither protected nor part of the latest checkpoint.
delete_me = [x for x in all_files if not (_isProtected(x) or _isLatest(x))]

# Delete files in delete_me
for file in delete_me:
  file = os.path.join(CHECKPOINT_RUN_PATH, file)
  if os.path.isfile(file):
    os.remove(file)

# List Files
newlist = sorted(os.listdir(CHECKPOINT_RUN_PATH))
for x in newlist:
  print(x)