# Moonbeam LoRA Style Training (Colab)
Train a LoRA adapter (e.g., Chopin/Liszt/Bach style) from your own MIDI files, then reuse the adapter in `Moonbeam_Quickstart.ipynb`.


## 1) Runtime setup
Use a **GPU runtime** in Colab (Runtime → Change runtime type → GPU).


In [None]:
import torch
# Probe once and reuse the answer: the rest of the notebook trains on GPU, so
# stop here with an actionable message when no CUDA device is attached.
has_cuda = torch.cuda.is_available()
print('CUDA available:', has_cuda)
if not has_cuda:
    raise RuntimeError('Please switch Colab runtime to GPU.')


## 2) Clone repo and install dependencies (exact README commands)


In [None]:
import os
from pathlib import Path

repo_dir = Path('/content/Moonbeam-MIDI-Foundation-Model')
if repo_dir.exists():
    os.chdir(repo_dir)
    os.system('git fetch origin --prune')
    if os.system('git reset --hard origin/main') != 0:
        os.system('git reset --hard origin/master')
else:
    os.system('git clone https://github.com/guozixunnicolas/Moonbeam-MIDI-Foundation-Model /content/Moonbeam-MIDI-Foundation-Model')

%cd /content/Moonbeam-MIDI-Foundation-Model
!pip install .
!pip install src/llama_recipes/transformers_minimal/.
!pip install huggingface_hub pandas mido


## 3) Download pretrained checkpoint


In [None]:
from huggingface_hub import hf_hub_download

# Pretrained checkpoint to fetch (Colab form dropdown).
CKPT_FILENAME = 'moonbeam_309M.pt'  #@param ['moonbeam_309M.pt', 'moonbeam_839M.pt']

# Hugging Face repository that hosts the checkpoints.
HF_REPO_ID = 'guozixunnicolas/moonbeam-midi-foundation-model'

# `ckpt_path` is read again by the finetuning cell (--trained_checkpoint_path).
ckpt_path = hf_hub_download(repo_id=HF_REPO_ID, filename=CKPT_FILENAME)
print('Checkpoint:', ckpt_path)


## 4) Upload style MIDI zip and preprocess to `.npy` + split CSV


In [None]:
from pathlib import Path
from google.colab import files
import zipfile
import random
import numpy as np
import pandas as pd

from transformers import LlamaConfig
from llama_recipes.datasets.music_tokenizer import MusicTokenizer

STYLE_NAME = 'chopin'  #@param {type:"string"}
TRAIN_RATIO = 0.9  #@param {type:"number"}

# Prompt for the archive. The file is opened by name further down, so it is
# expected to be present in the current working directory after the upload.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError('Please upload a zip containing MIDI files.')
# Only the first uploaded file is used.
zip_name = next(iter(uploaded))
# Fail with a readable message instead of a raw BadZipFile/FileNotFoundError.
if not zipfile.is_zipfile(zip_name):
    raise RuntimeError(f'{zip_name} is not a valid zip archive. Please upload a zip containing MIDI files.')

# Layout: /content/style_data/<STYLE_NAME>/{raw_midis,processed} plus the split CSV.
# NOTE: existing directories are reused (exist_ok=True) and never cleared, so
# files from earlier runs with the same STYLE_NAME remain in place.
work_dir = Path('/content/style_data') / STYLE_NAME
raw_dir = work_dir / 'raw_midis'
proc_dir = work_dir / 'processed'
raw_dir.mkdir(parents=True, exist_ok=True)
proc_dir.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(zip_name, 'r') as zf:
    zf.extractall(raw_dir)

# Collect MIDI files recursively (case-insensitive suffix match). Archives made
# on macOS carry `__MACOSX/` folders and `._name.mid` metadata entries that
# share the suffix but are not MIDI data, so they are left out up front.
midi_suffixes = {'.mid', '.midi'}
midi_files = sorted(
    p for p in raw_dir.rglob('*')
    if p.is_file()
    and p.suffix.lower() in midi_suffixes
    and '__MACOSX' not in p.relative_to(raw_dir).parts
    and not p.name.startswith('._')
)
if not midi_files:
    raise RuntimeError('No MIDI files found in uploaded zip.')

# Build the tokenizer from the repo's model config. The path is relative, so
# the current working directory must be the repository root.
cfg = LlamaConfig.from_pretrained('src/llama_recipes/configs/model_config.json')
tokenizer = MusicTokenizer(
    timeshift_vocab_size=cfg.onset_vocab_size,
    dur_vocab_size=cfg.dur_vocab_size,
    octave_vocab_size=cfg.octave_vocab_size,
    pitch_class_vocab_size=cfg.pitch_class_vocab_size,
    instrument_vocab_size=cfg.instrument_vocab_size,
    velocity_vocab_size=cfg.velocity_vocab_size,
)

# Tokenize every MIDI file into an int16 array saved under processed/.
# One CSV row is recorded per file that was written successfully.
rows = []
for idx, midi_path in enumerate(midi_files):
    try:
        tokens = tokenizer.midi_to_compound(str(midi_path))
        arr = np.asarray(tokens, dtype=np.int16)
        if arr.size == 0:
            # Nothing usable came out of this file; leave it out of the dataset.
            continue
        out_name = f'{STYLE_NAME}_{idx:05d}.npy'
        np.save(proc_dir / out_name, arr)
        rows.append({'file_base_name': out_name})
    except Exception as e:
        # Deliberately broad and best-effort: one unreadable file must not
        # abort the whole run. The file is reported and skipped.
        print(f'[skip] {midi_path.name}: {e}')

if len(rows) < 2:
    raise RuntimeError('Need at least 2 valid MIDI files after preprocessing.')

# Fixed seed so the train/test assignment is reproducible for the same inputs.
random.seed(42)
random.shuffle(rows)
# Clamp to [1, len(rows) - 1] so both splits always receive at least one file,
# whatever TRAIN_RATIO is set to.
split_idx = max(1, min(len(rows) - 1, int(len(rows) * float(TRAIN_RATIO))))
for i, r in enumerate(rows):
    r['split'] = 'train' if i < split_idx else 'test'

split_csv = work_dir / f'{STYLE_NAME}_split.csv'
pd.DataFrame(rows).to_csv(split_csv, index=False)

print('Preprocessed files:', len(rows))
print('Data dir:', work_dir)
print('CSV:', split_csv)


## 5) Run LoRA finetuning (unconditional generation)


In [None]:
STYLE_NAME = 'chopin'  #@param {type:"string"}
NUM_EPOCHS = 5  #@param {type:"integer"}
BATCH_SIZE = 1  #@param {type:"integer"}
LR = 0.0003  #@param {type:"number"}
CONTEXT_LENGTH = 1024  #@param {type:"integer"}

from pathlib import Path
style_root = Path('/content/style_data') / STYLE_NAME
split_csv = style_root / f'{STYLE_NAME}_split.csv'

cmd = f'''torchrun --nnodes 1 --nproc_per_node 1 recipes/finetuning/real_finetuning_uncon_gen.py \
  --lr {LR} \
  --val_batch_size 1 \
  --run_validation True \
  --validation_interval 20 \
  --save_metrics True \
  --dist_checkpoint_root_folder checkpoints/finetuned_checkpoints/{STYLE_NAME}_lora \
  --dist_checkpoint_folder ddp \
  --trained_checkpoint_path {ckpt_path} \
  --pure_bf16 True \
  --enable_ddp True \
  --use_peft True \
  --peft_method lora \
  --quantization False \
  --model_name moonbeam_{STYLE_NAME} \
  --dataset lakhmidi_dataset \
  --data_dir {style_root} \
  --csv_file {split_csv} \
  --output_dir checkpoints/finetuned_checkpoints/{STYLE_NAME}_lora \
  --batch_size_training {BATCH_SIZE} \
  --context_length {CONTEXT_LENGTH} \
  --num_epochs {NUM_EPOCHS} \
  --use_wandb False \
  --gamma 0.99'''
print(cmd)
!{cmd}


## 6) Zip LoRA adapter for upload into quickstart notebook


In [None]:
from pathlib import Path
import zipfile

STYLE_NAME = 'chopin'  #@param {type:"string"}
# Relative path: resolved against the current working directory, which is
# expected to be the repository root used for training.
adapter_root = Path('checkpoints/finetuned_checkpoints') / f'{STYLE_NAME}_lora'

# A folder counts as an adapter when it contains adapter_config.json. The root
# is checked first; sub-folders follow in sorted order so the pick is
# reproducible (rglob makes no ordering promise).
candidate_dirs = [
    d for d in [adapter_root, *sorted(adapter_root.rglob('*'))]
    if d.is_dir() and (d / 'adapter_config.json').exists()
]
if not candidate_dirs:
    raise RuntimeError('No adapter_config.json found. Check training output folder.')

adapter_dir = candidate_dirs[0]
if len(candidate_dirs) > 1:
    # Make the selection visible instead of silently picking one of several.
    print(f'[info] Found {len(candidate_dirs)} adapter folders; packaging {adapter_dir}')

zip_path = Path('/content') / f'{STYLE_NAME}_lora_adapter.zip'
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
    # Archive names are relative to the adapter folder's parent, so the zip
    # unpacks into a single top-level folder named after the adapter folder.
    # Sorted traversal keeps the member order stable between runs.
    for file_path in sorted(adapter_dir.rglob('*')):
        if file_path.is_file():
            zf.write(file_path, file_path.relative_to(adapter_dir.parent))

print('Adapter zip:', zip_path)
