In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
import re, json, time
from tqdm import tqdm
import torch

user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_token")
login(hf_token)

model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    token=hf_token
)

def describe_melody_local(melody):
    prompt = f"""This is a melody written in ABC notation:

{melody}

Please describe the **mood**, **sound type**, and **rhythm** of this melody using one label from each of the following categories:

Mood:
[happy, sad, emotional, uplifting, tense, melancholy, romantic, angry, calm, dark, energetic, epic, dreamy, nostalgic, hopeful]

Sound Type:
[solo piano, orchestral, synth heavy, synth pads, bass heavy, melodic lead, percussion-driven, guitar-focused]

Rhythm:
[no beat, has steady beat, syncopated, irregular, rhythmic pulse]

Respond in this format:
Mood: <one label>
Sound Type: <one label>
Rhythm: <one label>

Answer:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=80,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    if "Answer:" in decoded:
        return decoded.split("Answer:")[-1].strip()
    return decoded.strip()

with open("/kaggle/input/abc-notation-music-for-rnn/dataabc.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

tunes = raw_text.strip().split("\nX:")
tunes = ["X:" + t if not t.startswith("X:") else t for t in tunes]

def extract_melody(tune):
    return "\n".join([
        line.strip() for line in tune.splitlines()
        if not re.match(r"^[A-Z]:", line.strip())
    ])

melody_only = [extract_melody(t) for t in tunes if len(extract_melody(t)) > 100]

labeled_data = []

for idx, melody in tqdm(enumerate(melody_only[:1000]), total=1000, desc="Labeling 1000 melodies"):

    try:
        result = describe_melody_local(melody)
        label_dict = {"melody": melody}
        for line in result.splitlines():
            if ":" in line:
                key, value = line.split(":", 1)
                label_dict[key.strip().lower().replace(" ", "_")] = value.strip()
        labeled_data.append(label_dict)

        time.sleep(0.2) 

    except Exception as e:
        print(f"Error on {idx+1}: {e}")

output_path = "/kaggle/working/full_labeled_abc.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(labeled_data, f, indent=2)

print(f"\n Saved {len(labeled_data)} labeled melodies to: {output_path}")


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Labeling 1000 melodies: 100%|██████████| 1000/1000 [31:53<00:00,  1.91s/it]


 Saved 1000 labeled melodies to: /kaggle/working/full_labeled_abc.json





In [17]:
import json

with open("/kaggle/working/full_labeled_abc.json", "r", encoding="utf-8") as f:
    data = json.load(f)

samples = []
for item in data:
    if all(k in item for k in ("mood", "sound_type", "rhythm", "melody")):
        prompt = f"mood: {item['mood']} | sound_type: {item['sound_type']} | rhythm: {item['rhythm']}"
        target = item["melody"]
        samples.append({"input": prompt, "output": target})


In [18]:
for sample in samples[:3]:
    print("Prompt:", sample["input"])
    print("Target ABC:\n", sample["output"])
    print("-----\n")


Prompt: mood: uplifting | sound_type: melodic lead | rhythm: has steady beat
Target ABC:
 G3-A (Bcd=e) | f4 (g2dB) | ({d}c3-B) G2-E2 | F4 (D2=E^F) |
G3-A (Bcd=e) | f4 d2-f2 | (g2a2 b2).g2 | {b}(a2g2 f2).d2 |
(d2{ed}c2) B2B2 | (A2G2 {AG}F2).D2 | (GABc) (d2{ed}c>A) | G2G2 G2z ||
G | B2c2 (dcAB) | G2G2 G3G | B2d2 (gfdc) | d2g2 (g3ga) |
(bagf) (gd)d>c | (B2AG) F-D.D2 | (GABc) d2d2 | (bgfd) cA.F2 |
G2A2 (B2{cB}AG) | A3-G F2-D2 | (GABc) (d2{ed}c>A) | G2G2 G2z2 ||

-----

Prompt: mood: uplifting | sound_type: melodic lead | rhythm: has steady beat
Target ABC:
 f-g | a3-b g3-a | f4 e3-d | d3-c A3-B | c4 d3-e |
d3-c (3(A2G2F2) | G4F2-G2 | A-d3 d3-e | d6 ||
A2 | d3-e f3-g | a4 a3-g | a3-b a3-f | g4 g3-g |
a3-b a3-g | {e}=f4 e3-c | d3-c A3-G | A6 f-g |
a3-b g3-a | f4 e3-d | d3-c A3-B | c4 d3-e |
d3-c (3(A2G2F2) | G4 F2-G2 | A-d3 d3-e | d6 ||

-----

Prompt: mood: epic | sound_type: synth heavy | rhythm: has steady beat
Target ABC:
 B/2-c/2 | d2 d>-c B2 A-B | (GBAG) F2 D-F | (G>AG).F (D>CD).F | G2

In [19]:
from transformers import T5Tokenizer
from datasets import Dataset

dataset = Dataset.from_list(samples)

tokenizer = T5Tokenizer.from_pretrained("t5-small")

max_input_len = 128
max_target_len = 512

def preprocess(example):
    model_input = tokenizer(example["input"], padding="max_length", truncation=True, max_length=max_input_len)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["output"], padding="max_length", truncation=True, max_length=max_target_len)
    model_input["labels"] = labels["input_ids"]
    return model_input

tokenized_dataset = dataset.map(preprocess, remove_columns=["input", "output"])


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



In [20]:
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer

model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir="/kaggle/working/t5_abc_model",
    per_device_train_batch_size=4,
    num_train_epochs=6,
    logging_steps=100,                
    save_total_limit=1,
    save_strategy="epoch",
    fp16=True,
    logging_dir="/kaggle/working/logs",  
    report_to="none",                
    disable_tqdm=False               
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()




Step,Training Loss
100,2.745
200,1.1623
300,1.0634
400,0.9814
500,0.977
600,0.95
700,0.9341




TrainOutput(global_step=750, training_loss=1.2377568918863933, metrics={'train_runtime': 271.386, 'train_samples_per_second': 22.109, 'train_steps_per_second': 2.764, 'total_flos': 203012702208000.0, 'train_loss': 1.2377568918863933, 'epoch': 6.0})

In [21]:
model.save_pretrained("/kaggle/working/t5_abc_model")
tokenizer.save_pretrained("/kaggle/working/t5_abc_model")



('/kaggle/working/t5_abc_model/tokenizer_config.json',
 '/kaggle/working/t5_abc_model/special_tokens_map.json',
 '/kaggle/working/t5_abc_model/spiece.model',
 '/kaggle/working/t5_abc_model/added_tokens.json')

In [22]:
from music21 import converter
from IPython.display import Audio

def abc_to_midi(abc_string, midi_path="/kaggle/working/generated.mid"):
    if "K:" not in abc_string:
        abc_string = "X:1\nT:Generated\nM:4/4\nK:C\n" + abc_string

    try:
        score = converter.parse(abc_string, format='abc')
        score.write('midi', fp=midi_path)
        return midi_path
    except Exception as e:
        print("Failed:", e)
        return None


In [23]:
def standardize_abc(abc):
    abc = abc.strip()
    if "K:" not in abc:
        abc = "K:C\n" + abc
    if "M:" not in abc:
        abc = "M:4/4\n" + abc
    if not abc.startswith("X:"):
        abc = "X:1\nT:Generated\n" + abc
    if "Z:" not in abc:
        abc += "\nZ:1" 
    return abc

samples = []
for item in data:
    if all(k in item for k in ("mood", "sound_type", "rhythm", "melody")):
        melody = standardize_abc(item["melody"])
        if len(melody.split()) > 20:
            prompt = f"mood: {item['mood']} | sound_type: {item['sound_type']} | rhythm: {item['rhythm']}"
            samples.append({"input": prompt, "output": melody})


In [24]:
from datasets import Dataset
from transformers import T5Tokenizer

dataset = Dataset.from_list(samples)
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess(example):
    model_input = tokenizer(example["input"], padding="max_length", truncation=True, max_length=128)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["output"], padding="max_length", truncation=True, max_length=512)
    model_input["labels"] = labels["input_ids"]
    return model_input

tokenized_dataset = dataset.map(preprocess, remove_columns=["input", "output"])


Map:   0%|          | 0/982 [00:00<?, ? examples/s]



In [37]:
import re

def is_spam(abc):
    tokens = abc.split()
    if tokens.count("f2") > 15:
        return True
    if len(set(tokens)) < 6:
        return True
    return False

def clean_abc_output(abc):
    if "Z:" in abc:
        abc = abc.split("Z:")[0].strip()
    if "K:" in abc:
        abc = "K:" + abc.split("K:")[1]
    return abc.strip()

def generate_abc(prompt_text, max_new_tokens=400, retries=3):
    for _ in range(retries):
        inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            min_length=80,
            temperature=0.9,
            top_p=0.95,
            repetition_penalty=1.7,
            no_repeat_ngram_size=6,
            pad_token_id=tokenizer.eos_token_id
        )
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        cleaned = clean_abc_output(result)
        if not is_spam(cleaned):
            return cleaned
    return cleaned  


In [38]:
from music21 import converter
import re

def save_abc_to_midi(abc_string, filename="generated.mid", folder="/kaggle/working"):
    abc_clean = re.sub(r"(?m)^[XTMKLZ]:.*$", "", abc_string).strip()

    abc_full = "X:1\nT:Generated\nM:4/4\nK:C\n" + abc_clean + "\nZ:1"

    try:
        score = converter.parse(abc_full, format='abc')
        midi_path = f"{folder}/{filename}"
        score.write('midi', fp=midi_path)
        print(f"Saved to: {midi_path}")
    except Exception as e:
        print("Error:", e)


In [39]:
abc = generate_abc("mood: melancholic | sound_type: melodic lead | rhythm: no beat")
save_abc_to_midi(abc, filename="dreamy_piano.mid")


Saved to: /kaggle/working/dreamy_piano.mid
