# Piano MIDI Generation - Inference

This notebook generates MIDI files from the trained model using metadata conditions.

## Overview

1. **Load model and vocabulary** - From checkpoint and processed data
2. **Setup tokenizers** - For metadata and MIDI conversion
3. **Generate MIDI tokens** - Autoregressive generation from metadata
4. **Convert tokens to MIDI** - Reverse tokenization to MIDI format
5. **Save MIDI file** - Export as .mid for DAW import

## Usage

Set your desired metadata (genre, composer, period) and run the generation cell.
The output MIDI file can be imported into FL Studio or any other DAW.


## Step 1: Imports and Setup


In [1]:
import torch
import torch.nn as nn
import json
from pathlib import Path
import re
from typing import List, Optional, Dict
import mido
from datetime import datetime

# Import model architecture
try:
    from model import PianoMIDIGenerator, PositionalEncoding, TransformerBlock
    print("‚úÖ Model classes imported from model.py")
except ImportError:
    print("‚ö†Ô∏è  Could not import from model.py")
    print("   Please ensure model.py exists in the same directory")
    raise

print("‚úÖ Libraries imported")


‚úÖ Model classes imported from model.py
‚úÖ Libraries imported


## Step 2: Load Model and Vocabulary


In [2]:
# Configuration
DATA_DIR = Path("processed_data")
CHECKPOINT_DIR = Path("checkpoints")
OUTPUT_DIR = Path("generated_midi")
OUTPUT_DIR.mkdir(exist_ok=True)

# Load vocabulary
with open(DATA_DIR / "vocab.json", 'r') as f:
    vocab = json.load(f)

with open(DATA_DIR / "id_to_token.json", 'r') as f:
    id_to_token = json.load(f)
    # Convert keys to int
    id_to_token = {int(k): v for k, v in id_to_token.items()}

vocab_size = len(vocab)
pad_token_id = vocab.get('<PAD>', 0)
start_token_id = vocab.get('<START>', 2)
end_token_id = vocab.get('<END>', 3)

print(f"‚úÖ Loaded vocabulary: {vocab_size:,} tokens")
print(f"   Pad token ID: {pad_token_id}")
print(f"   Start token ID: {start_token_id}")
print(f"   End token ID: {end_token_id}")


‚úÖ Loaded vocabulary: 746 tokens
   Pad token ID: 0
   Start token ID: 2
   End token ID: 3


In [3]:
# Load model configuration
with open(DATA_DIR / "preprocessing_config.json", 'r') as f:
    preprocess_config = json.load(f)

MODEL_CONFIG = {
    'vocab_size': vocab_size,
    'max_seq_length': preprocess_config['max_sequence_length'],
    'd_model': 512,
    'n_layers': 6,
    'n_heads': 8,
    'd_ff': 2048,
    'dropout': 0.0,  # No dropout during inference
    'pad_token_id': pad_token_id,
}

print(f"‚úÖ Model config loaded:")
print(f"   Vocab size: {MODEL_CONFIG['vocab_size']}")
print(f"   Max sequence length: {MODEL_CONFIG['max_seq_length']}")


‚úÖ Model config loaded:
   Vocab size: 746
   Max sequence length: 2048


In [4]:
# Setup device
device = torch.device('cpu')
print(f"‚úÖ Using device: {device}")

# Load model checkpoint
checkpoint_path = CHECKPOINT_DIR / 'checkpoint_best.pt'

if not checkpoint_path.exists():
    print(f"‚ö†Ô∏è  Best checkpoint not found at {checkpoint_path}")
    print(f"   Available checkpoints:")
    for cp in CHECKPOINT_DIR.glob('*.pt'):
        print(f"     - {cp.name}")
    raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")

print(f"üìÇ Loading checkpoint: {checkpoint_path}")
checkpoint = torch.load(checkpoint_path, map_location=device)

# Create model
model = PianoMIDIGenerator(MODEL_CONFIG)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
model.eval()  # Set to evaluation mode

print(f"‚úÖ Model loaded from checkpoint:")
print(f"   Epoch: {checkpoint['epoch'] + 1}")
print(f"   Step: {checkpoint['step']:,}")
print(f"   Validation loss: {checkpoint['val_loss']:.4f}")
print(f"   Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")


‚úÖ Using device: cpu
üìÇ Loading checkpoint: checkpoints\checkpoint_best.pt
‚úÖ Model loaded from checkpoint:
   Epoch: 8
   Step: 11,900
   Validation loss: 2.4088
   Model parameters: 19.7M


## Step 3: Setup Tokenizers


In [5]:
# Metadata tokenizer (same as preprocessing)
class MetadataTokenizer:
    def __init__(self, include_composer=True, top_n_composers=100):
        self.include_composer = include_composer
        self.valid_genres = {'classical', 'pop', 'soundtrack', 'jazz', 'rock', 'folk', 'ambient', 'ragtime', 'blues', 'atonal'}
        self.valid_periods = {'contemporary', 'modern', 'romantic', 'classical', 'baroque', 'impressionist'}
        self.top_composers = self._load_top_composers(top_n_composers)
    
    def _load_top_composers(self, n):
        top = {'hisaishi', 'satie', 'yiruma', 'einaudi', 'joplin', 'chopin', 'beethoven', 'bach', 'mozart', 'debussy',
               'schubert', 'schumann', 'liszt', 'rachmaninoff', 'tchaikovsky', 'ravel', 'poulenc', 'faure', 'bartok'}
        return {self._normalize_composer(c) for c in top}
    
    def _normalize_composer(self, composer):
        if not composer:
            return ""
        normalized = composer.lower().strip()
        normalized = normalized.replace('√©', 'e').replace('√®', 'e').replace('√°', 'a').replace('√†', 'a')
        normalized = normalized.replace('√≠', 'i').replace('√¨', 'i').replace('√≥', 'o').replace('√≤', 'o')
        normalized = normalized.replace('√∫', 'u').replace('√π', 'u').replace('√±', 'n')
        normalized = re.sub(r'[^a-z0-9\s-]', '', normalized)
        normalized = re.sub(r'\s+', ' ', normalized).strip()
        return normalized
    
    def metadata_to_tokens(self, metadata, include_start=True):
        tokens = []
        if include_start:
            tokens.append("START")
        
        if metadata.get('genre'):
            genre = metadata['genre'].lower().strip()
            if genre in self.valid_genres:
                tokens.append(f"GENRE:{genre}")
        
        if metadata.get('music_period'):
            period = metadata['music_period'].lower().strip()
            if period in self.valid_periods:
                tokens.append(f"PERIOD:{period}")
        
        if self.include_composer and metadata.get('composer'):
            composer = self._normalize_composer(metadata['composer'])
            if composer in self.top_composers:
                tokens.append(f"COMPOSER:{composer}")
        
        return tokens

meta_tokenizer = MetadataTokenizer(include_composer=True)
print("‚úÖ Metadata tokenizer created")


‚úÖ Metadata tokenizer created


In [6]:
# MIDI tokenizer (reverse conversion)
class MIDITokenizer:
    def __init__(self, time_quantization=10):
        self.time_quantization = time_quantization
    
    def tokens_to_midi(self, tokens: List[str], output_path: Path, tempo=120, ticks_per_beat=480):
        """
        Convert tokens back to MIDI file
        
        Args:
            tokens: List of token strings
            output_path: Path to save MIDI file
            tempo: Tempo in BPM
            ticks_per_beat: MIDI ticks per quarter note (default 480)
                           IMPORTANT: Must match the ticks_per_beat of original MIDI files
                           to preserve correct timing. Try 480, 960, or 384 if timing is off.
        """
        mid = mido.MidiFile(ticks_per_beat=ticks_per_beat)
        track = mido.MidiTrack()
        mid.tracks.append(track)
        
        # Set tempo (microseconds per quarter note)
        tempo_us = mido.bpm2tempo(tempo)
        track.append(mido.MetaMessage('set_tempo', tempo=tempo_us))
        
        # IMPORTANT: TIME_SHIFT values represent quantized MIDI ticks from the original files
        # The preprocessing: quantized_time = (current_time // time_quantization) * time_quantization
        # where current_time is accumulated from msg.time (which is in ticks from original MIDI)
        # So TIME_SHIFT:1280 means 1280 ticks from the ORIGINAL file's ticks_per_beat
        # If original had 960 ticks_per_beat and we use 480, timing will be compressed by 2x!
        # We use these tick values directly, assuming ticks_per_beat matches the original
        
        current_time_ticks = 0  # Accumulate time in MIDI ticks
        pending_note_on = None  # (note, velocity, start_time)
        
        i = 0
        while i < len(tokens):
            token = tokens[i]
            
            # Skip metadata and control tokens
            if token.startswith('START') or token.startswith('GENRE:') or token.startswith('PERIOD:') or token.startswith('COMPOSER:') or token in ['<END>', '<PAD>', '<UNK>']:
                i += 1
                continue
            
            # Parse TIME_SHIFT (value is in MIDI ticks, already quantized)
            # Handle TIME_SHIFT: accumulate consecutive TIME_SHIFT tokens until we hit a note event
            # This handles cases where model generates multiple TIME_SHIFT tokens
            if token.startswith('TIME_SHIFT:'):
                # Accumulate all consecutive TIME_SHIFT tokens
                accumulated_ticks = 0
                while i < len(tokens) and tokens[i].startswith('TIME_SHIFT:'):
                    time_ticks = int(tokens[i].split(':')[1])
                    accumulated_ticks += time_ticks
                    i += 1
                current_time_ticks = accumulated_ticks
                continue  # Continue to process the next token (should be a note event)
            # Parse NOTE_ON
            elif token.startswith('NOTE_ON:'):
                note = int(token.split(':')[1])
                velocity = 64  # Default velocity
                
                # Check if next token is VELOCITY
                if i + 1 < len(tokens) and tokens[i + 1].startswith('VELOCITY:'):
                    velocity = int(tokens[i + 1].split(':')[1])
                    i += 2
                else:
                    i += 1
                
                # TIME_SHIFT values are already in MIDI ticks, use directly
                track.append(mido.Message('note_on', channel=0, note=note, velocity=velocity, time=current_time_ticks))
                current_time_ticks = 0  # Reset accumulated time
                pending_note_on = (note, velocity)
            
            # Parse NOTE_OFF
            elif token.startswith('NOTE_OFF:'):
                note = int(token.split(':')[1])
                
                # TIME_SHIFT values are already in MIDI ticks, use directly
                track.append(mido.Message('note_off', channel=0, note=note, velocity=0, time=current_time_ticks))
                current_time_ticks = 0  # Reset accumulated time
                i += 1
            
            # Parse VELOCITY (standalone - should not happen in proper sequence)
            elif token.startswith('VELOCITY:'):
                i += 1
                continue
            else:
                i += 1
        
        mid.save(output_path)
        return mid

midi_tokenizer = MIDITokenizer(time_quantization=10)
print("‚úÖ MIDI tokenizer created")


‚úÖ MIDI tokenizer created


In [7]:
def generate_midi(
    model,
    vocab,
    id_to_token,
    meta_tokenizer,
    metadata: Dict[str, str],
    max_length: int = 1024,
    min_length: int = 500,
    temperature: float = 1.0,
    top_k: Optional[int] = 50,
    device='cpu'
):
    """
    Generate MIDI tokens from metadata conditions.
    
    Args:
        model: Trained PianoMIDIGenerator model
        vocab: Token to ID mapping
        id_to_token: ID to token mapping
        meta_tokenizer: MetadataTokenizer instance
        metadata: Dictionary with 'genre', 'composer', 'music_period' keys
        max_length: Maximum generation length
        min_length: Minimum generation length before allowing early stop
        temperature: Sampling temperature (higher = more random)
        top_k: Top-k sampling (None = no filtering)
        device: Device to run generation on
    
    Returns:
        List of generated tokens
    """
    model.eval()
    
    # Convert metadata to tokens
    metadata_tokens = meta_tokenizer.metadata_to_tokens(metadata, include_start=True)
    print(f"üìù Metadata tokens: {metadata_tokens}")
    
    # Convert metadata tokens to IDs
    input_ids = [vocab.get(token, vocab.get('<UNK>', 1)) for token in metadata_tokens]
    
    # Generation loop
    generated_tokens = metadata_tokens.copy()
    
    with torch.no_grad():
        while len(generated_tokens) < max_length:
            # Prepare input (limit to max_seq_length)
            current_input = input_ids[-MODEL_CONFIG['max_seq_length']:] if len(input_ids) > MODEL_CONFIG['max_seq_length'] else input_ids
            input_tensor = torch.tensor([current_input], dtype=torch.long, device=device)
            attention_mask = torch.ones_like(input_tensor, dtype=torch.long)
            
            # Forward pass
            logits = model(input_tensor, attention_mask=attention_mask)
            
            # Get logits for last position
            next_token_logits = logits[0, -1, :] / temperature
            
            # Apply top-k filtering
            if top_k is not None:
                top_k_logits, top_k_indices = torch.topk(next_token_logits, top_k)
                # Create filtered logits
                filtered_logits = torch.full_like(next_token_logits, float('-inf'))
                filtered_logits[top_k_indices] = top_k_logits
                next_token_logits = filtered_logits
            
            # Sample from distribution
            probs = torch.softmax(next_token_logits, dim=-1)
            next_token_id = torch.multinomial(probs, num_samples=1).item()
            
            # Convert to token
            next_token = id_to_token.get(next_token_id, '<UNK>')
            generated_tokens.append(next_token)
            input_ids.append(next_token_id)
            
            # Stop if end token (only if we've reached minimum length)
            if (next_token == '<END>' or next_token_id == vocab.get('<END>', 3)) and len(generated_tokens) >= min_length:
                print(f"   Reached <END> token at {len(generated_tokens)} tokens (min: {min_length})")
                break
            
            # Prevent infinite loops
            if len(generated_tokens) % 100 == 0:
                print(f"   Generated {len(generated_tokens)} tokens...", end='\r')
    
    print()  # New line after progress
    return generated_tokens

print("‚úÖ Generation function defined")


‚úÖ Generation function defined


## Step 5: Generate MIDI File


In [8]:
# ============================================
# GENERATION CONFIGURATION
# ============================================
# Modify these values to change generation settings

METADATA = {
    'genre': 'classical',  # Options: 'classical', 'pop', 'jazz', 'soundtrack', 'rock', 'folk', 'ambient', etc.
    'composer': 'chopin',  # Options: 'chopin', 'beethoven', 'bach', 'mozart', 'debussy', 'yiruma', 'einaudi', etc. (or None)
    'music_period': 'romantic',  # Options: 'romantic', 'classical', 'baroque', 'contemporary', 'modern', 'impressionist' (or None)
}

GENERATION_CONFIG = {
    'max_length': 2000,  # Maximum number of tokens to generate (matches training sequence length ~1-2 minutes)
    'temperature': 0.8,  # Lower = more deterministic, Higher = more creative (0.5-1.5 range)
    'top_k': 50,  # Top-k sampling: only sample from top K most likely tokens (None to disable)
    'tempo': 120,  # MIDI tempo (BPM)
    'min_length': 1500,  # Minimum tokens to generate before allowing early stop (ensures ~1 min minimum)
}

print("üìù Generation Configuration:")
print(f"   Metadata: {METADATA}")
print(f"   Max length: {GENERATION_CONFIG['max_length']}")
print(f"   Min length: {GENERATION_CONFIG.get('min_length', 500)}")
print(f"   Temperature: {GENERATION_CONFIG['temperature']}")
print(f"   Top-k: {GENERATION_CONFIG['top_k']}")
print(f"   Tempo: {GENERATION_CONFIG['tempo']} BPM")


üìù Generation Configuration:
   Metadata: {'genre': 'classical', 'composer': 'chopin', 'music_period': 'romantic'}
   Max length: 2000
   Min length: 1500
   Temperature: 0.8
   Top-k: 50
   Tempo: 120 BPM


In [9]:
# Generate tokens
print("\nüéπ Generating MIDI tokens...")
print("=" * 60)

generated_tokens = generate_midi(
    model=model,
    vocab=vocab,
    id_to_token=id_to_token,
    meta_tokenizer=meta_tokenizer,
    metadata=METADATA,
    max_length=GENERATION_CONFIG['max_length'],
    min_length=GENERATION_CONFIG.get('min_length', 500),
    temperature=GENERATION_CONFIG['temperature'],
    top_k=GENERATION_CONFIG['top_k'],
    device=device
)

print(f"\n‚úÖ Generated {len(generated_tokens)} tokens")
print(f"\nFirst 50 tokens: {generated_tokens[:50]}")

# Filter out metadata tokens for MIDI conversion
midi_tokens = [
    token for token in generated_tokens 
    if not (token.startswith('START') or token.startswith('GENRE:') or 
            token.startswith('PERIOD:') or token.startswith('COMPOSER:') or 
            token in ['<END>', '<PAD>', '<UNK>'])
]

print(f"\nüìä MIDI tokens (filtered): {len(midi_tokens)} tokens")
print(f"\nFirst 30 MIDI tokens: {midi_tokens[:30]}")



üéπ Generating MIDI tokens...
üìù Metadata tokens: ['START', 'GENRE:classical', 'PERIOD:romantic', 'COMPOSER:chopin']
   Generated 2000 tokens...

‚úÖ Generated 2000 tokens

First 50 tokens: ['START', 'GENRE:classical', 'PERIOD:romantic', 'COMPOSER:chopin', 'TIME_SHIFT:390', 'NOTE_ON:72', 'VELOCITY:65', 'TIME_SHIFT:30', 'NOTE_OFF:72', 'TIME_SHIFT:20', 'NOTE_OFF:72', 'TIME_SHIFT:10', 'NOTE_ON:48', 'VELOCITY:45', 'NOTE_ON:60', 'VELOCITY:65', 'NOTE_ON:72', 'VELOCITY:75', 'TIME_SHIFT:10', 'NOTE_ON:63', 'VELOCITY:60', 'TIME_SHIFT:20', 'NOTE_OFF:60', 'TIME_SHIFT:10', 'NOTE_OFF:48', 'TIME_SHIFT:10', 'NOTE_OFF:63', 'TIME_SHIFT:70', 'NOTE_OFF:67', 'TIME_SHIFT:60', 'NOTE_ON:77', 'VELOCITY:85', 'TIME_SHIFT:10', 'NOTE_ON:56', 'VELOCITY:75', 'TIME_SHIFT:10', 'NOTE_ON:65', 'VELOCITY:75', 'TIME_SHIFT:20', 'NOTE_OFF:75', 'NOTE_OFF:70', 'TIME_SHIFT:10', 'NOTE_OFF:56', 'TIME_SHIFT:100', 'NOTE_ON:75', 'VELOCITY:90', 'TIME_SHIFT:10', 'NOTE_ON:75', 'VELOCITY:85', 'TIME_SHIFT:10']

üìä MIDI tokens (filt

In [10]:
# Convert tokens to MIDI and save
print("\nüéµ Converting tokens to MIDI file...")

# Generate filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
genre_str = METADATA.get('genre', 'unknown')
composer_str = METADATA.get('composer', 'unknown')
filename = f"generated_{genre_str}_{composer_str}_{timestamp}.mid"
output_path = OUTPUT_DIR / filename

# Convert and save
# IMPORTANT: Original MIDI files use ticks_per_beat=500
# TIME_SHIFT values are quantized MIDI ticks from files with ticks_per_beat=500
# We must use ticks_per_beat=500 to preserve correct timing!
midi_tokenizer.tokens_to_midi(
    tokens=midi_tokens,
    output_path=output_path,
    tempo=GENERATION_CONFIG['tempo'],
    ticks_per_beat=500  # MUST match original files' ticks_per_beat (verified: all files use 500)
)

print(f"\n‚úÖ MIDI file saved: {output_path}")
print(f"   File size: {output_path.stat().st_size / 1024:.2f} KB")
print(f"\nüìÅ You can now import this file into FL Studio or any DAW!")



üéµ Converting tokens to MIDI file...

‚úÖ MIDI file saved: generated_midi\generated_classical_chopin_20251103_110342.mid
   File size: 2.62 KB

üìÅ You can now import this file into FL Studio or any DAW!


In [11]:
# ============================================
# TIMING DIAGNOSTIC
# ============================================
# If your generated MIDI is too fast/slow, try different ticks_per_beat values
# This cell helps you test different values

# Calculate total time from TIME_SHIFT tokens
total_time_ticks = 0
for token in midi_tokens:
    if token.startswith('TIME_SHIFT:'):
        total_time_ticks += int(token.split(':')[1])

print(f"üìä Timing Analysis:")
print(f"   Total TIME_SHIFT ticks: {total_time_ticks:,}")
print(f"   Number of MIDI events: {len([t for t in midi_tokens if t.startswith('NOTE_ON') or t.startswith('NOTE_OFF')])}")

# Calculate duration at different ticks_per_beat values
# Original files use ticks_per_beat=500, so that should give correct timing
for tpb in [240, 384, 480, 500, 960]:
    # At tempo BPM, beats per second = tempo/60
    # ticks per second = ticks_per_beat * beats_per_second
    # duration = total_ticks / ticks_per_second
    tempo = GENERATION_CONFIG['tempo']
    ticks_per_second = tpb * tempo / 60
    duration_seconds = total_time_ticks / ticks_per_second
    duration_minutes = duration_seconds / 60
    
    marker = " ‚≠ê CORRECT (matches original files)" if tpb == 500 else ""
    print(f"   At {tpb} ticks_per_beat: {duration_seconds:.1f}s ({duration_minutes:.2f} min){marker}")

print(f"\nüí° Original MIDI files use ticks_per_beat=500")
print(f"   Use ticks_per_beat=500 for correct timing (already set in generation cell above)")


üìä Timing Analysis:
   Total TIME_SHIFT ticks: 18,990
   Number of MIDI events: 797
   At 240 ticks_per_beat: 39.6s (0.66 min)
   At 384 ticks_per_beat: 24.7s (0.41 min)
   At 480 ticks_per_beat: 19.8s (0.33 min)
   At 500 ticks_per_beat: 19.0s (0.32 min) ‚≠ê CORRECT (matches original files)
   At 960 ticks_per_beat: 9.9s (0.16 min)

üí° Original MIDI files use ticks_per_beat=500
   Use ticks_per_beat=500 for correct timing (already set in generation cell above)


## Step 6: Convert MIDI to WAV using Soundfonts

Convert the generated MIDI file to WAV audio using one of the available soundfonts (SF2 files).


In [12]:
# Import audio processing libraries
import subprocess
import random
import os

# Custom fluidsynth path (Windows)
CUSTOM_FLUIDSYNTH_PATH = r"C:\Users\Vikas Gari\Downloads\fluidsynth-v2.5.1-win10-x64-cpp11\fluidsynth-v2.5.1-win10-x64-cpp11\bin"
FLUIDSYNTH_EXE = os.path.join(CUSTOM_FLUIDSYNTH_PATH, "fluidsynth.exe")

# Global flags for fluidsynth availability
USE_FLUIDSYNTH_LIB = False
USE_FLUIDSYNTH_CMD = False
FLUIDSYNTH_CMD = None  # Will be set to the command to use

# Try to use pyfluidsynth library if available
try:
    import fluidsynth
    USE_FLUIDSYNTH_LIB = True
    print("‚úÖ Found fluidsynth Python library")
except ImportError:
    USE_FLUIDSYNTH_LIB = False

# Check custom path first, then system PATH
fluidsynth_found = False

# Check custom path
if os.path.exists(FLUIDSYNTH_EXE):
    FLUIDSYNTH_CMD = FLUIDSYNTH_EXE
    try:
        result = subprocess.run([FLUIDSYNTH_CMD, '--version'], capture_output=True, text=True, timeout=2)
        if result.returncode == 0:
            USE_FLUIDSYNTH_CMD = True
            fluidsynth_found = True
            print(f"‚úÖ Found fluidsynth at custom path: {FLUIDSYNTH_CMD}")
            print(f"   Version: {result.stdout.strip()}")
    except Exception as e:
        pass

# If not found in custom path, check system PATH
if not fluidsynth_found:
    try:
        result = subprocess.run(['fluidsynth', '--version'], capture_output=True, text=True, timeout=2)
        if result.returncode == 0:
            USE_FLUIDSYNTH_CMD = True
            FLUIDSYNTH_CMD = 'fluidsynth'
            fluidsynth_found = True
            print("‚úÖ Found fluidsynth in system PATH")
            print(f"   Version: {result.stdout.strip()}")
    except (FileNotFoundError, subprocess.TimeoutExpired):
        pass

if not USE_FLUIDSYNTH_LIB and not USE_FLUIDSYNTH_CMD:
    print("‚ö†Ô∏è  fluidsynth not found. Options:")
    print("   1. Install Python library: pip install pyfluidsynth")
    print("   2. Add fluidsynth to system PATH")
    print(f"   3. Update CUSTOM_FLUIDSYNTH_PATH in this cell if your path is different")
    print(f"   Current custom path: {CUSTOM_FLUIDSYNTH_PATH}")
else:
    print("‚úÖ Audio conversion setup complete")


‚úÖ Found fluidsynth at custom path: C:\Users\Vikas Gari\Downloads\fluidsynth-v2.5.1-win10-x64-cpp11\fluidsynth-v2.5.1-win10-x64-cpp11\bin\fluidsynth.exe
   Version: FluidSynth runtime version 2.5.1
Copyright (C) 2000-2025 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.

FluidSynth executable version 2.5.1
Sample type=float
‚úÖ Audio conversion setup complete


In [13]:
# List available soundfonts
SOUNDFONT_DIR = Path("sound_fonts")
WAV_OUTPUT_DIR = OUTPUT_DIR  # Save WAV files alongside MIDI files

soundfont_files = sorted(list(SOUNDFONT_DIR.glob("*.sf2")) + list(SOUNDFONT_DIR.glob("*.SF2")))

if not soundfont_files:
    print("‚ö†Ô∏è  No soundfont files found in sound_fonts/ directory")
else:
    print(f"‚úÖ Found {len(soundfont_files)} soundfont files")
    print(f"\nFirst 10 soundfonts:")
    for i, sf in enumerate(soundfont_files[:10], 1):
        print(f"   {i}. {sf.name}")
    if len(soundfont_files) > 10:
        print(f"   ... and {len(soundfont_files) - 10} more")
    
    print(f"\nüí° You can select a specific soundfont or use 'random' to pick one automatically")


‚úÖ Found 1000 soundfont files

First 10 soundfonts:
   1. 16-Bit_Game_Station.sf2
   2. 16-Bit_Game_Station.sf2
   3. 2MBGMGSMT.sf2
   4. 2MBGMGSMT.sf2
   5. 32MbGMStereo.sf2
   6. 32MbGMStereo.sf2
   7. 4MBGM_Plus12.sf2
   8. 4MBGM_Plus12.sf2
   9. 4MBGMGSMT.sf2
   10. 4MBGMGSMT.sf2
   ... and 990 more

üí° You can select a specific soundfont or use 'random' to pick one automatically


In [14]:
# ============================================
# SOUNDFONT SELECTION
# ============================================
# Choose a soundfont to use for MIDI to WAV conversion
# Options:
# - 'random': Randomly select from available soundfonts
# - Integer index: Use soundfont at that index (0-based)
# - String name: Use soundfont with that exact filename
# - None: Skip WAV generation

SOUNDFONT_SELECTION = 'random'  # Change this to select a specific soundfont
# SOUNDFONT_SELECTION = 0  # Use first soundfont
# SOUNDFONT_SELECTION = 'FluidR3_GM.sf2'  # Use specific soundfont by name
# SOUNDFONT_SELECTION = None  # Skip WAV generation

if soundfont_files and SOUNDFONT_SELECTION is not None:
    if SOUNDFONT_SELECTION == 'random':
        selected_soundfont = random.choice(soundfont_files)
        print(f"üéπ Randomly selected soundfont: {selected_soundfont.name}")
    elif isinstance(SOUNDFONT_SELECTION, int):
        if 0 <= SOUNDFONT_SELECTION < len(soundfont_files):
            selected_soundfont = soundfont_files[SOUNDFONT_SELECTION]
            print(f"üéπ Selected soundfont (index {SOUNDFONT_SELECTION}): {selected_soundfont.name}")
        else:
            print(f"‚ö†Ô∏è  Invalid index {SOUNDFONT_SELECTION}. Using random instead.")
            selected_soundfont = random.choice(soundfont_files)
            print(f"   Using: {selected_soundfont.name}")
    elif isinstance(SOUNDFONT_SELECTION, str):
        # Find by name
        found = None
        for sf in soundfont_files:
            if sf.name.lower() == SOUNDFONT_SELECTION.lower():
                found = sf
                break
        if found:
            selected_soundfont = found
            print(f"üéπ Selected soundfont: {selected_soundfont.name}")
        else:
            print(f"‚ö†Ô∏è  Soundfont '{SOUNDFONT_SELECTION}' not found. Using random instead.")
            selected_soundfont = random.choice(soundfont_files)
            print(f"   Using: {selected_soundfont.name}")
    else:
        selected_soundfont = None
        print("‚ö†Ô∏è  Invalid SOUNDFONT_SELECTION. Skipping WAV generation.")
else:
    selected_soundfont = None
    if not soundfont_files:
        print("‚ö†Ô∏è  No soundfonts available. Skipping WAV generation.")


üéπ Randomly selected soundfont: General_MIDI_64_1.6.sf2


In [15]:
def midi_to_wav(midi_path: Path, soundfont_path: Path, wav_output_path: Path, sample_rate=44100):
    """
    Convert MIDI file to WAV using a soundfont.
    
    Args:
        midi_path: Path to input MIDI file
        soundfont_path: Path to SF2 soundfont file
        wav_output_path: Path to output WAV file
        sample_rate: Audio sample rate (default 44100 Hz)
    
    Returns:
        True if successful, False otherwise
    """
    # Try command-line first (most reliable)
    if USE_FLUIDSYNTH_CMD and FLUIDSYNTH_CMD:
        try:
            # Using fluidsynth command-line tool
            # Command: fluidsynth -a file -F output.wav soundfont.sf2 input.mid
            cmd = [
                FLUIDSYNTH_CMD,
                '-a', 'file',  # Audio driver: file (write to WAV)
                '-F', str(wav_output_path),  # Output file
                '-r', str(sample_rate),  # Sample rate
                str(soundfont_path),  # Soundfont file
                str(midi_path)  # MIDI file
            ]
            
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
            
            if result.returncode == 0:
                return True
            else:
                print(f"‚ö†Ô∏è  fluidsynth error: {result.stderr}")
                if result.stdout:
                    print(f"   Output: {result.stdout}")
                return False
        except FileNotFoundError:
            print(f"‚ö†Ô∏è  fluidsynth not found at: {FLUIDSYNTH_CMD}")
            print(f"   Please check the CUSTOM_FLUIDSYNTH_PATH in the setup cell")
            return False
        except subprocess.TimeoutExpired:
            print("‚ö†Ô∏è  fluidsynth conversion timed out (took longer than 5 minutes)")
            return False
        except Exception as e:
            print(f"‚ö†Ô∏è  Error running fluidsynth: {e}")
            return False
    
    # Try Python library as fallback (if available)
    if USE_FLUIDSYNTH_LIB:
        try:
            import fluidsynth
            import numpy as np
            
            # Initialize synthesizer
            fs = fluidsynth.Synth(samplerate=float(sample_rate))
            sfid = fs.sfload(str(soundfont_path))
            fs.program_select(0, sfid, 0, 0)
            fs.start()
            
            # Read MIDI file and process events
            mid = mido.MidiFile(midi_path)
            sample_rate = float(sample_rate)
            total_time = mid.length  # Duration in seconds
            total_samples = int(total_time * sample_rate)
            
            # Create audio buffer
            samples = np.zeros((total_samples, 2), dtype=np.float32)
            
            # Process MIDI events
            current_time = 0.0
            for msg in mid:
                current_time += msg.time
                sample_idx = int(current_time * sample_rate)
                
                if msg.type == 'note_on' and msg.velocity > 0:
                    fs.noteon(msg.channel if hasattr(msg, 'channel') else 0, msg.note, msg.velocity)
                elif msg.type == 'note_off' or (msg.type == 'note_on' and msg.velocity == 0):
                    fs.noteoff(msg.channel if hasattr(msg, 'channel') else 0, msg.note)
                elif msg.type == 'program_change':
                    fs.program_select(msg.channel if hasattr(msg, 'channel') else 0, sfid, 0, msg.program)
            
            # Render audio
            fs.get_samples(samples)
            
            # Convert to 16-bit PCM and save as WAV
            import wave
            import struct
            
            # Normalize and convert to int16
            max_val = np.abs(samples).max()
            if max_val > 0:
                samples = samples / max_val * 0.95  # Prevent clipping
            samples_int16 = (samples * 32767).astype(np.int16)
            
            # Write WAV file
            with wave.open(str(wav_output_path), 'wb') as wav_file:
                wav_file.setnchannels(2)  # Stereo
                wav_file.setsampwidth(2)  # 16-bit
                wav_file.setframerate(int(sample_rate))
                wav_file.writeframes(samples_int16.tobytes())
            
            fs.stop()
            return True
            
        except Exception as e:
            print(f"‚ö†Ô∏è  Error with fluidsynth library: {e}")
            print("   Falling back to command-line method if available...")
            # Don't return False yet, try command-line
    
    # If we get here, both methods failed
    if not USE_FLUIDSYNTH_CMD:
        return False
    else:
        return False  # Command-line already tried above

print("‚úÖ WAV conversion function defined")


‚úÖ WAV conversion function defined


In [16]:
# Convert MIDI to WAV if soundfont is selected
if selected_soundfont and output_path.exists():
    print(f"\nüéµ Converting MIDI to WAV using soundfont: {selected_soundfont.name}")
    print("=" * 60)
    
    # Generate WAV filename
    wav_filename = output_path.stem + ".wav"
    wav_output_path = WAV_OUTPUT_DIR / wav_filename
    
    # Convert
    success = midi_to_wav(
        midi_path=output_path,
        soundfont_path=selected_soundfont,
        wav_output_path=wav_output_path,
        sample_rate=44100  # CD quality
    )
    
    if success and wav_output_path.exists():
        file_size_mb = wav_output_path.stat().st_size / (1024 * 1024)
        print(f"\n‚úÖ WAV file saved: {wav_output_path}")
        print(f"   File size: {file_size_mb:.2f} MB")
        print(f"   Sample rate: 44100 Hz")
        print(f"   Soundfont: {selected_soundfont.name}")
        print(f"\nüéß You can now listen to the generated audio!")
    else:
        print(f"\n‚ö†Ô∏è  WAV conversion failed. MIDI file is still available at: {output_path}")
        print(f"   Try installing fluidsynth or check soundfont file: {selected_soundfont}")
else:
    if not selected_soundfont:
        print("\n‚ö†Ô∏è  No soundfont selected. Skipping WAV generation.")
        print(f"   MIDI file is available at: {output_path}")
    elif not output_path.exists():
        print(f"\n‚ö†Ô∏è  MIDI file not found: {output_path}")
        print("   Please run the MIDI generation cell first.")



üéµ Converting MIDI to WAV using soundfont: General_MIDI_64_1.6.sf2

‚úÖ WAV file saved: generated_midi\generated_classical_chopin_20251103_110342.wav
   File size: 3.70 MB
   Sample rate: 44100 Hz
   Soundfont: General_MIDI_64_1.6.sf2

üéß You can now listen to the generated audio!
