<a href="https://colab.research.google.com/github/Redsu1951/video-bgm-generation/blob/main/finalprojecttelbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Demo of Controllable Music Transformer**

We provide a Colab notebook for running inference with CMT. You can upload a video and generate background music for it using this notebook.

# 1. Preparation

Clone the repo

In [None]:
import os
from google.colab import files
import json

# Clone the project under /content and make the checkout the working directory;
# the setup cell below refers to py3_requirements.txt by a repo-relative path.
os.chdir('/content')
!git clone https://github.com/Redsu1951/video-bgm-generation
os.chdir('/content/video-bgm-generation')

In [None]:
# Environment Setup - These should be run first
import os
# Audio/runtime environment variables for this kernel process.
# NOTE(review): the "dummy" SDL audio driver presumably avoids needing a real
# sound device on Colab — confirm.
os.environ['XDG_RUNTIME_DIR'] = "/tmp/runtime"
os.environ["ALSA_CARD"] = "hw:0"
os.environ["SDL_AUDIODRIVER"] = "dummy"

# Install System Dependencies
!apt-get update && apt-get install -y libfluidsynth1 build-essential libasound2-dev libjack-dev fluidsynth libsndfile1 ffmpeg

# Install Python Dependencies
# py3_requirements.txt is resolved relative to the current working directory.
!pip install --upgrade pip
!pip install pytorch-fast-transformers==0.4.0
!pip install -r py3_requirements.txt
# NOTE(review): numpy is pinned to 1.23.5 here, but a later cell installs
# numpy==1.24 — whichever runs last wins; confirm the intended version.
# NOTE(review): verify that `skvideo` is the intended PyPI distribution name.
!pip install skvideo moviepy==1.0.3 muspy numpy==1.23.5

# Install Additional Packages if needed
# NOTE(review): this is the PyPI package named `ffmpeg`, separate from the
# ffmpeg binary installed with apt above — confirm it is actually required.
!pip install ffmpeg

# Download Required Files from Google Cloud Storage
# (model checkpoint into exp/, soundfont into the repository root)
!gsutil -m cp gs://cmt/loss_8_params.pt /content/video-bgm-generation/exp/
!gsutil -m cp gs://magentadata/soundfonts/SGM-v2.01-Sal-Guit-Bass-V1.3.sf2 /content/video-bgm-generation/

# Navigate to the appropriate directory and install additional modules
# The working directory stays at visbeat3/ after this cell finishes.
os.chdir("/content/video-bgm-generation/src/video2npz/visbeat3/")
!python setup.py install

In [None]:
# Install muspy (the setup cell's combined pip command already lists it).
!pip install muspy

In [None]:
import os
import telebot
import traceback
import subprocess
import shutil

# Telegram Bot Token
# Never commit the token to the notebook: anyone who can read it can control the
# bot. It is taken from the TELEGRAM_BOT_TOKEN environment variable, falling
# back to a hidden interactive prompt. Any token that was previously committed
# to this notebook should be revoked via @BotFather.
BOT_TOKEN = os.environ.get('TELEGRAM_BOT_TOKEN')
if not BOT_TOKEN:
    from getpass import getpass
    BOT_TOKEN = getpass('Telegram bot token: ')

# Paths
PROJECT_ROOT = '/content/video-bgm-generation'
VIDEOS_DIR = os.path.join(PROJECT_ROOT, 'videos')          # uploaded / converted videos
INFERENCE_DIR = os.path.join(PROJECT_ROOT, 'inference')    # npz inputs and generated MIDI
VIDEO2NPZ_DIR = os.path.join(PROJECT_ROOT, 'src/video2npz')  # preprocessing scripts

# Ensure required directories exist
os.makedirs(VIDEOS_DIR, exist_ok=True)
os.makedirs(INFERENCE_DIR, exist_ok=True)

# Initialize Telegram Bot
bot = telebot.TeleBot(BOT_TOKEN)

@bot.message_handler(commands=['start'])
def send_welcome(message):
    """Answer the /start command with a short greeting.

    Args:
        message: The incoming Telegram message that triggered the handler.
    """
    greeting = "👋 Hello! Send me a video, and I'll process it."
    bot.reply_to(message, greeting)

@bot.message_handler(content_types=['video'])
def handle_video(message):
    """Handle an incoming video: download it, convert it, generate MIDI, send it back.

    The work folders are wiped first, the video is saved as
    ``original_video.mp4`` and passed through ``convert_video_to_360p`` twice
    (``original_video.mp4`` -> ``test_raw.mp4`` -> ``test.mp4``), then
    ``generate_music_from_video`` is run and the resulting MIDI file is sent to
    the chat. Any exception is reported back to the user and its traceback is
    printed; nothing is re-raised.

    Args:
        message: The incoming Telegram message carrying the video.
    """
    try:
        # Wipe leftovers from any previous run before touching the new video.
        clear_videos_folder()
        clear_inference_folder()
        clear_processing_folders()
        print("✅ Cleared all necessary folders.")

        bot.reply_to(message, "📥 Received your video. Downloading now...")

        # Fetch the raw bytes of the video from Telegram.
        remote_file = bot.get_file(message.video.file_id)
        video_bytes = bot.download_file(remote_file.file_path)

        # Persist the upload untouched.
        raw_upload_path = os.path.join(VIDEOS_DIR, 'original_video.mp4')
        with open(raw_upload_path, 'wb') as sink:
            sink.write(video_bytes)

        # Two conversion passes: upload -> test_raw.mp4 -> test.mp4.
        first_pass_path = os.path.join(VIDEOS_DIR, 'test_raw.mp4')
        final_video_path = os.path.join(VIDEOS_DIR, 'test.mp4')
        convert_video_to_360p(raw_upload_path, first_pass_path)
        convert_video_to_360p(first_pass_path, final_video_path)

        bot.reply_to(message, "✅ Video processed successfully! Now generating music...")

        # Run the model pipeline on the converted video.
        generated_midi = generate_music_from_video(final_video_path)

        bot.reply_to(message, "🎶 MIDI file has been generated! Sending it now...")

        # Deliver the MIDI file as a document in the same chat.
        with open(generated_midi, 'rb') as midi_stream:
            bot.send_document(message.chat.id, midi_stream)

        bot.reply_to(message, "✅ MIDI file sent successfully!")

    except Exception as err:
        # Tell the user first, then log the details in the notebook output.
        bot.reply_to(message, f"❌ Error: {err}")
        print("Error details:")
        traceback.print_exc()

def clear_videos_folder():
    """Empty VIDEOS_DIR (files, symlinks and sub-directories) but keep the folder.

    Failures on individual entries are printed and skipped, never raised.
    """
    if not os.path.exists(VIDEOS_DIR):
        return

    for entry in os.listdir(VIDEOS_DIR):
        target = os.path.join(VIDEOS_DIR, entry)
        try:
            if os.path.isfile(target) or os.path.islink(target):
                # Regular file or symlink: unlink it.
                os.remove(target)
            elif os.path.isdir(target):
                # Real directory: remove it together with its contents.
                shutil.rmtree(target)
        except Exception as err:
            print(f"Error clearing {target}: {err}")

def clear_inference_folder():
    """Delete the files and symlinks directly inside INFERENCE_DIR.

    Sub-directories are left untouched. Failures on individual entries are
    printed and skipped, never raised.
    """
    if not os.path.exists(INFERENCE_DIR):
        return

    for entry in os.listdir(INFERENCE_DIR):
        target = os.path.join(INFERENCE_DIR, entry)
        try:
            if not (os.path.isfile(target) or os.path.islink(target)):
                continue  # directories are deliberately kept
            os.remove(target)
        except Exception as err:
            print(f"Error clearing {target}: {err}")

def clear_processing_folders():
    """Delete the intermediate output folders under VIDEO2NPZ_DIR.

    Each folder is removed recursively if present; a missing folder or a
    failed removal is only reported on stdout.
    """
    for folder_name in ("VisBeatAssets", "fig", "flow", "image", "optical_flow"):
        folder_path = os.path.join(VIDEO2NPZ_DIR, folder_name)

        if not os.path.exists(folder_path):
            print(f"Skipping {folder_path}, does not exist.")
            continue

        try:
            shutil.rmtree(folder_path)
            print(f"Removed {folder_path}")
        except Exception as err:
            print(f"Error removing {folder_path}: {err}")

def convert_video_to_360p(input_video_path, output_video_path):
    """Re-encode a video at a height of 360 pixels using ffmpeg.

    Args:
        input_video_path: Path of the source video.
        output_video_path: Path the converted video is written to; an existing
            file at this path is overwritten.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
        FileNotFoundError: if the ffmpeg executable cannot be found.
    """
    command = [
        'ffmpeg',
        # Overwrite without asking; otherwise ffmpeg waits on an interactive
        # prompt when the output file already exists.
        '-y',
        '-i', input_video_path,
        '-strict', '-2',
        # -2 (instead of -1) keeps the aspect ratio but rounds the width to an
        # even number; odd widths are rejected by common H.264 encoder setups.
        '-vf', 'scale=-2:360',
        output_video_path,
    ]
    try:
        print(f"Converting {input_video_path} to 360p...")
        subprocess.run(command, check=True)
        print(f"✅ Saved 360p video to {output_video_path}")
    except Exception as e:
        print(f"Error during conversion: {e}")
        raise  # bare raise keeps the original traceback intact

def generate_music_from_video(converted_video_path):
    """Run the video-to-MIDI pipeline on a video and return the MIDI path.

    The scripts run in this order, each as a checked subprocess:
      1. ``optical_flow.py`` (farneback method)
      2. ``video2metadata.py``
      3. ``metadata2numpy_mix.py`` (output directory: INFERENCE_DIR)
      4. ``gen_midi_conditional.py`` (one sample, ``-n 1``)

    Each subprocess gets its working directory through ``cwd=`` so the
    notebook's own working directory is not changed.

    Args:
        converted_video_path: Path of the video to process.

    Returns:
        Path of the generated MIDI file inside INFERENCE_DIR.

    Raises:
        subprocess.CalledProcessError: if any pipeline step exits non-zero.
        FileNotFoundError: if the expected MIDI file does not exist afterwards.
    """
    # Absolute path: the subprocesses run with a different working directory.
    video_path = os.path.abspath(converted_video_path)
    src_dir = os.path.join(PROJECT_ROOT, 'src')

    # NOTE(review): assumes metadata2numpy_mix.py names the .npz after the
    # video's base name (for the current caller's 'test.mp4' this is
    # 'test.npz', as before) — confirm against that script.
    video_stem = os.path.splitext(os.path.basename(video_path))[0]
    npz_path = os.path.join(INFERENCE_DIR, f'{video_stem}.npz')
    # The generated MIDI is looked up at "<npz path>_0.mid".
    midi_file_path = f'{npz_path}_0.mid'

    try:
        # Step 1: Extract optical flow
        subprocess.run(['python', 'optical_flow.py',
                        '--video', video_path,
                        '--method', 'farneback'],
                       cwd=VIDEO2NPZ_DIR, check=True)

        # Step 2: Generate metadata, then convert it to the npz model input
        subprocess.run(['python3', os.path.join(VIDEO2NPZ_DIR, 'video2metadata.py'),
                        '--video', video_path],
                       cwd=VIDEO2NPZ_DIR, check=True)

        subprocess.run(['python', os.path.join(VIDEO2NPZ_DIR, 'metadata2numpy_mix.py'),
                        '--metadata', os.path.join(VIDEO2NPZ_DIR, 'metadata.json'),
                        '--video', video_path,
                        '--out_dir', INFERENCE_DIR],
                       cwd=VIDEO2NPZ_DIR, check=True)

        # Step 3: Generate MIDI file
        subprocess.run(['python', 'gen_midi_conditional.py',
                        '-f', npz_path,
                        '-c', os.path.join(PROJECT_ROOT, 'exp/loss_8_params.pt'),
                        '-n', '1'],
                       cwd=src_dir, check=True)

        print("✅ MIDI file successfully generated!")

        # Step 4: Return the path of the generated MIDI file
        if not os.path.exists(midi_file_path):
            raise FileNotFoundError("MIDI file was not generated!")
        return midi_file_path

    except Exception as e:
        print(f"Error during music generation: {e}")
        raise  # bare raise keeps the original traceback intact

# Start polling Telegram for updates; placed last in the cell so every handler
# above is already registered.
# NOTE(review): this call presumably blocks until the kernel is interrupted, so
# cells below it would not run under "Run all" — confirm.
bot.polling(none_stop=True)


In [None]:
# Run MIDI generation manually from src/ on inference/test.npz with the
# downloaded checkpoint, producing one sample (-n 1).
os.chdir("/content/video-bgm-generation/src/")
!python gen_midi_conditional.py -f "../inference/test.npz" -c "../exp/loss_8_params.pt" -n 1

Download checkpoint and soundfont



In [None]:
# Copy the model checkpoint into exp/ and the soundfont into the repository
# root (the same two downloads as in the environment setup cell).
!gsutil -m cp gs://cmt/loss_8_params.pt /content/video-bgm-generation/exp/
!gsutil -m cp gs://magentadata/soundfonts/SGM-v2.01-Sal-Guit-Bass-V1.3.sf2 /content/video-bgm-generation/

Install dependencies

In [None]:
!apt-get update && apt-get install libfluidsynth1 build-essential libasound2-dev libjack-dev fluidsynth

In [None]:
import os

In [None]:
!pip install --upgrade pip
# This may take ~15 minutes.
!pip install pytorch-fast-transformers==0.4.0
# Note: choosing the pytorch-fast-transformers version is tricky. Depending on the
# GPU Colab assigns you, the working version could be 0.3.0, 0.4.0 or another one.
# An incorrect fast-transformers version can lead to errors or to awful results for
# unknown reasons, so try different versions or refer to
# https://github.com/idiap/fast-transformers

# py3_requirements.txt is resolved relative to the current working directory.
!pip install -r py3_requirements.txt
# Install the bundled visbeat3 package; the working directory stays at visbeat3/.
os.chdir("/content/video-bgm-generation/src/video2npz/visbeat3/")
!python setup.py install

# 2. Process input video

Upload your video

It is recommended to use videos **less than 2 minutes**, otherwise it gets really slow

In [None]:
# Upload exactly one video and store it as videos/test_raw.mp4.
os.chdir("/content/video-bgm-generation/")
uploaded = files.upload()
assert len(uploaded) == 1, "upload one video file only"
filename = list(uploaded.keys())[0]
# Move the file with os.replace instead of a shell `mv`: file names containing
# spaces or shell metacharacters are handled correctly, and a failed move
# raises instead of being silently ignored.
os.makedirs('videos', exist_ok=True)
os.replace(filename, 'videos/test_raw.mp4')

Convert to 360p to speed up extracting optical flow and visbeats

In [None]:
os.chdir("/content/video-bgm-generation/videos/")
!rm test.mp4
!ffmpeg -i test_raw.mp4 -strict -2 -vf scale=-1:360 test.mp4

Extract optical flow and visbeats, then convert the video into an npz file

In [None]:
# Switch to the preprocessing directory and delete the intermediate output
# folders of a previous run (rm reports an error for folders that do not exist).
os.chdir("/content/video-bgm-generation/src/video2npz/")
!rm -r VisBeatAssets/ fig/ flow/ image/ optical_flow/

# Extracting optical flow and visbeats may be slow.

In [None]:
# NOTE(review): these are PyPI packages named `ffmpeg` and `skvideo`; confirm
# they are the distributions the preprocessing scripts actually need.
!pip install ffmpeg
!pip install skvideo


In [None]:
# Run optical_flow.py (resolved in the current working directory) on the
# converted video using the farneback method.
!python optical_flow.py --video '/content/video-bgm-generation/videos/test.mp4' --method 'farneback'


In [None]:
# Pin moviepy to 1.0.3 (same pin as in the environment setup cell).
!pip install moviepy==1.0.3


In [None]:
# Make the preprocessing directory the working directory for the cells below.
os.chdir("/content/video-bgm-generation/src/video2npz/")

In [None]:
# NOTE(review): this installs numpy 1.24, while the environment setup cell pins
# numpy==1.23.5 — whichever cell runs last wins; confirm the intended version.
!pip install numpy==1.24


In [None]:
import os
# Set XDG_RUNTIME_DIR for this kernel process and the commands it launches
# (same value as in the environment setup cell).
os.environ['XDG_RUNTIME_DIR'] = "/tmp/runtime"


In [None]:
import os
# Audio-related environment variables (same values as in the environment setup cell).
# NOTE(review): the "dummy" SDL driver presumably avoids needing a real sound
# device on Colab — confirm.
os.environ["ALSA_CARD"] = "hw:0"
os.environ["SDL_AUDIODRIVER"] = "dummy"


In [None]:
# Install the libsndfile1 system package without prompting (-y).
!apt-get install -y libsndfile1


In [None]:
# Run video2metadata.py on the converted video; a later cell reads the result
# from src/video2npz/metadata.json.
!python3 /content/video-bgm-generation/src/video2npz/video2metadata.py --video /content/video-bgm-generation/videos/test.mp4


In [None]:
# Convert metadata.json plus the video into the model input, written under
# inference/ (the generation cell reads inference/test.npz).
!python /content/video-bgm-generation/src/video2npz/metadata2numpy_mix.py --metadata /content/video-bgm-generation/src/video2npz/metadata.json --video /content/video-bgm-generation/videos/test.mp4 --out_dir /content/video-bgm-generation/inference


In [None]:
# Install muspy (also installed by earlier cells in this notebook).
!pip install muspy


# 3. Run the model to generate background music

Run inference to generate MIDI (.mid) output

In [None]:
# Generate one MIDI sample (-n 1) from inference/test.npz with the downloaded
# checkpoint; the relative paths are resolved from src/.
os.chdir("/content/video-bgm-generation/src/")
!python gen_midi_conditional.py -f "../inference/test.npz" -c "../exp/loss_8_params.pt" -n 1

In [None]:
from google.colab import files


Convert midi into audio: use **GarageBand (recommended)** or midi2audio

Remember to **set tempo to the value of tempo in video2npz/metadata.json**

In [None]:
from google.colab import files
import os
import json

# Work from src/ so the relative paths below resolve.
os.chdir("/content/video-bgm-generation/src/")

# Send the generated MIDI file to the browser.
files.download('../inference/test.npz_0.mid')

# Read the tempo recorded in the metadata and show it.
with open("video2npz/metadata.json") as f:
    tempo = json.load(f)['tempo']
print("tempo:", tempo)


In [None]:
# Refresh the package index, then install the fluidsynth system package
# without prompting (-y).
!apt-get update
!apt-get install -y fluidsynth

In [None]:
# Install the pyfluidsynth Python package.
!pip install pyfluidsynth

In [None]:
# Reinstall pyfluidsynth even if it is already installed (--force-reinstall).
!pip install --force-reinstall pyfluidsynth

In [None]:
# The pyfluidsynth distribution installs its module under the name
# `fluidsynth`; `import pyfluidsynth` raises ModuleNotFoundError.
import fluidsynth
from importlib.metadata import version

# Query the installed distribution's version from package metadata rather than
# relying on a module-level __version__ attribute.
print(version("pyfluidsynth"))  # This should print the version if installed correctly

Generate audio with midi2audio

Instead of running this cell, we recommend using GarageBand or other software, since their soundfonts are better. But this also works fine.

In [None]:
import json
import os
import wave

import note_seq
import numpy as np
from pretty_midi import PrettyMIDI

SAMPLE_RATE = 16000
SF2_PATH = '/content/video-bgm-generation/SGM-v2.01-Sal-Guit-Bass-V1.3.sf2'
METADATA_PATH = '/content/video-bgm-generation/src/video2npz/metadata.json'
os.chdir("/content/video-bgm-generation/inference/")

# Load MIDI file
input_mid = 'test.npz_0.mid'
midi_obj = PrettyMIDI(input_mid)

# Convert tempo: use the tempo stored in metadata.json when it is available,
# otherwise fall back to 120 (which leaves the timing unchanged).
tempo = 120
if os.path.exists(METADATA_PATH):
    with open(METADATA_PATH) as metadata_file:
        tempo = json.load(metadata_file)['tempo']
midi_length = midi_obj.get_end_time()
midi_obj.adjust_times([0, midi_length], [0, midi_length * 120 / tempo])
processed_mid = os.path.splitext(input_mid)[0] + "_processed.mid"
midi_obj.write(processed_mid)

# Convert MIDI to audio.
# pyfluidsynth's Synth has no midi_to_audio method and starting an "alsa"
# driver needs a real sound device, so the tempo-adjusted MIDI is rendered
# offline with PrettyMIDI.fluidsynth (which uses the installed pyfluidsynth).
samples = midi_obj.fluidsynth(fs=SAMPLE_RATE, sf2_path=SF2_PATH)

# Write the samples as 16-bit little-endian mono PCM.
pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype('<i2')
with wave.open("music.wav", "wb") as wav_file:
    wav_file.setnchannels(1)
    wav_file.setsampwidth(2)
    wav_file.setframerate(SAMPLE_RATE)
    wav_file.writeframes(pcm.tobytes())

print("Audio generated as 'music.wav'")


Combine original video and audio into video with BGM

Generate/upload the audio file under `inference`, name it as `music.mp3`, and run this to combine video and music

In [None]:
# Combine the video stream of ../videos/test_raw.mp4 with the audio stream of
# music.mp3 (video copied as-is, audio encoded as AAC) into output.mp4, then
# download the result. music.mp3 must already exist in this directory.
os.chdir("/content/video-bgm-generation/inference/")
# Remove the previous output first (rm reports an error if there is none).
!rm output.mp4
!ffmpeg -i ../videos/test_raw.mp4 -i music.mp3 -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 output.mp4
files.download('output.mp4')