# Install Dependencies

MusicGen needs a lot of memory. According to Meta, you need about 16 GB of memory to run the medium model (though in my experiments 5–10 GB was enough).

If you run out of memory, try a smaller model or use a machine with more memory.

In [None]:
# Clone the audiocraft_watermark repository, change into it, and install it in editable mode.
!git clone https://github.com/ObsisMc/audiocraft_watermark
%cd audiocraft_watermark
!pip install -e .

Cloning into 'audiocraft_watermark'...
remote: Enumerating objects: 1179, done.[K
remote: Counting objects: 100% (1179/1179), done.[K
remote: Compressing objects: 100% (493/493), done.[K
remote: Total 1179 (delta 670), reused 1179 (delta 670), pack-reused 0[K
Receiving objects: 100% (1179/1179), 1.43 MiB | 19.54 MiB/s, done.
Resolving deltas: 100% (670/670), done.
/content/audiocraft_watermark
Obtaining file:///content/audiocraft_watermark
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting av (from audiocraft==1.3.0a1)
  Downloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops (from audiocraft==1.3.0a1)
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flas

In [None]:
from audiocraft.models import MusicGen
import soundfile as sf
import torch

from watermark.watermark_processor import WatermarkAudioDetector

from IPython.display import Audio

# Define necessary functions

In [None]:
# read and load audio
def read_wav(path):
    """Load an audio file from disk with ``soundfile``.

    Args:
        path: Location of the audio file; forwarded unchanged to ``sf.read``.

    Returns:
        The result of ``sf.read(path)``. Callers in this notebook unpack it
        as ``(audio, sample_rate)``.
    """
    return sf.read(path)

def save_wav(audio, sr, name):
    """Write ``audio`` to the file ``name`` at sample rate ``sr`` via ``soundfile``.

    The input is normalised to a two-dimensional array before writing:
    a 3-D input keeps only its first entry, a 1-D input gets a trailing
    axis of length one, and a 2-D array whose second axis is longer than 3
    is transposed (it is treated as channels-first).

    Args:
        audio: A ``torch.Tensor`` or an array-like object exposing ``.shape``.
        sr: Sample rate passed to ``sf.write``.
        name: Output file name passed to ``sf.write``.
    """
    # Tensors are detached and moved to host memory; anything else is used as-is.
    data = audio.detach().cpu().numpy() if isinstance(audio, torch.Tensor) else audio

    rank = len(data.shape)
    if rank == 3:
        # (B, ...): keep only the first item of the batch.
        data = data[0]
    elif rank == 1:
        # (T,) -> (T, 1)
        data = data[..., None]

    # A second axis longer than 3 is taken to be time, i.e. the layout is (C, T).
    if data.shape[1] > 3:
        data = data.T

    sf.write(name, data, sr)

In [None]:
# init model
def init_model(top_k=250, duration=30, watermark_model=False,
               model_name='facebook/musicgen-small'):
    """Load a pretrained MusicGen model and configure its generation parameters.

    Args:
        top_k: Forwarded to ``set_generation_params`` as ``top_k``.
        duration: Forwarded to ``set_generation_params`` as ``duration``
            (the notebook uses it as a length in seconds).
        watermark_model: Forwarded to ``set_generation_params`` as
            ``watermark_mode``; the notebook passes ``True`` to generate
            watermarked audio.
        model_name: Checkpoint name given to ``MusicGen.get_pretrained``.
            Defaults to the small model to reduce the risk of running out
            of memory.

    Returns:
        The configured MusicGen model.
    """
    model = MusicGen.get_pretrained(model_name)

    model.set_generation_params(
        use_sampling=True,
        top_k=top_k,
        duration=duration,
        watermark_mode=watermark_model,
    )

    quantizer = model.compression_model.quantizer
    print(f"Number of codebooks: {quantizer.total_codebooks}",
          f"Size of codebooks {quantizer.bins}")
    return model


# init watermark detector
def init_detector(model):
    """Build a ``WatermarkAudioDetector`` configured from ``model``.

    The watermark settings are read from the model's language model
    (``model.lm``) so that detection uses the same settings as generation.

    Args:
        model: Object exposing ``lm``, ``compression_model`` and ``device``.

    Returns:
        The constructed ``WatermarkAudioDetector``.
    """
    lm = model.lm

    detector = WatermarkAudioDetector(
        vocab=list(range(lm.vocab_size)),
        gamma=lm.gamma_wm,  # should match original setting
        seeding_scheme=lm.seeding_scheme_wm,  # should match original setting
        device=model.device,  # must match the original rng device type
        z_threshold=4.0,
        compression_model=model.compression_model,
        layer_wm=lm.layer_wm,
        # ignore_repeated_ngrams=True
    )
    return detector

In [None]:
# generate and detect functions
def generate_audio(model, descriptions=None):
    """Generate audio from text prompts with ``model``.

    Args:
        model: Object exposing ``generate(descriptions=..., progress=...,
            return_tokens=...)``.
        descriptions: A prompt string or a list of prompt strings. When
            omitted, a single blues-guitar prompt is used. Other prompts to
            try:
              - '80s pop track with bassy drums and synth'
              - '90s rock song with loud guitars and heavy drums'
              - 'Progressive rock drum and bass solo'
              - 'Punk Rock song with loud drum and power guitar'
              - 'Jazz Funk song with slap bass and powerful saxophone'
              - 'drum and bass beat with intense percussions'

    Returns:
        The first element of the value returned by ``model.generate``.
    """
    if descriptions is None:
        descriptions = [
            'Bluesy guitar instrumental with soulful licks and a driving rhythm section',
        ]
    elif isinstance(descriptions, str):
        # A bare string would otherwise be iterated character by character.
        descriptions = [descriptions]

    output = model.generate(
        descriptions=descriptions,
        progress=True,
        return_tokens=True,
    )

    return output[0]


def detect_audio(model, audio_path=None, audio=None):
    """Run watermark detection on an audio file or an in-memory tensor.

    Exactly one of ``audio_path`` and ``audio`` must be given.

    Args:
        model: Model handed to ``init_detector``; its ``device`` is also used
            when loading audio from a file.
        audio_path: Path of an audio file to load with ``read_wav``. The
            loaded samples get a leading axis added, so the file is expected
            to hold single-channel audio.
        audio: Audio tensor. A 3-D input is assumed to be (B, C, T) and only
            its first item is used; any other input is passed on unchanged.

    Returns:
        A tuple ``(audio, score_dict)``: the tensor given to the detector and
        the dictionary returned by ``detector.detect``.

    Raises:
        ValueError: If both or neither of ``audio_path`` and ``audio`` are given.
    """
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if (audio_path is None) == (audio is None):
        raise ValueError("Provide exactly one of `audio_path` or `audio`.")

    detector = init_detector(model)

    # Branch on `is not None` so it agrees with the validation above; a falsy
    # path such as "" must not fall through to the tensor branch.
    if audio_path is not None:
        samples, _sample_rate = read_wav(audio_path)  # only 1 channel audio
        audio = torch.tensor(samples[None, ...], dtype=torch.float32).to(model.device)
    elif len(audio.shape) == 3:
        # suppose audio is (B, C, T): keep the first audio in the batch
        audio = audio[0]

    score_dict = detector.detect(audio)

    print()
    info_print = '\n'.join(str(k) + ':' + str(v) for k, v in score_dict.items())
    print(f"Detection results:\n{'='*20}\n",
          f"{info_print}",
          f"\n{'='*20}")

    return audio, score_dict

# Generate Audio without and with watermark

## Audio without watermark

In [None]:
# generates audio without watermark
duration = 15
model = init_model(duration=duration, watermark_model=False)  # without watermark
audio = generate_audio(model)
audio, score_dict = detect_audio(model, audio=audio)
# name the file from the local `duration`, the same way the watermarked cell does
save_wav(audio, 32000, f"audio_generated_{duration}s.wav")  # let's save it; MusicGen uses a 32 kHz sample rate

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


state_dict.bin:   0%|          | 0.00/841M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

compression_state_dict.bin:   0%|          | 0.00/236M [00:00<?, ?B/s]



Number of codebooks: 4 Size of codebooks 2048

Detection results:
 num_tokens_scored:733
num_green_tokens:365
green_fraction:0.4979536152796726
z_score:-0.11080752827296766
p_value:0.5441155122897663
prediction:False 


In the `Detection results` printed above, `prediction` is `False`, which means no watermark was detected. For details on the other outputs, such as `num_tokens_scored`, please read the paper and the official code.

In [None]:
# let's hear it
# audio[0] is already 1-D here, so `.T` would be a no-op that only triggers
# PyTorch's deprecation warning for `.T` on non-2-D tensors; it is omitted.
audio_np = audio[0].detach().cpu().numpy()
Audio(audio_np, rate=32000)

  audio_np = audio[0].T.detach().cpu().numpy()


## Audio with watermark

In [None]:
# generates audio with watermark; almost the same as the code that generates audio without a watermark, except for the argument passed to init_model()
duration = 15
model = init_model(duration=duration, watermark_model=True)  # with watermark
audio = generate_audio(model)
audio, score_dict = detect_audio(model, audio=audio)
save_wav(audio, 32000, f"audio_watermarked_{duration}s.wav")  # MusicGen uses a 32 kHz sample rate



Number of codebooks: 4 Size of codebooks 2048

Detection results:
 num_tokens_scored:733
num_green_tokens:493
green_fraction:0.6725784447476125
z_score:9.34476821768694
p_value:4.6045476874307725e-21
prediction:True
confidence:1.0 


In [None]:
# let's hear it
# audio[0] is already 1-D here, so `.T` would be a no-op that only triggers
# PyTorch's deprecation warning for `.T` on non-2-D tensors; it is omitted.
audio_np = audio[0].detach().cpu().numpy()
Audio(audio_np, rate=32000)