<h1><b>COMP 432 - Machine Learning</b> | <b>Final Project</b> (W2025)</h1>
<h3>Machine Learning Model to detect Copyright Infringement in Music</h3>
Written by Sisahga Phimmasone - 40210015<br>
April 20th, 2025


<h3><b>1. Data Preparation and Cleaning</b></h3>

The code cells below download the audio for each pair of songs from the list of historically known copyright infringement cases, converting each song's YouTube video link to an MP3 file.

https://en.wikipedia.org/wiki/List_of_songs_subject_to_plagiarism_disputes

See the 'copyright_cases.csv' file for the song pairs and their YouTube links.
The yt_dlp library is used to download each YouTube video URL and convert it to .mp3.

<h4><b>1.1 Convert YouTube URLs to MP3 with yt_dlp</b></h4>

In [None]:
import yt_dlp
import pandas as pd
import os
import re

In [None]:
# Directory that receives the downloaded MP3 files; exist_ok=True makes re-running this cell harmless
mp3_song_dir = "songs_mp3"
os.makedirs(mp3_song_dir, exist_ok=True)
# Table of known infringement cases; the download loop reads the song, artist and YouTube link columns from it
df = pd.read_csv("copyright_cases.csv")

In [None]:
# yt-dlp configuration
def get_opts(output_path):
    """Build the options dictionary handed to ``yt_dlp.YoutubeDL`` for one download.

    :param output_path: Output filename template for the download (stored under 'outtmpl')
    :return dict: A new options dictionary requesting the best audio stream and an MP3 extraction step
    """
    # Post-processing step: extract the audio track as MP3 with preferred quality '192'
    extract_mp3 = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }

    opts = {}
    opts['format'] = 'bestaudio/best'
    opts['outtmpl'] = output_path
    opts['postprocessors'] = [extract_mp3]
    opts['quiet'] = False
    opts['noplaylist'] = True
    return opts

In [None]:
# Turns a song title or artist name into a string that is safe to use in an mp3 filename
def clean_filename(s):
    """Replace spaces with underscores and drop the characters \\ / * ? : " < > |.

    :param s: Raw text (e.g. a song title or an artist name)
    :return str: The cleaned text
    """
    underscored = s.replace(" ", "_")
    cleaned = re.sub(r'[\\/*?:"<>|]', "", underscored)
    return cleaned

In [None]:
# Download loop, record by record: fetch both songs of every disputed pair as MP3.

# (role used in the filename and log messages, title column, artist column, URL column)
# for the two songs of a record.
# NOTE(review): the two link columns differ in capitalisation ("YouTube" vs "Youtube");
# they are kept exactly as before - confirm against the CSV header.
_TRACK_COLUMNS = (
    ("original", 'Original Song', 'Original Artist', 'Original Song YouTube Link'),
    ("second", 'Second Song', 'Second Artist', 'Second Song Youtube Link'),
)

for index, row in df.iterrows():
    # Zero-padded, 1-based ID shared by both files of a record (e.g. "001")
    row_id = f"{index+1:03d}"

    # Clean all names up front so a malformed record fails before anything is downloaded for it
    tracks = [
        (role, clean_filename(row[song_col]), clean_filename(row[artist_col]), row[url_col])
        for role, song_col, artist_col, url_col in _TRACK_COLUMNS
    ]

    for role, song, artist, url in tracks:
        # yt-dlp reads '%' in an output template as the start of a field, so a literal
        # percent sign coming from a title or artist must be doubled. The trailing
        # "%(ext)s" is a real template field and is appended afterwards, unescaped.
        stem = f"{row_id}_{role}_{song}_by_{artist}".replace("%", "%%")
        out_template = os.path.join(mp3_song_dir, stem + ".%(ext)s")

        # Best effort: report the failure and carry on so one bad link does not abort the batch
        try:
            with yt_dlp.YoutubeDL(get_opts(out_template)) as ydl:
                print(f"Downloading {role}: {song} by {artist}")
                ydl.download([url])
        except Exception as e:
            print(f"Failed to download {role} song at {url}: {e}")

In [None]:
# Sanity check: count the MP3 files present in the download directory (should be 176)
mp3_files = list(
    filter(lambda name: name.lower().endswith(".mp3"), os.listdir(mp3_song_dir))
)
print(f"Total MP3 files downloaded: {len(mp3_files)}")

<h4><b>1.2 Create Data Structure for known copyrighted song pairings</b></h4>

In [None]:
import numpy as np
import librosa
import pickle

In [None]:
# Audio preprocessing settings used by the librosa-based cells below
AUDIO_DIR = mp3_song_dir      # directory holding the downloaded MP3 files
SR = 16_000                   # target sample rate, in Hz
DURATION = 10                 # length of one chunk, in seconds
TARGET_LEN = SR * DURATION    # number of samples in one chunk

In [29]:
def preprocess_audio(audio_path):
    """Load an audio file, trim silence at its edges, resample it to SR and cut it into fixed-length chunks.

    :param audio_path: Path to audio file
    :return list: A list of numpy arrays, each holding TARGET_LEN samples (one 10-second chunk of the song);
        the final chunk is zero-padded when the song does not divide evenly.
    """
    # sr=None keeps the file's native sample rate; resampling is done explicitly below
    waveform, native_sr = librosa.load(audio_path, sr=None)

    # Remove leading/trailing silence, using a 20 dB threshold
    waveform, _ = librosa.effects.trim(waveform, top_db=20)

    # Bring the signal to the configured sample rate (16 kHz) if it is not there already
    if native_sr != SR:
        waveform = librosa.resample(waveform, orig_sr=native_sr, target_sr=SR)

    # Walk through the signal in steps of TARGET_LEN samples
    chunks = []
    total_samples = len(waveform)
    start = 0
    while start < total_samples:
        chunk = waveform[start: start + TARGET_LEN]
        shortfall = TARGET_LEN - len(chunk)
        # Only the last chunk can come up short; fill its tail with zeros so all chunks share one length
        if shortfall > 0:
            chunk = np.pad(chunk, (0, shortfall), mode='constant')
        chunks.append(chunk)
        start += TARGET_LEN

    return chunks

In [None]:
def get_pair_id(audio_filename):
    """Extract the three-digit pair ID that prefixes a downloaded song's filename.

    Only the final path component is inspected, so a bare filename and a full path
    to the same file give the same result.

    :param audio_filename: Filename or path of an audio file (e.g. "001_original_Song_by_Artist.mp3")
    :return: pair_id (str): The pair ID (i.e. "001"), or None when the filename does not start
        with exactly three digits followed by an underscore
    """
    # re.match anchors at the start of the string, so drop any directory part first
    base_name = os.path.basename(audio_filename)
    match = re.match(r"(\d{3})_", base_name)
    return match.group(1) if match else None

In [None]:
# Group song mp3 files by pair ID: {pair_id: [filenames sharing that ID]}
pairs = {}
# os.listdir returns entries in arbitrary order; sorting makes the order of `pairs`
# (and of anything built by iterating over it) reproducible between runs and machines
for fname in sorted(os.listdir(mp3_song_dir)):
    # Case-insensitive extension match, consistent with the downloaded-file count check
    if fname.lower().endswith(".mp3"):
        pair_id = get_pair_id(fname)
        # Files without a three-digit ID prefix are ignored
        if pair_id:
            pairs.setdefault(pair_id, []).append(fname)

In [None]:
# Build the dataset from the `pairs` dictionary initialized in the cell above: preprocess both songs
# of every pair, save each pair to its own pickle file, and collect all pairs in `dataset`
dataset = []
preprocessed_output_dir = "preprocessed_pairs"
os.makedirs(preprocessed_output_dir, exist_ok=True)

for pair_id in pairs:
    files = pairs[pair_id]

    # A pair needs both of its songs; incomplete pairs are reported and left out
    if len(files) < 2:
        print(f"Skipping {pair_id} expected 2 files but got {len(files)}")
        continue

    # Alphabetical order puts the "<id>_original_..." file ahead of the "<id>_second_..." file.
    # NOTE(review): if a pair ever has more than two files, only the first two in sorted order
    # are used - confirm that is intended.
    files = sorted(files)
    og_song, copy_song = (os.path.join(mp3_song_dir, name) for name in files[:2])

    print(f"Processing pair {pair_id}...")
    segments1 = preprocess_audio(og_song)
    print(f"Preprocessed original song segments: {len(segments1)}")
    segments2 = preprocess_audio(copy_song)
    print(f"Preprocessed copied song segments: {len(segments2)}")

    # Every pair handled here is a known copyright infringement case, hence a positive label
    label = True

    pair_data = {
        "pair_id": pair_id,
        "original_song_segments": segments1,
        "copied_song_segments": segments2,
        "label": label,
    }

    # One pickle file per pair, named after the pair ID
    pickle_path = os.path.join(preprocessed_output_dir, f"{pair_id}.pkl")
    with open(pickle_path, "wb") as f:
        pickle.dump(pair_data, f)
    print(f"Saved pair {pair_id} to pickle.")

    dataset.append(pair_data)