In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the raw tracks table. low_memory=False makes pandas parse the file in a
# single pass instead of in chunks, so each column gets one consistent dtype.
df = pd.read_csv('csv_files/tracks.csv', low_memory=False)

In [None]:
# The first data row carries the real column labels; grab it before slicing.
# NOTE: this cell is not idempotent -- running it again on the already-cleaned
# frame would promote an actual data row to the header.
header_row = df.iloc[0]

# Drop the first two rows (the label row and the row after it) and renumber.
df = df.iloc[2:].reset_index(drop=True)

# Install the labels; any label that is missing (NaN) is named 'track_id'.
df.columns = header_row
df.columns = df.columns.fillna('track_id')

In [None]:
# Preview the first rows to confirm the promoted header looks right.
df.head()

In [None]:
# Keep only the tracks whose 'subset' value is 'small'.
# .copy() makes the result an independent frame, so the later column
# assignment (df['path_to_audio'] = ...) does not write into a filtered slice
# and trigger pandas' SettingWithCopyWarning.
df = df[df['subset'] == 'small'].copy()
df

In [None]:
def transform_path(val):
    """Turn a numeric track id into a relative mp3 path.

    The value is converted to ``str``. If that string consists only of digits
    it is left-padded with zeros to at least six characters and returned as
    ``"<first three chars>/<padded id>.mp3"``. Any other value is returned
    as its string form, unchanged.
    """
    text = str(val)
    if not text.isdigit():
        return text
    padded = text.zfill(6)
    return f"{padded[:3]}/{padded}.mp3"

# Derive each track's relative audio path from its id via transform_path.
df['path_to_audio'] = df['track_id'].apply(transform_path)
# NOTE(review): this expression is not the last statement of the cell, so its
# value is neither displayed nor stored.
df['path_to_audio'].unique()

# Seed NumPy's global RNG so the random segments and noise levels drawn further
# down in this cell are identical on every run, making the exported splits
# reproducible.
SEED = 42
np.random.seed(SEED)

# Probability of picking each noise level; np.random.choice requires that
# these sum to 1.
prob_no_noise = 0.6
prob_small_noise = 0.2
prob_medium_noise = 0.2

# Noise ratio value stored for each noise level name.
noise_ratios = {
    'none': 0.0,
    'small': 0.01,
    'medium': 0.05
}

def augment_row_with_random_segments(row, sample_rate=16000, durations=(2, 5, 10), n_extra=2):
    """Expand one row into several copies, each tagged with a random segment.

    The first copy gets a start drawn from 10..20 (inclusive); each of the
    ``n_extra`` further copies gets a start drawn from 5..24 (inclusive).
    Every copy gets a length drawn uniformly from ``durations``. Both values
    are multiplied by ``sample_rate`` and stored under ``'start'`` and
    ``'end'`` (``end = start + length``).

    Parameters
    ----------
    row : pandas.Series
        Source row; it is copied, never modified.
    sample_rate : int, optional
        Multiplier applied to the drawn start/length values. The default 16000
        presumably is the audio sampling rate in Hz (so the draws are seconds
        and the stored values are sample offsets) -- confirm against the
        audio loader.
    durations : sequence of int, optional
        Candidate segment lengths, in the same unit as the drawn starts.
    n_extra : int, optional
        Number of additional copies after the first one.

    Returns
    -------
    list
        ``1 + n_extra`` copies of ``row`` with ``'start'`` and ``'end'`` set.
    """
    # (low, high) bounds for np.random.randint; ``high`` is exclusive.
    start_bounds = [(10, 21)] + [(5, 25)] * n_extra
    duration_choices = list(durations)

    segments = []
    for low, high in start_bounds:
        segment = row.copy()
        # Draw the start first, then the length: this keeps the sequence of
        # RNG calls the same for every copy, so seeded runs are repeatable.
        start_units = np.random.randint(low, high)
        length_units = np.random.choice(duration_choices)
        segment['start'] = start_units * sample_rate
        segment['end'] = start_units * sample_rate + length_units * sample_rate
        segments.append(segment)

    return segments

# Each source row yields a list of augmented copies; explode() flattens those
# lists so every copy becomes its own entry.
segments_per_track = df.apply(augment_row_with_random_segments, axis=1)
augmented_rows = segments_per_track.explode().tolist()

# Rebuild a flat frame with one record per augmented copy.
records = [segment.to_dict() for segment in augmented_rows]
df = pd.DataFrame(records)

def assign_noise_ratio():
    """Randomly pick a noise level and return its ratio.

    The level name is drawn with ``np.random.choice`` using the module-level
    probabilities ``prob_no_noise``, ``prob_small_noise`` and
    ``prob_medium_noise``; the matching value is looked up in ``noise_ratios``.
    """
    levels = ['none', 'small', 'medium']
    weights = [prob_no_noise, prob_small_noise, prob_medium_noise]
    picked = np.random.choice(levels, p=weights)
    return noise_ratios[picked]

# Draw one noise ratio per row, in row order (one assign_noise_ratio call each).
df['noise_ratio'] = [assign_noise_ratio() for _ in range(len(df))]

In [None]:
# Preview the augmented frame, including the new start/end/noise_ratio columns.
df.head(10)

In [None]:
# Partition the rows by the value of their 'split' column.
train_df, val_df, test_df = (
    df[df['split'] == split_name]
    for split_name in ('training', 'validation', 'test')
)

In [None]:
# List the distinct audio paths present in the training split.
train_df['path_to_audio'].unique()

In [None]:
# Display the validation split for a visual check.
val_df

In [None]:
# Display the test split for a visual check.
test_df

In [None]:
# Show the column labels available before trimming to the export columns.
train_df.columns

In [None]:
# Show the audio path column of the training split.
train_df['path_to_audio']

In [None]:
# Columns written to the exported CSVs; every other column is dropped from
# each split. (A stray `train_df[columns_to_keep]` expression whose result was
# neither displayed nor stored has been removed.)
columns_to_keep = ['track_id', 'path_to_audio', 'genre_top', 'noise_ratio', 'start', 'end']
train_df = train_df[columns_to_keep]
val_df = val_df[columns_to_keep]
test_df = test_df[columns_to_keep]

In [None]:
# Write each split to its own CSV, without the index column.
split_outputs = {
    'csv_files/train.csv': train_df,
    'csv_files/val.csv': val_df,
    'csv_files/test.csv': test_df,
}
for out_path, split_df in split_outputs.items():
    split_df.to_csv(out_path, index=False)