In [1]:
import librosa
import pandas as pd
pd.set_option('display.max_rows', 10000)
import sys
sys.path.append("../../")
import os

from functions import extract_features


## Get the data
+ [Download the GTZAN genre collection dataset](http://opihi.cs.uvic.ca/sound/genres.tar.gz) (Approximately 1.2GB)

+ If needed, adapt the paths to your data structure. 

In [2]:
archive_path = '../../data/01/genres.tar.gz'
input_folder = '../../data/01/genres'
output_folder='../../data/01/splits'

if not os.path.isdir(input_folder):
    !tar -zxf $archive_path -C '../../data/01'


### Splitting data into train / val / test folders

In [3]:
from tqdm import tqdm
from glob import glob
import splitfolders


splitfolders.ratio(
    input=input_folder,
    output=output_folder,
    seed=1234, 
    ratio=(.8, .1, .1)
)

Copying files: 1000 files [00:02, 363.65 files/s]


In [4]:
duration_ratio = 30
step_ratio = 30

for split in os.listdir(output_folder):
    print(f"[INFO] Start processing {split} folder...")
    print(f"#########################################")
    df = pd.DataFrame()

    genres = os.listdir(os.path.join(output_folder, split))

    for genre in genres:
        print(f"[INFO] Start processing {genre} files...")
        # get all files of each set
        files = glob(os.path.join(output_folder, split, genre, '**'))
        for file in tqdm(files):
            tmp_datas = []
            data, sr = librosa.load(file)
            # split each file in chunks of {duration_ratio} with a step of {step_ratio}
            for offset in range(0, len(data), sr * step_ratio):
                start = offset
                end = offset + sr * duration_ratio
                chunk = data[start:end]
                if(len(chunk) == sr * duration_ratio):
                    tmp_datas.append(chunk)
            for i, y in enumerate(tmp_datas):
                # extract features from each sample
                data = extract_features(y, sr)
                filename = f"{os.path.basename(file).strip('.wav')}_{i}.wav"
                data.update({
                    "filename": filename,
                    "genre": genre,
                })
                # add features to the dataframe
                df = df.append(data, True)
    dataframe_folder = f'../../data/02/{duration_ratio}-{step_ratio}'
    if not os.path.isdir(dataframe_folder):
        os.makedirs(dataframe_folder)
    # save the dataframe
    df.to_csv(f'{dataframe_folder}/{duration_ratio}-{step_ratio}_{split}_df.csv')


[INFO] Start processing train folder...
#########################################
[INFO] Start processing jazz files...


100%|██████████| 80/80 [05:59<00:00,  4.50s/it]


[INFO] Start processing disco files...


100%|██████████| 80/80 [06:02<00:00,  4.53s/it]


[INFO] Start processing rock files...


100%|██████████| 80/80 [06:05<00:00,  4.57s/it]


[INFO] Start processing country files...


100%|██████████| 80/80 [06:00<00:00,  4.51s/it]


[INFO] Start processing classical files...


100%|██████████| 80/80 [06:02<00:00,  4.53s/it]


[INFO] Start processing metal files...


100%|██████████| 80/80 [06:12<00:00,  4.66s/it]


[INFO] Start processing reggae files...


100%|██████████| 80/80 [05:58<00:00,  4.48s/it]


[INFO] Start processing hiphop files...


 92%|█████████▎| 74/80 [05:33<00:27,  4.60s/it]