In [1]:
import os
import pandas as pd
import librosa
import numpy as np
import time

# Set the input directory
input_dir = "../Dataset/MP3-3seconds"
output_folder = "../GeneratedData/3secondsData"

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Function to extract audio features
def extract_features(file_path):
    y, sr = librosa.load(file_path)
    features = {
        "length": librosa.get_duration(y=y, sr=sr),
        "chroma_stft_mean": np.mean(librosa.feature.chroma_stft(y=y, sr=sr)),
        "chroma_stft_var": np.var(librosa.feature.chroma_stft(y=y, sr=sr)),
        "rms_mean": np.mean(librosa.feature.rms(y=y)),
        "rms_var": np.var(librosa.feature.rms(y=y)),
        "spectral_centroid_mean": np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
        "spectral_centroid_var": np.var(librosa.feature.spectral_centroid(y=y, sr=sr)),
        "spectral_bandwidth_mean": np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr)),
        "spectral_bandwidth_var": np.var(librosa.feature.spectral_bandwidth(y=y, sr=sr)),
        "rolloff_mean": np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr)),
        "rolloff_var": np.var(librosa.feature.spectral_rolloff(y=y, sr=sr)),
        "zero_crossing_rate_mean": np.mean(librosa.feature.zero_crossing_rate(y)),
        "zero_crossing_rate_var": np.var(librosa.feature.zero_crossing_rate(y)),
        "harmony_mean": np.mean(librosa.effects.harmonic(y)),
        "harmony_var": np.var(librosa.effects.harmonic(y)),
        "percussive_mean": np.mean(librosa.effects.percussive(y)),
        "percussive_var": np.var(librosa.effects.percussive(y)),
        "tempo": librosa.beat.tempo(y=y, sr=sr)[0]
    }
    
    # Extract MFCCs and add mean and variance of each MFCC
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    for i in range(1, 21):
        features[f"mfcc{i}_mean"] = np.mean(mfccs[i - 1])
        features[f"mfcc{i}_var"] = np.var(mfccs[i - 1])
    
    return features

# Process each genre folder individually
for genre_folder in os.listdir(input_dir):
    genre_path = os.path.join(input_dir, genre_folder)
    if os.path.isdir(genre_path):
        genre_data = []  # Initialize list to store data for the current genre
        
        # Process each file in the genre folder
        for track_file in os.listdir(genre_path):
            track_path = os.path.join(genre_path, track_file)
            if track_path.endswith(".mp3"):
                try:
                    genre, track_id, _ = track_file.rsplit("-", 2)
                    base_name = os.path.splitext(track_file)[0]
                except ValueError:
                    print(f"Skipping file with unexpected name format: {track_file}")
                    continue
                
                # Log the start time for performance measurement
                start_time = time.time()
                
                # Extract features for the track
                features = extract_features(track_path)
                
                # Calculate and log the time taken
                time_taken = time.time() - start_time
                print(f"Processed {track_file} in {time_taken:.2f} seconds")
                
                # Add file information to features
                features["filename"] = base_name
                features["Genre"] = genre
                features["TrackID"] = track_id
                features["label"] = genre  # Assuming label is the genre
                
                # Append to genre_data list
                genre_data.append(features)
        
        # Create a DataFrame for the current genre and save it as a CSV
        genre_df = pd.DataFrame(genre_data)
        genre_output_file = os.path.join(output_folder, f"{genre_folder}_dataset.csv")
        genre_df.to_csv(genre_output_file, index=False)
        print(f"Saved dataset for genre '{genre_folder}' to {genre_output_file}")

# Combine all genre-specific CSV files into one final dataset
all_dataframes = []
for csv_file in os.listdir(output_folder):
    if csv_file.endswith("_dataset.csv"):
        csv_path = os.path.join(output_folder, csv_file)
        all_dataframes.append(pd.read_csv(csv_path))

# Concatenate all dataframes and save as a single CSV
final_df = pd.concat(all_dataframes, ignore_index=True)
final_output_file = os.path.join(output_folder, "combined_audio_features_dataset.csv")
final_df.to_csv(final_output_file, index=False)

print(f"Combined dataset saved to {final_output_file}")


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  "tempo": librosa.beat.tempo(y=y, sr=sr)[0]


Processed New Age-TRGETDV128E0796895-5.mp3 in 1.75 seconds
Processed New Age-TRGYSPV128F146953B-9.mp3 in 0.40 seconds
Processed New Age-TRCEQQA128F92F48B5-4.mp3 in 0.41 seconds
Processed New Age-TRPZFRA128F146956E-8.mp3 in 0.41 seconds
Processed New Age-TRFZASN128F93045F8-10.mp3 in 0.43 seconds
Processed New Age-TRBLBSD128F1490EF0-2.mp3 in 0.39 seconds
Processed New Age-TROSQYZ128F92DACFC-2.mp3 in 0.39 seconds
Processed New Age-TRNVLKM128F4285E1C-3.mp3 in 0.39 seconds
Processed New Age-TROPHST128F42ABAF9-1.mp3 in 0.38 seconds
Processed New Age-TRFORFZ128F9311116-1.mp3 in 0.39 seconds
Processed New Age-TRFDVAM128F1453AD9-8.mp3 in 0.38 seconds
Processed New Age-TRTUMOU128F930344E-10.mp3 in 0.40 seconds
Processed New Age-TRTUMOU128F930344E-1.mp3 in 0.39 seconds
Processed New Age-TRDAVOG128F93049AB-1.mp3 in 0.40 seconds
Processed New Age-TRYOTWE128F421538E-1.mp3 in 0.46 seconds
Processed New Age-TRDZYDU128F93071CD-1.mp3 in 0.41 seconds
Processed New Age-TRZOAYC128F92F1B4B-3.mp3 in 0.41 sec

  return pitch_tuning(


Processed New Age-TRMLAVO128F1469D94-9.mp3 in 0.42 seconds
Processed New Age-TRIXEIM128F92F1CAE-3.mp3 in 0.40 seconds
Processed New Age-TRDCVAI128F1473EC6-3.mp3 in 0.42 seconds
Processed New Age-TRPKYXC12903CB6B24-3.mp3 in 0.41 seconds
Processed New Age-TRQJNXY128F1469546-4.mp3 in 0.41 seconds
Processed New Age-TRFNBWT128F4291999-1.mp3 in 0.39 seconds
Processed New Age-TRFZZIG128E078AF7E-5.mp3 in 0.39 seconds
Processed New Age-TRADJAG128F93051C2-2.mp3 in 0.38 seconds
Processed New Age-TRJUSRX128F422025C-3.mp3 in 0.41 seconds
Processed New Age-TRXFWIW128F92EC174-6.mp3 in 0.41 seconds
Processed New Age-TRTXAUD128F4264530-9.mp3 in 0.40 seconds
Processed New Age-TRFZASN128F93045F8-7.mp3 in 0.40 seconds
Processed New Age-TRNIXMN128F4295F70-2.mp3 in 0.40 seconds
Processed New Age-TRUVWIP128F422E647-8.mp3 in 0.41 seconds
Processed New Age-TRXBHXO128F1458F38-8.mp3 in 0.38 seconds
Processed New Age-TRFZASN128F93045F8-5.mp3 in 0.41 seconds
Processed New Age-TRTQYNR128F14881D7-8.mp3 in 0.40 secon

In [7]:
import pandas as pd
import os

# List of genres to include
genres_to_include = ['Blues', 'Country', 'Electronic', 'Folk', 'Jazz', 'Latin', 'Metal', 'Pop', 'Rap', 'Reggae', 'Rock']

# Set the folder where the CSV files are saved
output_folder = "../GeneratedData/3secondsData"

# Initialize an empty list to store DataFrames
dataframes_to_combine = []

# Loop through the genres and read their corresponding CSV files
for genre in genres_to_include:
    genre_csv_file = os.path.join(output_folder, f"{genre}_dataset.csv")
    if os.path.exists(genre_csv_file):
        genre_df = pd.read_csv(genre_csv_file)
        dataframes_to_combine.append(genre_df)
    else:
        print(f"File for genre '{genre}' not found, skipping.")

# Combine all the DataFrames into one
combined_df = pd.concat(dataframes_to_combine, ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_output_file = os.path.join(output_folder, "BluesCountryElectronicFolkJazzLatinMetalPopRapReggaeRock_Dataset.csv")
combined_df.to_csv(combined_output_file, index=False)

print(f"Combined dataset saved to {combined_output_file}")


Combined dataset saved to ../GeneratedData/3secondsData/BluesCountryElectronicFolkJazzLatinMetalPopRapReggaeRock_Dataset.csv
