## PHASE 1:

In [2]:
import os
import librosa
import numpy as np
import pandas as pd

# Root directory scanned for audio: every sub-folder directly below it is
# searched for ".mp3" files by the extraction loop further down.
# Other dataset locations, left commented out:
# fma_small_path = "/home/mohid/Desktop/BIG_DATA_PROJECT/fma_small/"
# /media/abdullah/Laxer_SSD/Semester_6/Big_Data/Project/fma_medium/
fma_medium_path = "/home/abdullah/Documents/Big Data Project/"
# Define a function to extract features from audio files
def extract_features(audio_path, max_length=1000):
    """Return a flat, fixed-size feature vector for one audio file.

    The file is loaded with ``librosa.load``; MFCCs, the spectral centroid
    and the zero-crossing rate are computed, each one is padded or truncated
    to ``max_length`` along its last axis, the three results are stacked
    along axis 0 and the stack is flattened.

    Args:
        audio_path: Path of the audio file to read.
        max_length: Size every feature is forced to along its last axis
            before stacking.

    Returns:
        A 1-D array of features, or ``None`` when the file could not be
        loaded (the error is printed rather than raised).
    """
    print(f"Reading audio file: {audio_path}")

    # Loading is best-effort: an unreadable file is reported and skipped.
    try:
        signal, sample_rate = librosa.load(audio_path)
    except Exception as load_error:
        print(f"Error reading {audio_path}: {load_error}")
        return None

    # Per-frame descriptors of the decoded signal.
    mfcc_frames = librosa.feature.mfcc(y=signal, sr=sample_rate)
    centroid_frames = librosa.feature.spectral_centroid(y=signal, sr=sample_rate)
    zcr_frames = librosa.feature.zero_crossing_rate(signal)

    # Force a common size so every file yields a vector of the same length,
    # then stack the descriptors on top of each other.
    stacked = np.concatenate(
        [
            librosa.util.fix_length(mfcc_frames, size=max_length, axis=1),
            librosa.util.fix_length(centroid_frames, size=max_length),
            librosa.util.fix_length(zcr_frames, size=max_length),
        ],
        axis=0,
    )

    # Flatten so each file becomes a single row of values.
    return stacked.flatten()

# Feature rows extracted since the last checkpoint, and their track IDs
# (kept in lockstep: track_ids[i] belongs to features_list[i]).
features_list = []
track_ids = []

# Number of audio files whose features were extracted successfully.
file_count = 0

# Every row saved so far; stays None until the first checkpoint is written.
all_features_df = None


def _checkpoint_features(saved_df, pending_features, pending_ids,
                         csv_path='audio_features_partial.csv'):
    """Merge the pending rows into ``saved_df`` and rewrite the CSV file.

    Args:
        saved_df: DataFrame holding the rows saved so far, or ``None`` when
            nothing has been saved yet.
        pending_features: Non-empty list of 1-D feature arrays, one per file.
        pending_ids: Track IDs matching ``pending_features`` one-to-one.
        csv_path: File that receives the complete table (overwritten).

    Returns:
        The DataFrame containing the previously saved and the pending rows.
    """
    column_names = [f"feature_{i}" for i in range(pending_features[0].shape[0])]
    batch_df = pd.DataFrame(data=pending_features, columns=column_names)
    batch_df['track_id'] = pending_ids

    if saved_df is None:
        combined_df = batch_df
    else:
        combined_df = pd.concat([saved_df, batch_df], ignore_index=True)

    combined_df.to_csv(csv_path, index=False)
    return combined_df


# Walk every sub-folder of the dataset root and process its ".mp3" files.
for folder in os.listdir(fma_medium_path):
    folder_path = os.path.join(fma_medium_path, folder)
    if not os.path.isdir(folder_path):
        continue

    for filename in os.listdir(folder_path):
        if not filename.endswith(".mp3"):
            continue

        audio_path = os.path.join(folder_path, filename)
        audio_features = extract_features(audio_path)
        if audio_features is None:
            # The file could not be read; extract_features already reported it.
            continue

        features_list.append(audio_features)
        track_ids.append(filename.split('.')[0])  # Assuming the filename is in the format 'trackID.mp3'
        file_count += 1

        # Checkpoint every 50 processed files so a crash loses little work.
        if file_count % 50 == 0:
            all_features_df = _checkpoint_features(all_features_df, features_list, track_ids)
            features_list = []
            track_ids = []
            print(f"Partial audio features saved. Total processed: {file_count}")

# Save whatever is left after the last full batch. The previous check
# (file_count == number of entries in the dataset root) compared a file count
# with a folder count, so the trailing rows were normally never written.
if features_list:
    all_features_df = _checkpoint_features(all_features_df, features_list, track_ids)
    features_list = []
    track_ids = []
    print(f"Partial audio features saved. Total processed: {file_count}")

print("Audio features extracted and saved to a partial CSV file.")

Audio features extracted and saved to a partial CSV file.
