In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import numpy as np
import librosa
import pandas as pd

In [None]:
sr = 16000
hop_length = 512
n_bins = 84
bins_per_octave = 12
fmin = librosa.note_to_hz('C1')

In [None]:
def energy_entropy(spectrogram):
    eps = 1e-10
    energy = np.sum(spectrogram**2, axis=0)
    if np.sum(energy) == 0:
        return 0
    energy /= np.sum(energy) + eps
    entropy = -np.sum(energy * np.log2(energy + eps))
    return entropy

def extract_features(spectrogram_db):
    if spectrogram_db.size == 0:
        print("Warning: Empty spectrogram detected, skipping feature extraction.")
        return None

    spectrogram_power = librosa.db_to_power(spectrogram_db)

    spectral_centroid = librosa.feature.spectral_centroid(S=spectrogram_power).mean()
    spectral_bandwidth = librosa.feature.spectral_bandwidth(S=spectrogram_power).mean()
    spectral_rolloff = librosa.feature.spectral_rolloff(S=spectrogram_power).mean()
    spectral_flatness = librosa.feature.spectral_flatness(S=spectrogram_power).mean()

    rms_energy = np.sqrt(np.mean(spectrogram_power**2, axis=1)).mean()
    entropy_val = energy_entropy(spectrogram_power)

    mean_spectral = np.mean(spectrogram_power)
    std_spectral = np.std(spectrogram_power)

    feature_dict = {
        "spectral_centroid": spectral_centroid,
        "spectral_bandwidth": spectral_bandwidth,
        "spectral_rolloff": spectral_rolloff,
        "spectral_flatness": spectral_flatness,
        "rms_energy": rms_energy,
        "energy_entropy": entropy_val,
        "mean": mean_spectral,
        "std_dev": std_spectral,
    }

    return feature_dict

def process_folder(main_folder, save_csv_path):
    subband_ranges = {
        "subband_1": (0, 11),
        "subband_2": (12, 23),
        "subband_3": (24, 35),
        "subband_4": (36, 47),
        "subband_5": (48, 59),
        "subband_6": (60, 71),
        "subband_7": (72, 83)
    }

    all_features_list = []

    for root, _, files in os.walk(main_folder):
        set_folder = os.path.basename(root)
        for npy_file in files:
            if npy_file.endswith(".npy"):
                file_path = os.path.join(root, npy_file)
                print(f"Processing: {file_path}")
                class_label = 1 if "forged" in npy_file.lower() else 0
                genre_label = get_genre_from_filename(npy_file)

                log_cqt = np.load(file_path)
                cqt_frequencies = librosa.cqt_frequencies(n_bins=n_bins, fmin=fmin, bins_per_octave=bins_per_octave)
                all_features = {"file_name": npy_file, "genre": genre_label}

                for subband_name, (start_bin, end_bin) in subband_ranges.items():
                    if start_bin >= log_cqt.shape[0]:
                        print(f"Skipping {subband_name}: Start bin {start_bin} out of range for {npy_file}.")
                        continue
                    end_bin = min(end_bin, log_cqt.shape[0] - 1)
                    subband_cqt = log_cqt[start_bin:end_bin + 1, :]

                    if subband_cqt.size == 0:
                        print(f"Skipping {subband_name}: Empty subband detected for {npy_file}.")
                        continue

                    feature_dict = extract_features(subband_cqt)

                    try:
                        subband_cqt_mag = librosa.db_to_amplitude(subband_cqt)
                        chroma = librosa.feature.chroma_cqt(C=subband_cqt_mag, sr=sr)
                        chroma_mean = np.mean(chroma, axis=1)
                        chroma_summary = np.mean(chroma_mean)  # ONE feature
                        if feature_dict:
                            feature_dict["chroma"] = chroma_summary
                        else:
                            feature_dict = {"chroma": chroma_summary}
                    except Exception as e:
                        print(f"Chroma extraction failed for {subband_name} in {npy_file}: {e}")
                        if feature_dict:
                            feature_dict["chroma"] = 0.0  # fallback
                        else:
                            feature_dict = {"chroma": 0.0}

                    if feature_dict:
                        for key, value in feature_dict.items():
                            all_features[f"{subband_name}_{key}"] = value

                if all_features:
                    all_features["class"] = class_label
                    all_features_list.append(all_features)

    if all_features_list:
        feature_df = pd.DataFrame(all_features_list)
        feature_df.to_csv(save_csv_path, index=False)
        print(f"Features saved in {save_csv_path}")
    else:
        print("No valid features extracted.")

***For Original Tracks***

In [None]:
def get_genre_from_filename(filename):
    genre_map = {
        "cl": "classical",
        "c": "country",
        "e": "electronic",
        "f": "folk",
        "h": "hip hop",
        "j": "jazz",
        "o": "old historic",
        "p": "pop",
        "r": "rock",
        "s": "soul RnB",
    }
    # Split by "."
    parts = filename.split(".")
    if parts:
        # Extract alphabetic characters from the first part
        prefix = "".join(filter(str.isalpha, parts[0]))
        if prefix in genre_map:
            return genre_map[prefix]
    return None

In [None]:
main_folder = "/content/drive/MyDrive/CQT_Spectrograms/original/npy_files"
save_csv_path = "/content/drive/MyDrive/subband_features/original_tracks_features.csv"

process_folder(main_folder, save_csv_path)

Processing: /content/drive/MyDrive/CQT_Spectrograms/original/npy_files/o201.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/original/npy_files/o202.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/original/npy_files/o203.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/original/npy_files/o204.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/original/npy_files/o205.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/original/npy_files/o206.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/original/npy_files/o207.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/original/npy_files/o208.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/original/npy_files/o209.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/original/npy_files/o210.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/original/npy_files/o211.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/original/npy_files/o212.npy
Processing: /content/drive/MyDrive/CQT_S

***For Forged Tracks***

In [None]:
def get_genre_from_filename(filename):
    genre_map = {
        "cl": "classical",
        "c": "country",
        "e": "electronic",
        "f": "folk",
        "h": "hip hop",
        "j": "jazz",
        "o": "old historic",
        "p": "pop",
        "r": "rock",
        "s": "soul RnB",
    }
    # First split by "."
    parts = filename.split(".")
    if parts:
        # Take the first part and split by "_"
        sub_parts = parts[0].split("_")
        if sub_parts:
            prefix = "".join(filter(str.isalpha, sub_parts[0]))
            if prefix in genre_map:
                return genre_map[prefix]
    return None

In [None]:
main_folder = "/content/drive/MyDrive/CQT_Spectrograms/forged/npy_files"
save_csv_path = "/content/drive/MyDrive/subband_features/forged_tracks_features.csv"

process_folder(main_folder, save_csv_path)

Processing: /content/drive/MyDrive/CQT_Spectrograms/forged/npy_files/j224_forged.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/forged/npy_files/e231_forged.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/forged/npy_files/r022_forged.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/forged/npy_files/h155_forged.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/forged/npy_files/p205_forged.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/forged/npy_files/f111_forged.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/forged/npy_files/h132_forged.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/forged/npy_files/h080_forged.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/forged/npy_files/c152_forged.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/forged/npy_files/s050_forged.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/forged/npy_files/h009_forged.npy
Processing: /content/drive/MyDrive/CQT_Spectrograms/forged/npy_fi

***Combine CSVs (Original + Forged)***

In [None]:
import pandas as pd


df1 = pd.read_csv('/content/drive/MyDrive/subband_features/original_tracks_features.csv')
df2 = pd.read_csv('/content/drive/MyDrive/subband_features/forged_tracks_features.csv')


merged_df = pd.concat([df1, df2], ignore_index=True)

merged_df.to_csv('/content/drive/MyDrive/subband_features/all_tracks_features.csv', index=False)

print("Saved as 'all_tracks_features.csv'.")


Saved as 'all_tracks_features.csv'.


***MinMax Scaling***

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Define the directory in Google Drive where your files will be located
drive_path = '/content/drive/MyDrive/subband_features'  # Adjust this path if needed

def split_and_scale_csv(input_csv_file, output_csv_prefix="scaled_output"):
    """
    Splits a CSV file into two CSV files based on the 'class' column (0 or 1),
    shuffles the data, and then applies Min-Max scaling to the numerical
    columns of each resulting CSV separately. Reads from and writes to Google Drive.

    Args:
        input_csv_file (str): Name of the input CSV file in Google Drive.
        output_csv_prefix (str): Prefix for the output CSV file names in Google Drive.
                                   The output files will be named as
                                   f"{output_csv_prefix}_class_0.csv" and
                                   f"{output_csv_prefix}_class_1.csv".
    """
    input_file_path = os.path.join(drive_path, input_csv_file)

    try:
        # Read the CSV file into a pandas DataFrame from Google Drive
        df = pd.read_csv(input_file_path)

        # Check if the 'class' column exists
        if 'class' not in df.columns:
            print(f"Error: 'class' column not found in the input CSV file: {input_csv_file}")
            return

        # Shuffle the DataFrame
        df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

        # Separate the DataFrame based on the 'class' column
        df_class_0 = df_shuffled[df_shuffled['class'] == 0].copy()
        df_class_1 = df_shuffled[df_shuffled['class'] == 1].copy()

        # Identify numerical columns (excluding the 'class' column)
        numerical_cols_0 = df_class_0.select_dtypes(include=['number']).columns.tolist()
        if 'class' in numerical_cols_0:
            numerical_cols_0.remove('class')

        numerical_cols_1 = df_class_1.select_dtypes(include=['number']).columns.tolist()
        if 'class' in numerical_cols_1:
            numerical_cols_1.remove('class')

        # Apply Min-Max scaling to class 0 data
        if numerical_cols_0:
            scaler_0 = MinMaxScaler()
            df_class_0[numerical_cols_0] = scaler_0.fit_transform(df_class_0[numerical_cols_0])
            print(f"Min-Max scaling applied to numerical columns of class 0.")
        else:
            print("No numerical columns found (excluding 'class') for class 0 to scale.")

        # Apply Min-Max scaling to class 1 data
        if numerical_cols_1:
            scaler_1 = MinMaxScaler()
            df_class_1[numerical_cols_1] = scaler_1.fit_transform(df_class_1[numerical_cols_1])
            print(f"Min-Max scaling applied to numerical columns of class 1.")
        else:
            print("No numerical columns found (excluding 'class') for class 1 to scale.")

        # Define output file paths in Google Drive
        output_file_path_0 = os.path.join(drive_path, f"{output_csv_prefix}_class_0.csv")
        output_file_path_1 = os.path.join(drive_path, f"{output_csv_prefix}_class_1.csv")

        # Save the scaled DataFrames to new CSV files in Google Drive
        df_class_0.to_csv(output_file_path_0, index=False)
        df_class_1.to_csv(output_file_path_1, index=False)

        print(f"\nSuccessfully split and scaled '{input_csv_file}' into:")
        print(f"- '{output_csv_prefix}_class_0.csv' (containing scaled rows with class 0 in Google Drive)")
        print(f"- '{output_csv_prefix}_class_1.csv' (containing scaled rows with class 1 in Google Drive)")

    except FileNotFoundError:
        print(f"Error: Input file '{input_csv_file}' not found in Google Drive at '{drive_path}'.")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    # Example usage: Replace 'your_input_file.csv' with the actual name
    # of your CSV file in your Google Drive root directory (or adjust drive_path).
    input_file = "all_tracks_features.csv"
    split_and_scale_csv(input_file)

Min-Max scaling applied to numerical columns of class 0.
Min-Max scaling applied to numerical columns of class 1.

Successfully split and scaled 'all_tracks_features.csv' into:
- 'scaled_output_class_0.csv' (containing scaled rows with class 0 in Google Drive)
- 'scaled_output_class_1.csv' (containing scaled rows with class 1 in Google Drive)


***Split by genre***

In [None]:
import pandas as pd
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the directory in Google Drive where your files are located
drive_path = '/content/drive/MyDrive/subband_features'  # Adjust this path if needed

genres = ['classical', 'country', 'electronic', 'folk', 'hip hop', 'jazz', 'old historic', 'pop', 'rock', 'soul RnB']

def split_by_genre(input_csv_file):
    '''Splits a CSV file by genre and saves each genre to a separate CSV file in Google Drive.'''
    input_file_path = os.path.join(drive_path, input_csv_file)

    try:
        df = pd.read_csv(input_file_path)

        if 'genre' not in df.columns:
            print(f"Error: 'genre' column not found in {input_csv_file}")
            return

        for genre in genres:
            genre_df = df[df['genre'] == genre]
            if not genre_df.empty:  # Only save if there are rows for this genre
                output_file_name = f"{os.path.splitext(input_csv_file)[0]}_{genre.replace(' ', '_')}.csv" # Replace spaces in genre name with underscores
                output_file_path = os.path.join(drive_path, output_file_name)
                genre_df.to_csv(output_file_path, index=False)
                print(f"Saved {genre} data to {output_file_name}")
            else:
                print(f"No data found for genre '{genre}' in {input_csv_file}")

    except FileNotFoundError:
        print(f"Error: Input file '{input_csv_file}' not found in Google Drive at '{drive_path}'.")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    # Example usage:  Make sure 'class_0.csv' and 'class_1.csv' are in your Google Drive root (or adjust drive_path)
    split_by_genre('scaled_output_class_0.csv')
    split_by_genre('scaled_output_class_1.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved classical data to scaled_output_class_0_classical.csv
Saved country data to scaled_output_class_0_country.csv
Saved electronic data to scaled_output_class_0_electronic.csv
Saved folk data to scaled_output_class_0_folk.csv
Saved hip hop data to scaled_output_class_0_hip_hop.csv
Saved jazz data to scaled_output_class_0_jazz.csv
Saved old historic data to scaled_output_class_0_old_historic.csv
Saved pop data to scaled_output_class_0_pop.csv
Saved rock data to scaled_output_class_0_rock.csv
Saved soul RnB data to scaled_output_class_0_soul_RnB.csv
Saved classical data to scaled_output_class_1_classical.csv
Saved country data to scaled_output_class_1_country.csv
Saved electronic data to scaled_output_class_1_electronic.csv
Saved folk data to scaled_output_class_1_folk.csv
Saved hip hop data to scaled_output_class_1_hip_hop.csv
Saved jazz data to scaled_outpu

***Train/Val/Test Split***

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from google.colab import drive
import numpy as np

# Mount Google Drive
drive.mount('/content/drive')

# Define the directory in Google Drive
drive_path = '/content/drive/MyDrive/subband_features'  # Adjust if needed

def split_data_with_distribution_control(input_files, drive_path, test_size=0.15, val_size=0.15, random_state=42):
    """
    Splits multiple CSV files (assumed to represent different classes and genres) into training,
    validation, and test sets, ensuring that the class and genre distributions are maintained
    across the resulting sets.  Each input CSV is assumed to have 250 rows.

    Args:
        input_files (list): List of input CSV file names in Google Drive.
        drive_path (str):  Path to the directory in Google Drive.
        test_size (float): Proportion of the data for the test set.
        val_size (float):  Proportion of the data for the validation set.
        random_state (int): Controls data shuffling for reproducibility.
    """
    # Load all dataframes
    dfs = []
    for file in input_files:
        file_path = os.path.join(drive_path, file)
        try:
            df = pd.read_csv(file_path)
            if len(df) != 250:
                print(f"Error: {file} does not have 250 rows. Skipping.")
                return
            dfs.append(df)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}")
            return

    # Combine all dataframes
    combined_df = pd.concat(dfs, ignore_index=True)

    # Calculate the number of samples for test and validation sets
    total_samples = len(combined_df)
    test_samples = int(total_samples * test_size)
    val_samples = int(total_samples * val_size)

    # Create empty dataframes for the splits
    train_df = pd.DataFrame()
    val_df = pd.DataFrame()
    test_df = pd.DataFrame()

    # Get unique classes and genres
    classes = combined_df['class'].unique()
    genres = combined_df['genre'].unique()

    for cls in classes:
        for genre in genres:
            # Get the data for the current class and genre
            subset_df = combined_df[(combined_df['class'] == cls) & (combined_df['genre'] == genre)]
            if subset_df.empty:
                continue  # Skip if no data for this combination

            # Split the subset into train, val, and test sets
            train_subset, temp_subset = train_test_split(subset_df, test_size=(test_size + val_size), random_state=random_state)
            val_subset, test_subset = train_test_split(temp_subset, test_size=test_size / (test_size + val_size), random_state=random_state)

            # Add the split subsets to the main dataframes
            train_df = pd.concat([train_df, train_subset], ignore_index=True)
            val_df = pd.concat([val_df, val_subset], ignore_index=True)
            test_df = pd.concat([test_df, test_subset], ignore_index=True)

    # Check if the size of the dataframes is correct
    if (len(test_df) != test_samples or len(val_df) != val_samples):
        print(f"Error: Size of the final dataframes is incorrect.  Expected Test:{test_samples}, Val:{val_samples}, "
              f"Actual Test: {len(test_df)}, Val: {len(val_df)}.")

    # Save the dataframes to CSV files
    train_file_path = os.path.join(drive_path, 'train_set.csv')
    val_file_path = os.path.join(drive_path, 'val_set.csv')
    test_file_path = os.path.join(drive_path, 'test_set.csv')

    train_df.to_csv(train_file_path, index=False)
    val_df.to_csv(val_file_path, index=False)
    test_df.to_csv(test_file_path, index=False)

    print(f"Saved combined train, val, and test sets to:\n{train_file_path}\n{val_file_path}\n{test_file_path}")



if __name__ == "__main__":
    # Example Usage
    input_files = [
        'scaled_output_class_0_classical.csv', 'scaled_output_class_0_country.csv', 'scaled_output_class_0_electronic.csv', 'scaled_output_class_0_folk.csv',
        'scaled_output_class_0_hip_hop.csv', 'scaled_output_class_0_jazz.csv', 'scaled_output_class_0_old_historic.csv', 'scaled_output_class_0_pop.csv',
        'scaled_output_class_0_rock.csv', 'scaled_output_class_0_soul_RnB.csv',
        'scaled_output_class_1_classical.csv', 'scaled_output_class_1_country.csv', 'scaled_output_class_1_electronic.csv', 'scaled_output_class_1_folk.csv',
        'scaled_output_class_1_hip_hop.csv', 'scaled_output_class_1_jazz.csv', 'scaled_output_class_1_old_historic.csv', 'scaled_output_class_1_pop.csv',
        'scaled_output_class_1_rock.csv', 'scaled_output_class_1_soul_RnB.csv'
    ]

    split_data_with_distribution_control(input_files, drive_path)


Mounted at /content/drive
Error: Size of the final dataframes is incorrect.  Expected Test:750, Val:750, Actual Test: 760, Val: 740.
Saved combined train, val, and test sets to:
/content/drive/MyDrive/subband_features/train_set.csv
/content/drive/MyDrive/subband_features/val_set.csv
/content/drive/MyDrive/subband_features/test_set.csv


***Distribution in Train/Val/Test***

In [None]:
import pandas as pd
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the directory in Google Drive where your files are located
drive_path = '/content/drive/MyDrive/subband_features'  # Adjust this path if needed

def print_record_counts(csv_file, drive_path):
    file_path = os.path.join(drive_path, csv_file)

    try:
        # Read the CSV file into a pandas DataFrame from Google Drive
        df = pd.read_csv(file_path)

        # Print the total number of records
        total_records = len(df)
        print(f"\nFile: {csv_file}")
        print(f"Total Records: {total_records}")

        # Print the number of records for each class and genre combination
        print("\nRecords per Class and Genre Combination:")
        combined_counts = df.groupby(['class', 'genre']).size()
        for (cls, genre), count in combined_counts.items():
            print(f"- Class {cls}, Genre {genre}: {count}")

    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
    except KeyError as e:
        print(f"Error: Required column not found in CSV file: {csv_file}.  Missing column: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    # Example usage: Specify the names of your train, validation, and test CSV files.
    csv_files = ['train_set.csv', 'val_set.csv', 'test_set.csv']  # Adjust these names if needed

    for csv_file in csv_files:
        print_record_counts(csv_file, drive_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

File: train_set.csv
Total Records: 3500

Records per Class and Genre Combination:
- Class 0, Genre classical: 175
- Class 0, Genre country: 175
- Class 0, Genre electronic: 175
- Class 0, Genre folk: 175
- Class 0, Genre hip hop: 175
- Class 0, Genre jazz: 175
- Class 0, Genre old historic: 175
- Class 0, Genre pop: 175
- Class 0, Genre rock: 175
- Class 0, Genre soul RnB: 175
- Class 1, Genre classical: 175
- Class 1, Genre country: 175
- Class 1, Genre electronic: 175
- Class 1, Genre folk: 175
- Class 1, Genre hip hop: 175
- Class 1, Genre jazz: 175
- Class 1, Genre old historic: 175
- Class 1, Genre pop: 175
- Class 1, Genre rock: 175
- Class 1, Genre soul RnB: 175

File: val_set.csv
Total Records: 740

Records per Class and Genre Combination:
- Class 0, Genre classical: 37
- Class 0, Genre country: 37
- Class 0, Genre electronic: 37
- Class 0, Genre fol