In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import itertools
import os

# Pool of genres from which the combinations are drawn (edit to customise)
selected_genres = [
    'Rap',
    'Blues',
    'Rock',
    'Electronic',
    'Reggae',
    'Country',
    'Metal',
    'Pop',
    'Jazz',
]

# Function to load the dataset for a given set of genres and perform evaluation
def evaluate_xgb_with_genres(genres, num_estimators, learning_rate):
    """Train an XGBoost classifier on the combined genre datasets and score it.

    Each genre is read from ``../GeneratedData/3secondsData/{genre}_dataset.csv``.
    Missing files are reported and skipped. The combined rows are split 75/25
    (``random_state=42``), features are min-max scaled, labels are integer
    encoded, and accuracy is measured on the held-out split.

    Args:
        genres: Genre names used to build the CSV file names.
        num_estimators: Passed to ``XGBClassifier`` as ``n_estimators``.
        learning_rate: Passed to ``XGBClassifier`` as ``learning_rate``.

    Returns:
        ``(accuracy, genres, num_estimators, learning_rate)``, or ``None`` when
        none of the genre files could be found.

    Raises:
        KeyError: If the combined data lacks one of the ``label``, ``TrackID``,
            ``Genre`` or ``filename`` columns.
    """
    # Load whichever per-genre datasets exist on disk
    data_frames = []
    for genre in genres:
        file_name = f'../GeneratedData/3secondsData/{genre}_dataset.csv'
        if os.path.exists(file_name):
            data_frames.append(pd.read_csv(file_name))
        else:
            print(f"Warning: {file_name} not found.")

    if not data_frames:
        print(f"No datasets found for {genres}. Skipping combination.")
        return None

    # Imported here so the "nothing to load" path above has no sklearn dependency
    from sklearn.preprocessing import LabelEncoder, MinMaxScaler

    # Concatenate all genre datasets into a single dataframe
    data = pd.concat(data_frames, ignore_index=True)

    # Separate features and target (label)
    y = data['label']
    X = data.drop(columns=['label', 'TrackID', 'Genre', 'filename'])

    # Split BEFORE scaling so that no statistic of the test rows reaches the
    # preprocessing step. The split depends only on the row count and the seed,
    # so it selects the same rows as splitting after scaling would.
    # NOTE(review): rows are split individually. If several rows share a
    # TrackID (presumably segments of one track - confirm), the same track can
    # end up in both splits and inflate accuracy; consider a group-aware split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Fit the scaler on the training rows only, then apply it to both splits.
    # Test values may fall slightly outside [0, 1]; that is expected.
    min_max_scaler = MinMaxScaler()
    X_train = pd.DataFrame(
        min_max_scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        min_max_scaler.transform(X_test), columns=X.columns, index=X_test.index
    )

    # Encode the labels as integers (classes are learned from the training split)
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    y_test_encoded = le.transform(y_test)

    # Define and train the XGBoost model with the given hyperparameters
    xgb = XGBClassifier(n_estimators=num_estimators, learning_rate=learning_rate, random_state=42)
    xgb.fit(X_train, y_train_encoded)

    # Predict and evaluate accuracy on the held-out split
    preds = xgb.predict(X_test)
    accuracy = accuracy_score(y_test_encoded, preds)

    # Return the score together with the configuration that produced it
    return accuracy, genres, num_estimators, learning_rate

# Generate all combinations of the selected genres with at least 8 genres in each combination
genre_combinations = [
    list(combo)
    for size in range(8, len(selected_genres) + 1)
    for combo in itertools.combinations(selected_genres, size)
]

# Combinations of parameters to test
num_estimators_list = [500, 1000, 1500, 5000]
learning_rate_list = [0.1, 0.05, 0.01]

# List to store results
results = []

# Becomes True only when every grid point has been evaluated; lets the
# `finally` block tell a finished run from an interrupted or failed one.
completed = False

try:
    # Same iteration order as nesting the three loops: genres outermost,
    # learning rate innermost.
    for genres, num_estimators, learning_rate in itertools.product(
            genre_combinations, num_estimators_list, learning_rate_list):
        result = evaluate_xgb_with_genres(genres, num_estimators, learning_rate)
        if result is None:
            # No dataset could be loaded for this combination
            continue
        accuracy, genres_used, n_estimators, lr = result
        results.append({
            'Genres': ', '.join(genres_used),
            'Number of Estimators': n_estimators,
            'Learning Rate': lr,
            'Accuracy': accuracy
        })
        print(f"Genres: {', '.join(genres_used)} | "
              f"Number of Estimators: {n_estimators} | "
              f"Learning Rate: {lr} | "
              f"Accuracy: {accuracy}")
    completed = True
finally:
    # The grid is long-running; persist whatever was computed even when the
    # cell is interrupted, so finished evaluations are not thrown away.
    results_df = pd.DataFrame(results)

    # Generate a short form of genre names (first two letters) to append to the file name
    genre_abbr = ''.join([g[:2] for g in selected_genres])

    # Partial runs go to a separate file so they never overwrite a complete one
    suffix = '' if completed else '_partial'
    output_file = f'../GeneratedData/Model_Results_XGB_{genre_abbr}_Combinations{suffix}.csv'

    if completed:
        results_df.to_csv(output_file, index=False)
        print(f'Model evaluation complete. Results saved to {output_file}')
    elif results:
        results_df.to_csv(output_file, index=False)
        print(f'Model evaluation stopped early. {len(results)} partial result(s) saved to {output_file}')
    else:
        print('Model evaluation stopped before any result was produced; nothing saved.')


Genres: Rap, Blues, Rock, Electronic, Reggae, Country, Metal, Pop | Number of Estimators: 500 | Learning Rate: 0.01 | Accuracy: 0.6328671328671329
Genres: Rap, Blues, Rock, Electronic, Reggae, Country, Metal, Pop | Number of Estimators: 500 | Learning Rate: 0.05 | Accuracy: 0.7437562437562437
Genres: Rap, Blues, Rock, Electronic, Reggae, Country, Metal, Pop | Number of Estimators: 500 | Learning Rate: 0.1 | Accuracy: 0.7537462537462537
Genres: Rap, Blues, Rock, Electronic, Reggae, Country, Metal, Pop | Number of Estimators: 500 | Learning Rate: 0.025 | Accuracy: 0.7047952047952047
Genres: Rap, Blues, Rock, Electronic, Reggae, Country, Metal, Pop | Number of Estimators: 1000 | Learning Rate: 0.01 | Accuracy: 0.6818181818181818
Genres: Rap, Blues, Rock, Electronic, Reggae, Country, Metal, Pop | Number of Estimators: 1000 | Learning Rate: 0.05 | Accuracy: 0.7577422577422578
Genres: Rap, Blues, Rock, Electronic, Reggae, Country, Metal, Pop | Number of Estimators: 1000 | Learning Rate: 0.1 

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import os

# The single, fixed genre set evaluated in this cell
selected_genres = [
    'Blues',
    'Country',
    'Jazz',
    'Metal',
    'Pop',
    'Reggae',
    'Rock',
]

# Function to load the dataset for a given set of genres and perform evaluation
def evaluate_xgb_with_genres(genres, num_estimators, learning_rate):
    """Train an XGBoost classifier on the combined genre datasets and score it.

    Each genre is read from ``../GeneratedData/3secondsData/{genre}_dataset.csv``.
    Missing files are reported and skipped. The combined rows are split 75/25
    (``random_state=42``), features are min-max scaled, labels are integer
    encoded, and accuracy is measured on the held-out split.

    Args:
        genres: Genre names used to build the CSV file names.
        num_estimators: Passed to ``XGBClassifier`` as ``n_estimators``.
        learning_rate: Passed to ``XGBClassifier`` as ``learning_rate``.

    Returns:
        ``(accuracy, genres, num_estimators, learning_rate)``, or ``None`` when
        none of the genre files could be found.

    Raises:
        KeyError: If the combined data lacks one of the ``label``, ``TrackID``,
            ``Genre`` or ``filename`` columns.
    """
    # Load whichever per-genre datasets exist on disk
    data_frames = []
    for genre in genres:
        file_name = f'../GeneratedData/3secondsData/{genre}_dataset.csv'
        if os.path.exists(file_name):
            data_frames.append(pd.read_csv(file_name))
        else:
            print(f"Warning: {file_name} not found.")

    if not data_frames:
        print(f"No datasets found for {genres}. Skipping combination.")
        return None

    # Imported here so the "nothing to load" path above has no sklearn dependency
    from sklearn.preprocessing import LabelEncoder, MinMaxScaler

    # Concatenate all genre datasets into a single dataframe
    data = pd.concat(data_frames, ignore_index=True)

    # Separate features and target (label)
    y = data['label']
    X = data.drop(columns=['label', 'TrackID', 'Genre', 'filename'])

    # Split BEFORE scaling so that no statistic of the test rows reaches the
    # preprocessing step. The split depends only on the row count and the seed,
    # so it selects the same rows as splitting after scaling would.
    # NOTE(review): rows are split individually. If several rows share a
    # TrackID (presumably segments of one track - confirm), the same track can
    # end up in both splits and inflate accuracy; consider a group-aware split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Fit the scaler on the training rows only, then apply it to both splits.
    # Test values may fall slightly outside [0, 1]; that is expected.
    min_max_scaler = MinMaxScaler()
    X_train = pd.DataFrame(
        min_max_scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        min_max_scaler.transform(X_test), columns=X.columns, index=X_test.index
    )

    # Encode the labels as integers (classes are learned from the training split)
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    y_test_encoded = le.transform(y_test)

    # Define and train the XGBoost model with the given hyperparameters
    xgb = XGBClassifier(n_estimators=num_estimators, learning_rate=learning_rate, random_state=42)
    xgb.fit(X_train, y_train_encoded)

    # Predict and evaluate accuracy on the held-out split
    preds = xgb.predict(X_test)
    accuracy = accuracy_score(y_test_encoded, preds)

    # Return the score together with the configuration that produced it
    return accuracy, genres, num_estimators, learning_rate

# Define hyperparameters to test
num_estimators_list = [500, 1000, 1500, 5000]
learning_rate_list = [0.1, 0.05, 0.01]

# List to store results
results = []

# Becomes True only when every grid point has been evaluated; lets the
# `finally` block tell a finished run from an interrupted or failed one.
completed = False

try:
    # Loop through the parameters for the single set of genres
    for num_estimators in num_estimators_list:
        for learning_rate in learning_rate_list:
            result = evaluate_xgb_with_genres(selected_genres, num_estimators, learning_rate)
            if result is None:
                # No dataset could be loaded for the selected genres
                continue
            accuracy, genres_used, n_estimators, lr = result
            results.append({
                'Genres': ', '.join(genres_used),
                'Number of Estimators': n_estimators,
                'Learning Rate': lr,
                'Accuracy': accuracy
            })
            print(f"Genres: {', '.join(genres_used)} | "
                  f"Number of Estimators: {n_estimators} | "
                  f"Learning Rate: {lr} | "
                  f"Accuracy: {accuracy}")
    completed = True
finally:
    # The grid is long-running; persist whatever was computed even when the
    # cell is interrupted, so finished evaluations are not thrown away.
    results_df = pd.DataFrame(results)

    # Generate a short form of genre names (first two letters) to append to the file name
    genre_abbr = ''.join([g[:2] for g in selected_genres])

    # Partial runs go to a separate file so they never overwrite a complete one
    suffix = '' if completed else '_partial'
    output_file = f'../GeneratedData/Model_Results_XGB_{genre_abbr}_25p{suffix}.csv'

    if completed:
        results_df.to_csv(output_file, index=False)
        print(f'Model evaluation complete. Results saved to {output_file}')
    elif results:
        results_df.to_csv(output_file, index=False)
        print(f'Model evaluation stopped early. {len(results)} partial result(s) saved to {output_file}')
    else:
        print('Model evaluation stopped before any result was produced; nothing saved.')


Genres: Blues, Country, Jazz, Metal, Pop, Reggae, Rock | Number of Estimators: 500 | Learning Rate: 0.1 | Accuracy: 0.723465016658734
Genres: Blues, Country, Jazz, Metal, Pop, Reggae, Rock | Number of Estimators: 500 | Learning Rate: 0.05 | Accuracy: 0.6987148976677773
Genres: Blues, Country, Jazz, Metal, Pop, Reggae, Rock | Number of Estimators: 500 | Learning Rate: 0.01 | Accuracy: 0.5992384578772013
Genres: Blues, Country, Jazz, Metal, Pop, Reggae, Rock | Number of Estimators: 1000 | Learning Rate: 0.1 | Accuracy: 0.7277486910994765
Genres: Blues, Country, Jazz, Metal, Pop, Reggae, Rock | Number of Estimators: 1000 | Learning Rate: 0.05 | Accuracy: 0.7253688719657306
Genres: Blues, Country, Jazz, Metal, Pop, Reggae, Rock | Number of Estimators: 1000 | Learning Rate: 0.01 | Accuracy: 0.6530223702998572
Genres: Blues, Country, Jazz, Metal, Pop, Reggae, Rock | Number of Estimators: 1500 | Learning Rate: 0.1 | Accuracy: 0.7277486910994765
Genres: Blues, Country, Jazz, Metal, Pop, Reggae