In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import itertools
import os

# Every genre label that may take part in an experiment (order is significant:
# it determines the order in which genre combinations are generated).
all_genres = [
    'Blues',
    'Country',
    'Electronic',
    'Folk',
    'Jazz',
    'Latin',
    'Metal',
    'Pop',
    'Punk',
    'Rap',
    'Reggae',
    'RnB',
    'Rock',
    'World',
    'New Age',
]

# Function to load the dataset for a given set of genres and perform evaluation
def evaluate_xgb_with_genres(genres, num_estimators, learning_rate):
    """Train an XGBoost classifier on the given genres and return its test accuracy.

    One CSV per genre is read from ``../GeneratedData/3secondsData/`` (missing
    files are reported and skipped). The frames are concatenated, the
    ``label`` column is used as the target, and the ``label``, ``TrackID``,
    ``Genre`` and ``filename`` columns are removed from the features.

    Args:
        genres: Iterable of genre names; each maps to ``<genre>_dataset.csv``.
        num_estimators: Passed to ``XGBClassifier`` as ``n_estimators``.
        learning_rate: Passed to ``XGBClassifier`` as ``learning_rate``.

    Returns:
        ``(accuracy, genres, num_estimators, learning_rate)`` on success, or
        ``None`` when none of the genre files could be found.
    """
    # Load whichever per-genre datasets are actually present on disk.
    data_frames = []
    for genre in genres:
        file_name = f'../GeneratedData/3secondsData/{genre}_dataset.csv'
        if os.path.exists(file_name):
            data_frames.append(pd.read_csv(file_name))
        else:
            print(f"Warning: {file_name} not found.")

    if not data_frames:
        print(f"No datasets found for {genres}. Skipping combination.")
        return None

    # Imported locally (as in the original cell) because the notebook's
    # top-level imports do not include the preprocessing helpers.
    from sklearn.preprocessing import LabelEncoder, MinMaxScaler

    # Concatenate all genre datasets into a single dataframe
    data = pd.concat(data_frames, ignore_index=True)

    # Separate features and target (label)
    y = data['label']
    X = data.drop(columns=['label', 'TrackID', 'Genre', 'filename'])

    # Split BEFORE scaling so that no statistic of the held-out rows
    # influences preprocessing (25% of the rows are held out for testing).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )

    # Fit the scaler on the training rows only, then apply the same
    # transformation to the test rows. Test values may therefore fall
    # slightly outside [0, 1], which is expected.
    min_max_scaler = MinMaxScaler()
    X_train = pd.DataFrame(
        min_max_scaler.fit_transform(X_train),
        columns=X.columns,
        index=X_train.index,
    )
    X_test = pd.DataFrame(
        min_max_scaler.transform(X_test),
        columns=X.columns,
        index=X_test.index,
    )

    # Encode the string labels as integers for XGBoost.
    # NOTE(review): transform() raises if the test split contains a label
    # that never occurs in the training split — confirm every genre has
    # enough rows for this not to happen.
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    y_test_encoded = le.transform(y_test)

    # Define the XGBoost model with the given hyperparameters
    xgb = XGBClassifier(
        n_estimators=num_estimators,
        learning_rate=learning_rate,
        random_state=42,
    )

    # Train the model
    xgb.fit(X_train, y_train_encoded)

    # Predict and evaluate accuracy on the held-out rows
    preds = xgb.predict(X_test)
    accuracy = accuracy_score(y_test_encoded, preds)

    # Return the score together with the configuration that produced it
    return accuracy, genres, num_estimators, learning_rate

# Generate all combinations of genres with at least 8 genres in each combination
genre_combinations = [
    list(combo)
    for i in range(8, len(all_genres) + 1)
    for combo in itertools.combinations(all_genres, i)
]

# Combinations of parameters to test
num_estimators_list = [500, 1000, 1500, 5000, 10000]
learning_rate_list = [0.01, 0.05, 0.1, 0.025, 0.001]

# List to store results
results = []

# With 15 genres there are 16,384 combinations of size >= 8, each evaluated
# with 25 hyperparameter pairs, so the full sweep is very long-running.
# The loop is wrapped in try/finally so that results gathered so far are
# written to disk even if the run is interrupted or fails part-way through.
completed = False
try:
    # Same iteration order as nesting genres -> estimators -> learning rate.
    for genres, num_estimators, learning_rate in itertools.product(
        genre_combinations, num_estimators_list, learning_rate_list
    ):
        result = evaluate_xgb_with_genres(genres, num_estimators, learning_rate)
        if result is None:
            # No dataset file was found for this combination.
            continue

        accuracy, genres_used, n_estimators, lr = result
        results.append({
            'Genres': ', '.join(genres_used),
            'Number of Estimators': n_estimators,
            'Learning Rate': lr,
            'Accuracy': accuracy
        })
        print(f"Genres: {', '.join(genres_used)} | "
              f"Number of Estimators: {n_estimators} | "
              f"Learning Rate: {lr} | "
              f"Accuracy: {accuracy}")
    completed = True
finally:
    # Save the results to a DataFrame and write to CSV. On an abnormal exit
    # the file is only written when there is something to save, so an
    # earlier results file is not overwritten with an empty one.
    results_df = pd.DataFrame(results)
    if completed or results:
        results_df.to_csv('../GeneratedData/Model_Results_XGB_All_Combinations.csv', index=False)
    if not completed:
        print(f"Model evaluation did not finish; {len(results)} partial result(s) collected.")

print('Model evaluation complete. Results saved to Model_Results_XGB_All_Combinations.csv')



Genres: Blues, Country, Electronic, Folk, Jazz, Latin, Metal, Pop | Number of Estimators: 500 | Learning Rate: 0.01 | Accuracy: 0.6340489266100848
Genres: Blues, Country, Electronic, Folk, Jazz, Latin, Metal, Pop | Number of Estimators: 500 | Learning Rate: 0.05 | Accuracy: 0.7319021467798302
Genres: Blues, Country, Electronic, Folk, Jazz, Latin, Metal, Pop | Number of Estimators: 500 | Learning Rate: 0.1 | Accuracy: 0.7503744383424863
Genres: Blues, Country, Electronic, Folk, Jazz, Latin, Metal, Pop | Number of Estimators: 500 | Learning Rate: 0.025 | Accuracy: 0.6939590614078882
Genres: Blues, Country, Electronic, Folk, Jazz, Latin, Metal, Pop | Number of Estimators: 500 | Learning Rate: 0.001 | Accuracy: 0.49675486769845234
Genres: Blues, Country, Electronic, Folk, Jazz, Latin, Metal, Pop | Number of Estimators: 1000 | Learning Rate: 0.01 | Accuracy: 0.6794807788317524
Genres: Blues, Country, Electronic, Folk, Jazz, Latin, Metal, Pop | Number of Estimators: 1000 | Learning Rate: 0.0

KeyboardInterrupt: 