In [7]:
# --- Step 0: Import Libraries --- #
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [8]:
# --- Step 1: Load the Pre-Processed GTZAN Data --- #
# Location of the feature CSV, given relative to the notebook's working directory.
file_path = '../datasets/features_30_sec.csv'

print("Loading the clean, pre-processed GTZAN dataset...")
# Read the whole CSV into memory; later cells reference it as `df`.
df = pd.read_csv(filepath_or_buffer=file_path)
print("Data loaded successfully.")

Loading the clean, pre-processed GTZAN dataset...
Data loaded successfully.


In [9]:
# --- Step 2: Prepare the DataFrame ---#
# Separate the loaded table into the model inputs (X) and the target (y).
print("Preparing the dataset...")

# Columns that must not be used as model inputs: the target itself ('label')
# plus 'filename' and 'length', which the notebook excludes from the features.
non_feature_columns = ['filename', 'label', 'length']

y = df['label']
X = df.drop(columns=non_feature_columns)
print("Dataset prepared.")

Preparing the dataset...
Dataset prepared.


In [10]:
# --- Step 3: Preprocessing (Encoding, Splitting, Scaling) --- #
print("Preprocessing data...")

# Turn the genre labels into integer class ids. The fitted encoder stays
# available as `label_encoder` for mapping ids back to label values.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows as a test set. Stratifying on the encoded label
# keeps the class proportions the same in both partitions, and the fixed
# seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

# Standardise the features. The scaler's statistics come from the training
# partition only, and the same fitted scaler is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Preprocessing complete.")

Preprocessing data...
Preprocessing complete.


In [11]:
# --- Step 4: Train and Evaluate the BASELINE Random Forest Model --- #
# Untuned reference model: 100 trees, fixed seed, all CPU cores.
print("Training the BASELINE Random Forest model...")
baseline_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
baseline_model.fit(X_train_scaled, y_train)

# Score the baseline on the held-out test partition.
baseline_predictions = baseline_model.predict(X_test_scaled)
baseline_accuracy = accuracy_score(y_true=y_test, y_pred=baseline_predictions)
print(f"Baseline Model Accuracy: {baseline_accuracy * 100:.2f}%")

Training the BASELINE Random Forest model...
Baseline Model Accuracy: 69.00%


In [12]:
# --- Step 5: NEW - Hyperparameter Tuning with GridSearchCV --- #
print("\nStarting Hyperparameter Tuning (this may take a few minutes)...")

# Candidate values for each Random Forest hyperparameter. Every combination
# is evaluated: 2 * 3 * 2 * 2 = 24 candidates.
param_grid = dict(
    n_estimators=[100, 200],      # number of trees in the forest
    max_depth=[10, 20, 30],       # maximum depth of each tree
    min_samples_leaf=[1, 2],      # minimum samples required at a leaf node
    min_samples_split=[2, 5],     # minimum samples required to split a node
)

# Estimator that is cloned for every candidate: fixed seed, all CPU cores.
forest_template = RandomForestClassifier(random_state=42, n_jobs=-1)

# Exhaustive search with 3-fold cross-validation (24 candidates * 3 folds = 72 fits).
# verbose=2 prints one progress line per individual fit.
grid_search = GridSearchCV(
    estimator=forest_template,
    param_grid=param_grid,
    cv=3,
    verbose=2,
)

# Run the search on the scaled training partition only; the test partition
# is not touched here.
grid_search.fit(X_train_scaled, y_train)

print("\nHyperparameter Tuning complete.")
print(f"Best parameters found: {grid_search.best_params_}")


Starting Hyperparameter Tuning (this may take a few minutes)...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_

In [13]:
# --- Step 6: Evaluate the BEST Model --- #
# Score the tuned model on the held-out test partition: overall accuracy
# first, then a per-genre precision/recall/F1 breakdown.
print("\n--- Final Model Evaluation ---")
# Get the best model found by the grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
predictions = best_model.predict(X_test_scaled)
final_accuracy = accuracy_score(y_test, predictions)

print(f"\nFinal Tuned Model Accuracy: {final_accuracy * 100:.2f}%")

# A single accuracy figure hides which genres the model confuses, so also
# report per-class metrics. `labels` is passed explicitly so every encoded
# class gets a row (and lines up with `target_names`) even if one is missing
# from the predictions; zero_division=0 reports 0 for such a class instead of
# emitting an undefined-metric warning.
genre_names = [str(name) for name in label_encoder.classes_]
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        predictions,
        labels=np.arange(len(genre_names)),
        target_names=genre_names,
        zero_division=0,
    )
)


--- Final Model Evaluation ---

Final Tuned Model Accuracy: 69.50%
