In [21]:
# ### Step 0: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [22]:
# ### Step 1: Load the Data
# Relative locations of the two input CSV files; edit these if the files
# live somewhere else.
tracks_path = '../dataset/tracks.csv'
features_path = '../dataset/features.csv'

print("Loading data...")

# Both files have stacked header rows. The first two rows of the tracks file
# and the first three rows of the features file are read as multi-level
# column labels; the first column of each file becomes the row index.
tracks = pd.read_csv(tracks_path, header=[0, 1], index_col=0)
features = pd.read_csv(features_path, header=[0, 1, 2], index_col=0)

print("Data loaded successfully.")

Loading data...
Data loaded successfully.


In [23]:
# ### Step 2: Prepare the DataFrame (The Definitive, Robust Method)
print("Preparing the dataset with a robust pipeline...")

# 1. IDs of the tracks whose ('set', 'subset') metadata value is 'small'.
small_tracks_ids = tracks.loc[tracks[('set', 'subset')] == 'small'].index

# 2. Keep only the IDs present in both the feature table and the 'small' subset.
common_ids = features.index.intersection(small_tracks_ids)

# 3. Select the shared rows: every feature column, plus the
#    ('track', 'genre_top') label column from the metadata.
small_features = features.loc[common_ids]
small_genres = tracks[('track', 'genre_top')].loc[common_ids]

# 4. Work on a copy so `small_features` keeps its multi-level columns.
df_final = small_features.copy()

# 5. Collapse each column tuple into a single underscore-joined string.
#    This is done BEFORE the label is attached, so the label column is added
#    to a frame whose column index is already flat (single-level).
df_final.columns = ['_'.join(map(str, levels)).strip() for levels in small_features.columns]

# 6. Attach the label by position (.values skips index alignment). Both
#    objects were selected with `common_ids`, so their rows are in the same order.
df_final['genre'] = small_genres.values

print("Dataset prepared correctly.")
print(f"Shape of our modeling dataset: {df_final.shape}")



Preparing the dataset with a robust pipeline...
Dataset prepared correctly.
Shape of our modeling dataset: (8000, 519)


In [24]:
# ### Step 3: Define Features (X) and Target (y)
# The target is the 'genre' column; the feature matrix is every other column.
y = df_final['genre']
X = df_final.drop(columns='genre')

In [25]:
# ### Step 4: Preprocessing (Encoding, Splitting, Scaling)
print("Preprocessing data...")

# Turn the genre labels into integer class codes. The fitted encoder is kept
# so the codes can be mapped back to genre names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for testing. The split is stratified on the
# encoded labels and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Preprocessing data...


In [26]:
# ### Step 5: Train and Evaluate the Random Forest Model
print("Training the Random Forest model...")

# 100 trees, seeded for repeatability; n_jobs=-1 requests all available cores.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

print("Model training complete.")

Training the Random Forest model...
Model training complete.


In [27]:
# ### Step 6: Evaluate the Model
print("\n--- Model Evaluation ---")

# Predict on the scaled held-out rows and score against the true labels.
predictions = model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Final Model Accuracy with Correct Pipeline: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1; the encoder's classes_ supply the
# row names in place of the integer codes.
print("\nClassification Report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=predictions,
        target_names=label_encoder.classes_,
    )
)




--- Model Evaluation ---
Final Model Accuracy with Correct Pipeline: 55.81%

Classification Report:
               precision    recall  f1-score   support

   Electronic       0.59      0.57      0.58       200
 Experimental       0.53      0.40      0.46       200
         Folk       0.62      0.75      0.68       200
      Hip-Hop       0.58      0.67      0.62       200
 Instrumental       0.56      0.58      0.57       200
International       0.60      0.60      0.60       200
          Pop       0.34      0.28      0.31       200
         Rock       0.58      0.62      0.60       200

     accuracy                           0.56      1600
    macro avg       0.55      0.56      0.55      1600
 weighted avg       0.55      0.56      0.55      1600

