In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Step 1: Load and Preprocess Dataset
file_path = "../Data/features_30_sec.csv"  # training dataset
# Read the CSV and discard the "filename" column in a single chained
# expression, so `data` holds every remaining column of the file.
data = pd.read_csv(file_path).drop(columns="filename")

In [3]:
# Separate features and target.
# The feature columns are selected by name rather than by position, so the
# split stays correct (the label never ends up inside X and no real feature is
# silently dropped) even if "label" is not the last column of the CSV.
X = data.drop(columns="label")  # every column except the target
y = data["label"]  # target column

In [4]:
# Targets: turn the label strings into integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Features: standardize each column using statistics of the full dataset.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# One "balanced" weight per class, to handle potential class imbalance.
encoded_classes = np.unique(y_encoded)
class_weights = compute_class_weight(
    class_weight="balanced", classes=encoded_classes, y=y_encoded
)
class_weight_dict = {
    encoded_class: weight
    for encoded_class, weight in zip(encoded_classes, class_weights)
}

# Expand the per-class weights into one weight per row; this array is what
# gets passed to fit() as sample_weight.
sample_weights = np.array(
    [class_weight_dict[encoded_label] for encoded_label in y_encoded]
)

# Step 2: Train the XGBoost Model using Stratified Cross-Validation
# (the estimator is only configured here; fitting happens further down).
model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.01,
    max_depth=6,
    subsample=0.8,
    random_state=42,
)



In [5]:
# Stratified K-Fold Cross-Validation
# Each fold keeps the class proportions of the full dataset; the fixed seed
# makes the shuffled split reproducible.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []  # one held-out accuracy per fold

# Split on the unscaled features: the split itself only depends on the number
# of rows and on the labels, so the folds are the same as before.
for train_idx, test_idx in kfold.split(X, y_encoded):
    y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

    # Fit the scaler on the training fold ONLY and reuse its statistics for the
    # held-out fold. Scaling with a scaler fitted on the whole dataset would
    # let information about the held-out rows leak into preprocessing.
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X.iloc[train_idx])
    X_test = fold_scaler.transform(X.iloc[test_idx])

    # Generate sample weights for the current split
    train_sample_weights = sample_weights[train_idx]

    # Fit model with class weights (sample_weight)
    model.fit(X_train, y_train, sample_weight=train_sample_weights)

    # Evaluate model on the held-out fold
    y_pred = model.predict(X_test)
    cv_results.append(accuracy_score(y_test, y_pred))

print(f"Cross-validation accuracy: {np.mean(cv_results):.4f}")


Cross-validation accuracy: 0.7730


In [6]:
# Step 3: Train on the Entire Dataset and Compute Training-Set Metrics
model.fit(X_scaled, y_encoded, sample_weight=sample_weights)

# NOTE: the predictions below are made on the very rows the model was just
# fitted on, so every metric in this cell is an in-sample (training) score.
# It shows how closely the model fits the training data and is NOT an estimate
# of accuracy on unseen data; use the cross-validation accuracy for that.
y_pred_overall = model.predict(X_scaled)

# Compute accuracy on the training data
overall_accuracy = accuracy_score(y_encoded, y_pred_overall)
print(f"\nTraining-set accuracy (in-sample): {overall_accuracy:.4f}")

# Compute confusion matrix on the training data
# (rows are true classes, columns are predicted classes)
cm_overall = confusion_matrix(y_encoded, y_pred_overall)
print("\nTraining-set confusion matrix (in-sample):")
print(cm_overall)


Overall Model Accuracy: 0.9920

Overall Confusion Matrix:
[[100   0   0   0   0   0   0   0   0   0]
 [  0 100   0   0   0   0   0   0   0   0]
 [  0   0 100   0   0   0   0   0   0   0]
 [  0   0   1  98   1   0   0   0   0   0]
 [  0   0   0   0  99   0   1   0   0   0]
 [  0   0   1   0   0  99   0   0   0   0]
 [  0   0   0   0   0   0  99   0   0   1]
 [  0   0   0   0   0   0   0 100   0   0]
 [  0   0   1   0   0   0   0   1  98   0]
 [  0   0   1   0   0   0   0   0   0  99]]


In [7]:
# Display Classification Report
# Per-class precision / recall / F1 for the predictions in y_pred_overall, with
# rows labelled by the encoder's original class names. y_pred_overall was
# predicted on the same data the model was fitted on, so these are in-sample
# figures.
report_text = classification_report(
    y_true=y_encoded,
    y_pred=y_pred_overall,
    target_names=label_encoder.classes_,
)
print("\nClassification Report:")
print(report_text)


Classification Report:
              precision    recall  f1-score   support

       blues       1.00      1.00      1.00       100
   classical       1.00      1.00      1.00       100
     country       0.96      1.00      0.98       100
       disco       1.00      0.98      0.99       100
      hiphop       0.99      0.99      0.99       100
        jazz       1.00      0.99      0.99       100
       metal       0.99      0.99      0.99       100
         pop       0.99      1.00      1.00       100
      reggae       1.00      0.98      0.99       100
        rock       0.99      0.99      0.99       100

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000

