In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import numpy as np

# Cargar datos

In [2]:
df = pd.read_csv('musical_skills_smote_final.csv')
display(df.head())

Unnamed: 0,skill_level,accuracy_easy,accuracy_medium,accuracy_hard,avg_response_time,games_played,avg_session_duration
0,intermedio,0.797,0.636,0.397,0.097667,-0.562069,-1.157247
1,principiante,0.74,0.546,0.4,1.244923,-0.70878,0.615031
2,principiante,0.617766,0.445416,0.300894,0.919455,-0.837783,-0.342893
3,intermedio,0.787,0.65,0.434,-0.341517,-0.305324,-0.415363
4,principiante,0.847,0.536,0.363,1.486922,-0.818814,0.216612


In [3]:
numerical_features = ['accuracy_easy', 'accuracy_medium', 'accuracy_hard', 'avg_response_time', 'games_played', 'avg_session_duration']
X = df[numerical_features]
y = df['skill_level']

display(X.head())
display(y.head())

Unnamed: 0,accuracy_easy,accuracy_medium,accuracy_hard,avg_response_time,games_played,avg_session_duration
0,0.797,0.636,0.397,0.097667,-0.562069,-1.157247
1,0.74,0.546,0.4,1.244923,-0.70878,0.615031
2,0.617766,0.445416,0.300894,0.919455,-0.837783,-0.342893
3,0.787,0.65,0.434,-0.341517,-0.305324,-0.415363
4,0.847,0.536,0.363,1.486922,-0.818814,0.216612


0      intermedio
1    principiante
2    principiante
3      intermedio
4    principiante
Name: skill_level, dtype: object

# Verificar métricas

In [4]:
# Initialize StratifiedKFold
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store metrics for each fold
conf_matrices = []
fold_accuracy = []
fold_precision = []
fold_recall = []

class_labels = sorted(y.unique(), reverse=True)  # Para mantener el orden

# Iterate through the folds
for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f"Processing Fold {fold + 1}/{n_splits}")

    # Split data into fold-specific training and validation sets
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Initialize and train SVC model for the current fold
    # Use probability=True for potential future calibration steps
    svm_model_fold = SVC(random_state=42)
    svm_model_fold.fit(X_train_fold, y_train_fold)

    # Predict on the validation data for the current fold
    y_val_pred = svm_model_fold.predict(X_val_fold)

    # Confusion matrix for the current fold
    cm = confusion_matrix(y_val_fold, y_val_pred, labels=class_labels)
    conf_matrices.append(cm)

    # Calculate metrics for the current fold
    accuracy = accuracy_score(y_val_fold, y_val_pred)
    precision = precision_score(y_val_fold, y_val_pred, average=None, labels=class_labels, zero_division=0)
    recall = recall_score(y_val_fold, y_val_pred, average=None, labels=class_labels, zero_division=0)

    # Store metrics for the current fold
    fold_accuracy.append(accuracy)
    fold_precision.append(precision)
    fold_recall.append(recall)

Processing Fold 1/5
Processing Fold 2/5
Processing Fold 3/5
Processing Fold 4/5
Processing Fold 5/5


In [5]:
# Calculate mean and standard deviation of accuracy
mean_accuracy = np.mean(fold_accuracy)
std_accuracy = np.std(fold_accuracy)

# Classification error = 1 - accuracy
mean_error = 1 - mean_accuracy
std_error = std_accuracy 

# Calculate mean confusion matrix across all folds
mean_cm = np.mean(conf_matrices, axis=0).astype(int)

# Calculate mean precision and recall across all folds
mean_precision = np.mean(fold_precision, axis=0)
mean_recall = np.mean(fold_recall, axis=0)

In [6]:
# Create DataFrame for performance metrics
df_performance = pd.DataFrame({
    "Criterion": ["Accuracy", "Classification Error"],
    "Value": [f"{mean_accuracy * 100:.1f}%", f"{mean_error * 100:.1f}%"],
    "Standard Deviation": [f"± {std_accuracy * 100:.1f}%", f"± {std_error * 100:.1f}%"]
})

display(df_performance)

Unnamed: 0,Criterion,Value,Standard Deviation
0,Accuracy,98.7%,± 0.4%
1,Classification Error,1.3%,± 0.4%


In [7]:

# Create DataFrame for confusion matrix with precision and recall
df_conf = pd.DataFrame(mean_cm, 
                       index=[f"true {label}" for label in class_labels], 
                       columns=[f"pred {label}" for label in class_labels])

df_conf["class recall"] = [f"{r*100:.2f}%" for r in mean_recall]
df_conf.loc["class precision"] = [f"{p*100:.2f}%" for p in mean_precision] + [""]

display(df_conf)

Unnamed: 0,pred principiante,pred intermedio,pred experto,class recall
true principiante,89,0,0,99.12%
true intermedio,1,88,0,98.01%
true experto,0,1,89,98.90%
class precision,98.48%,98.24%,99.33%,


# Modelo de producción

In [8]:
svm = SVC(random_state=42)
svm.fit(X, y)

# Número de vectores de soporte por clase
print("VECTORES DE SOPORTE")
for clase, num_vecs in zip(svm.classes_, svm.n_support_):
    print(f"Clase {clase}: {num_vecs} vectores de soporte")


VECTORES DE SOPORTE
Clase experto: 49 vectores de soporte
Clase intermedio: 90 vectores de soporte
Clase principiante: 47 vectores de soporte
