In [74]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

**Split**

In [75]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# read dataset
df = pd.read_csv("training_data_75.csv")

# Target for classification is the threshold value
y = df["min_threshold"].astype(int).values

# Group (so all rows of same circuit stay together)
groups = df["file"].astype(str).values

# Drop columns we dont want as features
drop_cols = [
    "min_threshold",   # target
    "file",
    "family",
    "forward_runtime", # not for classification, only regression
    "max_fidelity_achieved",
    "forward_shots",
    "forward_peak_rss_mb",
    "n_thresholds_tested",
]
drop_cols = [c for c in drop_cols if c in df.columns]

# X is equal to the whole dataset - dropped columns
X = df.drop(columns=drop_cols).copy()

# encode categorical columns (backend/precision/etc.)
X = pd.get_dummies(X, columns=[c for c in X.columns if X[c].dtype == "object" or X[c].dtype == "str"])

# ---------------------------
# Stratified split BY FILE, stratified by n_qubits bucket
# ---------------------------

# 1) Build a file-level table for stratification
file_info = df.groupby("file", as_index=False).agg(
    n_qubits=("n_qubits", "first")
)

# Bucketize n_qubits so stratification is stable (avoids classes with only 1 file)
file_info["qubit_bucket"] = pd.cut(
    file_info["n_qubits"],
    bins=[-1, 20, 60, 10**9],
    labels=["small", "medium", "large"]
)

# 2) Optional: force rare-threshold files into TRAIN (helps avoid "unseen class 256")
forced_train_files = set(df.loc[df["min_threshold"] == 256, "file"].unique())

# Split only on remaining files (pool)
pool = file_info[~file_info["file"].isin(forced_train_files)].reset_index(drop=True)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_f_idx, test_f_idx = next(sss.split(pool["file"], pool["qubit_bucket"]))

train_files = set(pool.loc[train_f_idx, "file"])
test_files  = set(pool.loc[test_f_idx, "file"])

# Add forced files to train
train_files |= forced_train_files

# Convert file sets -> row indices
train_idx = df.index[df["file"].isin(train_files)].to_numpy()
test_idx  = df.index[df["file"].isin(test_files)].to_numpy()

# Final arrays
x_train = X.iloc[train_idx].values.astype(np.float32)
x_test  = X.iloc[test_idx].values.astype(np.float32)
y_train = y[train_idx]
y_test  = y[test_idx]

# sanity checks
print("Shapes:", x_train.shape, x_test.shape)
print("Train classes:", sorted(np.unique(y_train)))
print("Test classes:", sorted(np.unique(y_test)))

overlap = train_files.intersection(test_files)
print("Unique files train:", len(train_files), "test:", len(test_files), "overlap:", len(overlap))


Shapes: (102, 66) (35, 66)
Train classes: [np.int64(1), np.int64(2), np.int64(4), np.int64(8), np.int64(16), np.int64(64)]
Test classes: [np.int64(1), np.int64(2), np.int64(4)]
Unique files train: 27 test: 9 overlap: 0


**Metricas**

In [76]:
import numpy as np
from sklearn.metrics import accuracy_score

def cls_metrics(y_true, y_pred, name="model"):
    y_true = np.asarray(y_true).astype(int)
    y_pred = np.asarray(y_pred).astype(int)

    acc = accuracy_score(y_true, y_pred)
    under = np.mean(y_pred < y_true)   # super importante en tu reto
    over  = np.mean(y_pred > y_true)

    #print(f"{name}")
    #print("  Accuracy:", round(acc, 4))
    #print("  Under-rate (pred < true):", round(float(under), 4))
    #print("  Over-rate  (pred > true):", round(float(over), 4))
    
    return acc


In [77]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings

# Load fresh data to ensure we have everything
df = pd.read_csv("training_data_75.csv")
y = df["min_threshold"].astype(int).values
groups = df["file"].astype(str).values  # CRITICAL: group by file to prevent leakage

drop_cols = ["min_threshold", "file", "family", "forward_runtime", 
             "max_fidelity_achieved", "forward_shots", "forward_peak_rss_mb", "n_thresholds_tested"]
drop_cols = [c for c in drop_cols if c in df.columns]

X = df.drop(columns=drop_cols).copy()
X = pd.get_dummies(X, columns=X.select_dtypes(exclude=[np.number]).columns.tolist())
X_arr = X.values.astype(np.float32)

print(f"Data: {X_arr.shape[0]} samples, {X_arr.shape[1]} features")
print(f"Files: {len(np.unique(groups))} unique circuits (~{len(y)/len(np.unique(groups)):.1f} samples each)")
print(f"Classes: {sorted(np.unique(y))}")
print()

# Pipeline with scaling (important for SVM)
pipe = Pipeline([
    ("scaler", RobustScaler()),
    ("clf", LinearSVC(C=1.0, class_weight="balanced", random_state=42, max_iter=5000)),
])

# CORRECT: StratifiedGroupKFold - keeps all samples from same file together
min_class_count = min(np.bincount(y)[np.bincount(y) > 0])
n_splits = min(3, min_class_count)  # Can't have more splits than smallest class

print(f"Using StratifiedGroupKFold with {n_splits} splits (grouped by file)")
print()

sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)

fold_scores = []
all_true, all_pred = [], []

with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    
    for fold_idx, (train_idx, test_idx) in enumerate(sgkf.split(X_arr, y, groups)):
        X_train, X_test = X_arr[train_idx], X_arr[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        # Check no file overlap (sanity check)
        train_files = set(groups[train_idx])
        test_files = set(groups[test_idx])
        overlap = train_files & test_files
        
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        
        acc = accuracy_score(y_test, y_pred)
        fold_scores.append(acc)
        all_true.extend(y_test)
        all_pred.extend(y_pred)
        
        print(f"Fold {fold_idx+1}: Acc={acc:.4f}, Test classes={sorted(np.unique(y_test))}, "
              f"n_test={len(y_test)}, file_overlap={len(overlap)}")

print()
print(f"LinearSVC (PROPER GroupKFold):")
print(f"  Mean Accuracy: {np.mean(fold_scores):.4f} Â± {np.std(fold_scores):.4f}")

# Confusion matrix
classes = sorted(np.unique(y))
cm = confusion_matrix(all_true, all_pred, labels=classes)
print(f"\nConfusion Matrix:")
print(f"Classes: {classes}")
print(cm)

Data: 137 samples, 66 features
Files: 36 unique circuits (~3.8 samples each)
Classes: [np.int64(1), np.int64(2), np.int64(4), np.int64(8), np.int64(16), np.int64(64)]

Using StratifiedGroupKFold with 2 splits (grouped by file)

Fold 1: Acc=0.5352, Test classes=[np.int64(1), np.int64(2), np.int64(4), np.int64(8), np.int64(16)], n_test=71, file_overlap=0
Fold 2: Acc=0.5152, Test classes=[np.int64(1), np.int64(2), np.int64(4), np.int64(8), np.int64(16), np.int64(64)], n_test=66, file_overlap=0

LinearSVC (PROPER GroupKFold):
  Mean Accuracy: 0.5252 Â± 0.0100

Confusion Matrix:
Classes: [np.int64(1), np.int64(2), np.int64(4), np.int64(8), np.int64(16), np.int64(64)]
[[46  7  0  1  0  0]
 [ 6 26  0  2 12  0]
 [ 3  3  0  0  0  0]
 [ 0  8  8  0  1  0]
 [ 0  4  4  0  0  4]
 [ 0  0  0  0  2  0]]


## Feature Engineering

Create domain-specific features to improve model accuracy.

In [78]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import RobustScaler
from catboost import CatBoostClassifier
import warnings

# Load data
df = pd.read_csv("training_data_75.csv")

def engineer_features(df):
    """Create domain-specific features for quantum circuit threshold prediction."""
    X = df.copy()
    
    # ============================================
    # 1. INTERACTION FEATURES (top correlated features combined)
    # ============================================
    
    # Qubit degree interactions (avg_qubit_degree is top predictor)
    X['degree_x_qubits'] = X['avg_qubit_degree'] * X['n_qubits']
    X['degree_x_depth'] = X['avg_qubit_degree'] * X['crude_depth']
    X['degree_x_2q'] = X['avg_qubit_degree'] * X['n_2q_gates']
    
    # Entanglement complexity
    X['entanglement_complexity'] = X['n_unique_edges'] * X['avg_qubit_degree']
    X['entanglement_per_qubit'] = X['n_unique_edges'] / (X['n_qubits'] + 1)
    
    # ============================================
    # 2. RATIO FEATURES (relationships between properties)
    # ============================================
    
    # Gate composition ratios
    X['cx_ratio'] = X['n_cx'] / (X['n_total_gates'] + 1)
    X['rotation_ratio'] = X['n_rotation_gates'] / (X['n_total_gates'] + 1)
    X['multi_qubit_ratio'] = (X['n_2q_gates'] + X['n_3q_gates']) / (X['n_total_gates'] + 1)
    
    # Depth-related ratios
    X['gates_per_depth'] = X['n_total_gates'] / (X['crude_depth'] + 1)
    X['depth_per_qubit'] = X['crude_depth'] / (X['n_qubits'] + 1)
    
    # Connectivity ratios
    X['edge_density'] = X['n_unique_edges'] / (X['n_qubits'] * (X['n_qubits'] - 1) / 2 + 1)
    X['edge_repetition_ratio'] = X['n_edge_repetitions'] / (X['n_unique_edges'] + 1)
    
    # ============================================
    # 3. POLYNOMIAL FEATURES (non-linear relationships)
    # ============================================
    
    X['degree_squared'] = X['avg_qubit_degree'] ** 2
    X['qubits_squared'] = X['n_qubits'] ** 2
    X['depth_squared'] = X['crude_depth'] ** 2
    X['log_qubits'] = np.log1p(X['n_qubits'])
    X['log_depth'] = np.log1p(X['crude_depth'])
    X['log_gates'] = np.log1p(X['n_total_gates'])
    
    # ============================================
    # 4. CIRCUIT COMPLEXITY SCORES
    # ============================================
    
    # Overall complexity score
    X['complexity_score'] = (
        X['n_qubits'] * X['crude_depth'] * X['avg_qubit_degree'] / 1000
    )
    
    # Entanglement burden
    X['entanglement_burden'] = (
        X['n_2q_gates'] * X['avg_qubit_degree'] / (X['n_qubits'] + 1)
    )
    
    # Simulation difficulty proxy
    X['sim_difficulty'] = (
        X['n_qubits'] ** 1.5 * X['entanglement_pressure']
    )
    
    # ============================================
    # 5. PATTERN-BASED FEATURES
    # ============================================
    
    # Combined pattern indicator
    X['n_patterns'] = (
        X['has_qft_pattern'] + X['has_iqft_pattern'] + 
        X['has_grover_pattern'] + X['has_variational_pattern'] + X['has_ghz_pattern']
    )
    
    # Variational circuit complexity
    X['variational_complexity'] = X['has_variational_pattern'] * X['n_rotation_gates']
    
    return X

# Apply feature engineering
X_eng = engineer_features(df)

# Drop non-feature columns
drop_cols = ["min_threshold", "file", "family", "forward_runtime", 
             "max_fidelity_achieved", "forward_shots", "forward_peak_rss_mb", "n_thresholds_tested"]
drop_cols = [c for c in drop_cols if c in X_eng.columns]
X_eng = X_eng.drop(columns=drop_cols)

# One-hot encode categoricals
cat_cols = X_eng.select_dtypes(exclude=[np.number]).columns.tolist()
X_eng = pd.get_dummies(X_eng, columns=cat_cols)

print(f"Original features: 64")
print(f"Engineered features: {X_eng.shape[1]}")
print(f"New features added: {X_eng.shape[1] - 64}")
print()

# Show new feature correlations
y = df["min_threshold"].values
new_feat_cols = [c for c in X_eng.columns if c not in df.columns]
print("New features correlation with target:")
for col in new_feat_cols[:15]:
    if col in X_eng.columns:
        corr = np.corrcoef(X_eng[col].values, y)[0, 1]
        print(f"  {col:30s}: {corr:+.3f}")

Original features: 64
Engineered features: 89
New features added: 25

New features correlation with target:
  degree_x_qubits               : +0.747
  degree_x_depth                : -0.074
  degree_x_2q                   : -0.059
  entanglement_complexity       : +0.928
  entanglement_per_qubit        : +0.757
  cx_ratio                      : +0.144
  rotation_ratio                : +0.046
  multi_qubit_ratio             : -0.013
  gates_per_depth               : -0.094
  depth_per_qubit               : -0.091
  edge_density                  : +0.471
  edge_repetition_ratio         : -0.102
  degree_squared                : +0.897
  qubits_squared                : -0.146
  depth_squared                 : -0.086


In [79]:
# Evaluate with engineered features - Compare multiple models
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier

y_raw = df["min_threshold"].astype(int).values
groups = df["file"].astype(str).values
X_arr = X_eng.values.astype(np.float32)

# Handle any NaN/inf from division
X_arr = np.nan_to_num(X_arr, nan=0.0, posinf=0.0, neginf=0.0)

# Encode labels for XGBoost (needs 0, 1, 2, ... not 1, 2, 4, 8, ...)
le = LabelEncoder()
y = le.fit_transform(y_raw)
print(f"Label mapping: {dict(zip(le.classes_, range(len(le.classes_))))}")
print()

min_class = min(np.bincount(y)[np.bincount(y) > 0])
n_splits = min(3, min_class)

sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Models to compare
models = {
    "CatBoost": CatBoostClassifier(
        iterations=500, depth=6, learning_rate=0.05,
        random_seed=42, verbose=False
    ),
    "XGBoost": XGBClassifier(
        n_estimators=500, max_depth=6, learning_rate=0.05,
        random_state=42, verbosity=0
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=500, max_depth=6, learning_rate=0.05,
        random_state=42, verbose=-1
    ),
    "HistGradientBoosting": HistGradientBoostingClassifier(
        max_iter=500, max_depth=6, learning_rate=0.05,
        random_state=42
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=500, max_depth=10, min_samples_leaf=2,
        class_weight='balanced', random_state=42, n_jobs=-1
    ),
}

print(f"Evaluating models with {X_arr.shape[1]} engineered features...")
print()

results = {}

with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    
    for name, clf in models.items():
        fold_scores = []
        
        for fold_idx, (train_idx, test_idx) in enumerate(sgkf.split(X_arr, y, groups)):
            # Clone the model
            clf_fold = clf.__class__(**clf.get_params())
            clf_fold.fit(X_arr[train_idx], y[train_idx])
            y_pred = clf_fold.predict(X_arr[test_idx])
            
            acc = accuracy_score(y[test_idx], y_pred)
            fold_scores.append(acc)
        
        mean_acc = np.mean(fold_scores)
        std_acc = np.std(fold_scores)
        results[name] = {"mean": mean_acc, "std": std_acc, "folds": fold_scores}
        print(f"  {name:25s}: {mean_acc:.4f} Â± {std_acc:.4f}")

# Sort by accuracy
print()
print("=" * 50)
print("RANKED RESULTS (with engineered features):")
print("=" * 50)
for name, res in sorted(results.items(), key=lambda x: -x[1]["mean"]):
    marker = "ðŸ‘‘" if res["mean"] == max(r["mean"] for r in results.values()) else "  "
    print(f"{marker} {name:25s}: {res['mean']:.4f} Â± {res['std']:.4f}")

print()
print(f"Baseline (CatBoost without engineering): ~0.6566")
best_name = max(results.items(), key=lambda x: x[1]["mean"])[0]
best_acc = results[best_name]["mean"]
print(f"Best improvement: {(best_acc - 0.6566)*100:+.1f}%")

Label mapping: {np.int64(1): 0, np.int64(2): 1, np.int64(4): 2, np.int64(8): 3, np.int64(16): 4, np.int64(64): 5}

Evaluating models with 89 engineered features...

  CatBoost                 : 0.5467 Â± 0.1805
  XGBoost                  : 0.4840 Â± 0.0615
  LightGBM                 : 0.5079 Â± 0.0837
  HistGradientBoosting     : 0.5425 Â± 0.0636
  RandomForest             : 0.7200 Â± 0.1285

RANKED RESULTS (with engineered features):
ðŸ‘‘ RandomForest             : 0.7200 Â± 0.1285
   CatBoost                 : 0.5467 Â± 0.1805
   HistGradientBoosting     : 0.5425 Â± 0.0636
   LightGBM                 : 0.5079 Â± 0.0837
   XGBoost                  : 0.4840 Â± 0.0615

Baseline (CatBoost without engineering): ~0.6566
Best improvement: +6.3%


In [80]:
# Try feature selection - keep only the most important features
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

# Train a CatBoost model to get feature importances
clf_importance = CatBoostClassifier(
    iterations=500, depth=6, learning_rate=0.05,
    random_seed=42, verbose=False
)
clf_importance.fit(X_arr, y)

feature_importance = clf_importance.get_feature_importance()
feature_names = X_eng.columns.tolist()

# Sort by importance
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("Top 20 most important features:")
for idx, row in importance_df.head(20).iterrows():
    print(f"  {row['feature']:35s}: {row['importance']:.2f}")

# Try with only top K features
print()
print("Feature selection results:")
for k in [10, 20, 30, 40]:
    top_features = importance_df.head(k)['feature'].tolist()
    X_selected = X_eng[top_features].values.astype(np.float32)
    X_selected = np.nan_to_num(X_selected, nan=0.0, posinf=0.0, neginf=0.0)
    
    fold_scores_sel = []
    for train_idx, test_idx in sgkf.split(X_selected, y, groups):
        clf_sel = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.05, random_seed=42, verbose=False)
        clf_sel.fit(X_selected[train_idx], y[train_idx])
        y_pred = clf_sel.predict(X_selected[test_idx])
        fold_scores_sel.append(accuracy_score(y[test_idx], y_pred))
    
    print(f"  Top {k} features: {np.mean(fold_scores_sel):.4f} Â± {np.std(fold_scores_sel):.4f}")

Top 20 most important features:
  n_h                                : 7.83
  avg_gate_span                      : 6.95
  cx_ratio                           : 6.12
  n_cx                               : 4.72
  n_u2                               : 4.43
  crude_depth                        : 4.12
  std_gate_span                      : 3.20
  ratio_1q_gates                     : 2.90
  1q_gates_per_qubit                 : 2.83
  depth_squared                      : 2.38
  gates_per_layer_estimate           : 2.17
  entanglement_per_qubit             : 2.15
  midpoint_cut_crossings             : 2.09
  circuit_density                    : 1.92
  n_edge_repetitions                 : 1.90
  qubits_squared                     : 1.78
  degree_squared                     : 1.74
  qubit_degree_std                   : 1.66
  log_qubits                         : 1.66
  n_qubits                           : 1.55

Feature selection results:
  Top 10 features: 0.7439 Â± 0.0166
  Top 20 features: 0.687

In [81]:
# Proper K-Fold evaluation of RandomForest with top 10 features
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Get top 10 features by importance
top_k = 10
top_features = importance_df.head(top_k)['feature'].tolist()
X_top = X_eng[top_features].values.astype(np.float32)
X_top = np.nan_to_num(X_top, nan=0.0, posinf=0.0, neginf=0.0)

print(f"Top {top_k} features:")
for i, feat in enumerate(top_features, 1):
    print(f"  {i:2d}. {feat}")
print()

# Use maximum possible splits based on smallest class
min_class = min(np.bincount(y)[np.bincount(y) > 0])
n_splits = min(5, min_class)  # Try to get more folds if possible

sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)

fold_scores = []
all_true, all_pred = [], []

print(f"Running {n_splits}-fold StratifiedGroupKFold (grouped by file)...")
print()

for fold_idx, (train_idx, test_idx) in enumerate(sgkf.split(X_top, y, groups)):
    # Verify no file overlap
    train_files = set(groups[train_idx])
    test_files = set(groups[test_idx])
    overlap = len(train_files & test_files)
    
    clf = RandomForestClassifier(
        n_estimators=500, 
        max_depth=10, 
        min_samples_leaf=2,
        class_weight='balanced', 
        random_state=42, 
        n_jobs=-1
    )
    clf.fit(X_top[train_idx], y[train_idx])
    y_pred = clf.predict(X_top[test_idx])
    
    acc = accuracy_score(y[test_idx], y_pred)
    fold_scores.append(acc)
    all_true.extend(y[test_idx])
    all_pred.extend(y_pred)
    
    test_classes = sorted(np.unique(y[test_idx]))
    print(f"  Fold {fold_idx+1}: Acc={acc:.4f}, n_test={len(test_idx)}, file_overlap={overlap}, classes={test_classes}")

mean_acc = np.mean(fold_scores)
std_acc = np.std(fold_scores)

print()
print("=" * 60)
print(f"RandomForest + Top {top_k} Features - FINAL RESULTS")
print("=" * 60)
print(f"  Mean Accuracy: {mean_acc:.4f} Â± {std_acc:.4f}")
print(f"  Per-fold: {[round(s, 4) for s in fold_scores]}")
print()
print(f"  Baseline (CatBoost, all features): ~0.6566")
print(f"  Improvement: {(mean_acc - 0.6566)*100:+.1f}%")
print()

# Confusion matrix with original labels
cm = confusion_matrix(all_true, all_pred)
print("Confusion Matrix:")
print(f"Classes (encoded): {sorted(np.unique(y))}")
print(f"Classes (original): {list(le.classes_)}")
print(cm)

print()
print("Classification Report:")
print(classification_report(all_true, all_pred, target_names=[str(c) for c in le.classes_]))

Top 10 features:
   1. n_h
   2. avg_gate_span
   3. cx_ratio
   4. n_cx
   5. n_u2
   6. crude_depth
   7. std_gate_span
   8. ratio_1q_gates
   9. 1q_gates_per_qubit
  10. depth_squared

Running 2-fold StratifiedGroupKFold (grouped by file)...

  Fold 1: Acc=0.7042, n_test=71, file_overlap=0, classes=[np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
  Fold 2: Acc=0.6667, n_test=66, file_overlap=0, classes=[np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5)]

RandomForest + Top 10 Features - FINAL RESULTS
  Mean Accuracy: 0.6854 Â± 0.0188
  Per-fold: [0.7042, 0.6667]

  Baseline (CatBoost, all features): ~0.6566
  Improvement: +2.9%

Confusion Matrix:
Classes (encoded): [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5)]
Classes (original): [np.int64(1), np.int64(2), np.int64(4), np.int64(8), np.int64(16), np.int64(64)]
[[50  0  0  4  0  0]
 [ 6 36  4  0  0  0]
 [ 3  0  0  3  0  0]
 [ 5  4  0  8  0  0]
 [ 0  0  0  8 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Optimizing for the Actual Scoring Function

The competition scoring is asymmetric:
- **Underpredict (pred < true)**: 0 points (catastrophic!)
- **Exact match**: 1 point
- **Overpredict (pred > true)**: `true/pred` points (e.g., 0.5 for 1 step too high)

Strategy: **When uncertain, predict higher to avoid zeros.**

In [85]:
# Evaluate with ACTUAL competition scoring function
import numpy as np

def competition_score(y_true, y_pred):
    """
    Calculate competition score.
    - Underpredict (pred < true): 0 points
    - Exact match: 1 point  
    - Overpredict (pred > true): true/pred points
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    scores = np.zeros(len(y_true))
    
    # Exact match
    exact = y_pred == y_true
    scores[exact] = 1.0
    
    # Overpredict (partial credit)
    over = y_pred > y_true
    scores[over] = y_true[over] / y_pred[over]
    
    # Underpredict (zero points) - already 0
    under = y_pred < y_true
    # scores[under] = 0  # already initialized to 0
    
    return scores

def evaluate_with_competition_score(y_true, y_pred, y_pred_proba=None, threshold_classes=None):
    """Detailed evaluation with competition scoring."""
    scores = competition_score(y_true, y_pred)
    
    n = len(y_true)
    exact = np.sum(y_pred == y_true)
    under = np.sum(y_pred < y_true)
    over = np.sum(y_pred > y_true)
    
    print(f"Competition Score: {scores.sum():.2f} / {n} = {scores.mean():.4f} per sample")
    print(f"  Exact matches:   {exact:3d} ({100*exact/n:5.1f}%) â†’ {exact:.1f} points")
    print(f"  Underpredictions:{under:3d} ({100*under/n:5.1f}%) â†’ 0.0 points (LOST)")
    print(f"  Overpredictions: {over:3d} ({100*over/n:5.1f}%) â†’ {scores[y_pred > y_true].sum():.1f} points")
    
    return scores.mean(), scores.sum()

# Test on our best model with actual scoring
print("=" * 60)
print("EVALUATION WITH COMPETITION SCORING")
print("=" * 60)
print()

# Original predictions (no bias)
print("Strategy 1: Raw model predictions (no adjustment)")
print("-" * 50)
score_raw, total_raw = evaluate_with_competition_score(
    le.inverse_transform(all_true), 
    le.inverse_transform(all_pred)
)
print()

# Now let's try biased predictions using predict_proba
print("Strategy 2: Bias toward higher thresholds when uncertain")
print("-" * 50)

# Re-run with probability-based adjustment
fold_scores_biased = []
all_true_biased, all_pred_biased = [], []

for fold_idx, (train_idx, test_idx) in enumerate(sgkf.split(X_top, y, groups)):
    clf = RandomForestClassifier(
        n_estimators=500, max_depth=10, min_samples_leaf=2,
        class_weight='balanced', random_state=42, n_jobs=-1
    )
    clf.fit(X_top[train_idx], y[train_idx])
    
    # Get probabilities
    proba = clf.predict_proba(X_top[test_idx])
    classes = clf.classes_
    
    # Biased prediction: if not confident, pick higher threshold
    y_pred_biased = []
    for p in proba:
        max_prob = p.max()
        max_class = classes[p.argmax()]
        
        if max_prob < 0.5:  # Not confident
            # Pick the highest class with reasonable probability
            # Weight toward higher thresholds
            weighted_probs = p.copy()
            for i, c in enumerate(classes):
                # Boost probability of higher classes
                weighted_probs[i] *= (1 + 0.3 * i)  # Higher index = higher threshold
            y_pred_biased.append(classes[weighted_probs.argmax()])
        else:
            y_pred_biased.append(max_class)
    
    y_pred_biased = np.array(y_pred_biased)
    all_true_biased.extend(y[test_idx])
    all_pred_biased.extend(y_pred_biased)

score_biased, total_biased = evaluate_with_competition_score(
    le.inverse_transform(all_true_biased),
    le.inverse_transform(all_pred_biased)
)
print()

print("Strategy 3: Always predict 1 step higher (conservative)")
print("-" * 50)
# Shift all predictions up by 1 class
all_pred_shifted = np.array(all_pred).copy()
all_pred_shifted = np.minimum(all_pred_shifted + 1, len(le.classes_) - 1)  # Cap at max class

score_shifted, total_shifted = evaluate_with_competition_score(
    le.inverse_transform(all_true),
    le.inverse_transform(all_pred_shifted)
)
print()

print("=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"  Raw predictions:      {score_raw:.4f} per sample ({total_raw:.1f} total)")
print(f"  Biased (uncertain):   {score_biased:.4f} per sample ({total_biased:.1f} total)")
print(f"  Always +1 step:       {score_shifted:.4f} per sample ({total_shifted:.1f} total)")
print()
print(f"  Best strategy: ", end="")
best = max([(score_raw, "Raw"), (score_biased, "Biased"), (score_shifted, "+1 Step")])
print(f"{best[1]} with {best[0]:.4f} per sample")

EVALUATION WITH COMPETITION SCORING

Strategy 1: Raw model predictions (no adjustment)
--------------------------------------------------
Competition Score: 99.00 / 137 = 0.7226 per sample
  Exact matches:    94 ( 68.6%) â†’ 94.0 points
  Underpredictions: 28 ( 20.4%) â†’ 0.0 points (LOST)
  Overpredictions:  15 ( 10.9%) â†’ 5.0 points

Strategy 2: Bias toward higher thresholds when uncertain
--------------------------------------------------
Competition Score: 92.00 / 137 = 0.6715 per sample
  Exact matches:    82 ( 59.9%) â†’ 82.0 points
  Underpredictions: 20 ( 14.6%) â†’ 0.0 points (LOST)
  Overpredictions:  35 ( 25.5%) â†’ 10.0 points

Strategy 3: Always predict 1 step higher (conservative)
--------------------------------------------------
Competition Score: 64.00 / 137 = 0.4672 per sample
  Exact matches:    14 ( 10.2%) â†’ 14.0 points
  Underpredictions: 14 ( 10.2%) â†’ 0.0 points (LOST)
  Overpredictions: 109 ( 79.6%) â†’ 50.0 points

SUMMARY
  Raw predictions:      0.7226 per

In [87]:
# Analyze the underpredictions - where is the model failing?
print("=" * 60)
print("ANALYZING UNDERPREDICTIONS (the 28 samples costing us points)")
print("=" * 60)
print()

y_true_orig = le.inverse_transform(all_true)
y_pred_orig = le.inverse_transform(all_pred)

under_mask = y_pred_orig < y_true_orig
over_mask = y_pred_orig > y_true_orig
exact_mask = y_pred_orig == y_true_orig

print("Underprediction breakdown (pred < true):")
print("-" * 40)
under_true = y_true_orig[under_mask]
under_pred = y_pred_orig[under_mask]

for true_val in sorted(np.unique(under_true)):
    mask = under_true == true_val
    preds = under_pred[mask]
    print(f"  True={true_val:3d}: predicted as {sorted(preds)} ({len(preds)} cases)")

print()
print("Pattern analysis:")
print("-" * 40)

# How far off are the underpredictions?
under_steps = np.log2(y_true_orig[under_mask]) - np.log2(y_pred_orig[under_mask])
print(f"  Underpredictions off by 1 step: {np.sum(under_steps == 1)}")
print(f"  Underpredictions off by 2 steps: {np.sum(under_steps == 2)}")
print(f"  Underpredictions off by 3+ steps: {np.sum(under_steps >= 3)}")

print()
print("If we could fix just the '1 step off' underpredictions:")
one_step_under = np.sum(under_steps == 1)
potential_gain = one_step_under  # Each becomes 1 point instead of 0
print(f"  Potential gain: +{potential_gain} points")
print(f"  New score: {99 + potential_gain} / 137 = {(99 + potential_gain)/137:.4f}")

# What about class-specific accuracy?
print()
print("Accuracy by true class:")
print("-" * 40)
for cls in sorted(np.unique(y_true_orig)):
    mask = y_true_orig == cls
    cls_acc = np.mean(y_pred_orig[mask] == y_true_orig[mask])
    cls_under = np.mean(y_pred_orig[mask] < y_true_orig[mask])
    cls_over = np.mean(y_pred_orig[mask] > y_true_orig[mask])
    n = np.sum(mask)
    print(f"  Class {cls:3d}: Acc={cls_acc:.1%}, Under={cls_under:.1%}, Over={cls_over:.1%} (n={n})")

ANALYZING UNDERPREDICTIONS (the 28 samples costing us points)

Underprediction breakdown (pred < true):
----------------------------------------
  True=  2: predicted as [np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1)] (6 cases)
  True=  4: predicted as [np.int64(1), np.int64(1), np.int64(1)] (3 cases)
  True=  8: predicted as [np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(2), np.int64(2), np.int64(2), np.int64(2)] (9 cases)
  True= 16: predicted as [np.int64(8), np.int64(8), np.int64(8), np.int64(8), np.int64(8), np.int64(8), np.int64(8), np.int64(8)] (8 cases)
  True= 64: predicted as [np.int64(8), np.int64(8)] (2 cases)

Pattern analysis:
----------------------------------------
  Underpredictions off by 1 step: 14
  Underpredictions off by 2 steps: 7
  Underpredictions off by 3+ steps: 7

If we could fix just the '1 step off' underpredictions:
  Potential gain: +14 points
  New score: 113 / 137 = 0.8248

Accuracy by true cla