In [88]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

**Split**

In [89]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# read dataset
df = pd.read_csv("training_data_75.csv")

# Target for classification is the threshold value
y = df["min_threshold"].astype(int).values

# Group (so all rows of same circuit stay together)
groups = df["file"].astype(str).values

# Drop columns we dont want as features
drop_cols = [
    "min_threshold",   # target
    "file",
    "family",
    "forward_runtime", # not for classification, only regression
    "max_fidelity_achieved",
    "forward_shots",
    "forward_peak_rss_mb",
    "n_thresholds_tested",
]
drop_cols = [c for c in drop_cols if c in df.columns]

# X is equal to the whole dataset - dropped columns
X = df.drop(columns=drop_cols).copy()

# encode categorical columns (backend/precision/etc.)
X = pd.get_dummies(X, columns=[c for c in X.columns if X[c].dtype == "object" or X[c].dtype == "str"])

# ---------------------------
# Stratified split BY FILE, stratified by n_qubits bucket
# ---------------------------

# 1) Build a file-level table for stratification
file_info = df.groupby("file", as_index=False).agg(
    n_qubits=("n_qubits", "first")
)

# Bucketize n_qubits so stratification is stable (avoids classes with only 1 file)
file_info["qubit_bucket"] = pd.cut(
    file_info["n_qubits"],
    bins=[-1, 20, 60, 10**9],
    labels=["small", "medium", "large"]
)

# 2) Optional: force rare-threshold files into TRAIN (helps avoid "unseen class 256")
forced_train_files = set(df.loc[df["min_threshold"] == 256, "file"].unique())

# Split only on remaining files (pool)
pool = file_info[~file_info["file"].isin(forced_train_files)].reset_index(drop=True)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_f_idx, test_f_idx = next(sss.split(pool["file"], pool["qubit_bucket"]))

train_files = set(pool.loc[train_f_idx, "file"])
test_files  = set(pool.loc[test_f_idx, "file"])

# Add forced files to train
train_files |= forced_train_files

# Convert file sets -> row indices
train_idx = df.index[df["file"].isin(train_files)].to_numpy()
test_idx  = df.index[df["file"].isin(test_files)].to_numpy()

# Final arrays
x_train = X.iloc[train_idx].values.astype(np.float32)
x_test  = X.iloc[test_idx].values.astype(np.float32)
y_train = y[train_idx]
y_test  = y[test_idx]

# sanity checks
print("Shapes:", x_train.shape, x_test.shape)
print("Train classes:", sorted(np.unique(y_train)))
print("Test classes:", sorted(np.unique(y_test)))

overlap = train_files.intersection(test_files)
print("Unique files train:", len(train_files), "test:", len(test_files), "overlap:", len(overlap))


Shapes: (102, 66) (35, 66)
Train classes: [np.int64(1), np.int64(2), np.int64(4), np.int64(8), np.int64(16), np.int64(64)]
Test classes: [np.int64(1), np.int64(2), np.int64(4)]
Unique files train: 27 test: 9 overlap: 0


**Metricas**

In [90]:
import numpy as np
from sklearn.metrics import accuracy_score

def cls_metrics(y_true, y_pred, name="model"):
    y_true = np.asarray(y_true).astype(int)
    y_pred = np.asarray(y_pred).astype(int)

    acc = accuracy_score(y_true, y_pred)
    under = np.mean(y_pred < y_true)   # super importante en tu reto
    over  = np.mean(y_pred > y_true)

    #print(f"{name}")
    #print("  Accuracy:", round(acc, 4))
    #print("  Under-rate (pred < true):", round(float(under), 4))
    #print("  Over-rate  (pred > true):", round(float(over), 4))
    
    return acc


In [96]:
# =============================================================================
# FINAL CONSOLIDATED MODEL - MINIMIZING UNDERPREDICTIONS
# =============================================================================
# Goal: Minimize underprediction risk (close to 0%) even if we overpredict more
# Since we're tested on ONE scenario, a single underprediction = catastrophic 0 points
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")

# -----------------------------------------------------------------------------
# 1. LOAD DATA AND FEATURE ENGINEERING
# -----------------------------------------------------------------------------
df = pd.read_csv("training_data_75.csv")

def engineer_features(df):
    """Create domain-specific features for quantum circuit threshold prediction."""
    X = df.copy()
    
    # Interaction features
    X['degree_x_qubits'] = X['avg_qubit_degree'] * X['n_qubits']
    X['degree_x_depth'] = X['avg_qubit_degree'] * X['crude_depth']
    X['degree_x_2q'] = X['avg_qubit_degree'] * X['n_2q_gates']
    X['entanglement_complexity'] = X['n_unique_edges'] * X['avg_qubit_degree']
    X['entanglement_per_qubit'] = X['n_unique_edges'] / (X['n_qubits'] + 1)
    
    # Ratio features
    X['cx_ratio'] = X['n_cx'] / (X['n_total_gates'] + 1)
    X['rotation_ratio'] = X['n_rotation_gates'] / (X['n_total_gates'] + 1)
    X['multi_qubit_ratio'] = (X['n_2q_gates'] + X['n_3q_gates']) / (X['n_total_gates'] + 1)
    X['gates_per_depth'] = X['n_total_gates'] / (X['crude_depth'] + 1)
    X['depth_per_qubit'] = X['crude_depth'] / (X['n_qubits'] + 1)
    X['edge_density'] = X['n_unique_edges'] / (X['n_qubits'] * (X['n_qubits'] - 1) / 2 + 1)
    X['edge_repetition_ratio'] = X['n_edge_repetitions'] / (X['n_unique_edges'] + 1)
    
    # Polynomial features
    X['degree_squared'] = X['avg_qubit_degree'] ** 2
    X['qubits_squared'] = X['n_qubits'] ** 2
    X['depth_squared'] = X['crude_depth'] ** 2
    X['log_qubits'] = np.log1p(X['n_qubits'])
    X['log_depth'] = np.log1p(X['crude_depth'])
    X['log_gates'] = np.log1p(X['n_total_gates'])
    
    # Complexity scores
    X['complexity_score'] = X['n_qubits'] * X['crude_depth'] * X['avg_qubit_degree'] / 1000
    X['entanglement_burden'] = X['n_2q_gates'] * X['avg_qubit_degree'] / (X['n_qubits'] + 1)
    X['sim_difficulty'] = X['n_qubits'] ** 1.5 * X['entanglement_pressure']
    
    # Pattern features
    X['n_patterns'] = (X['has_qft_pattern'] + X['has_iqft_pattern'] + 
                       X['has_grover_pattern'] + X['has_variational_pattern'] + X['has_ghz_pattern'])
    X['variational_complexity'] = X['has_variational_pattern'] * X['n_rotation_gates']
    
    return X

X_eng = engineer_features(df)

# Drop non-feature columns
drop_cols = ["min_threshold", "file", "family", "forward_runtime", 
             "max_fidelity_achieved", "forward_shots", "forward_peak_rss_mb", "n_thresholds_tested"]
drop_cols = [c for c in drop_cols if c in X_eng.columns]
X_eng = X_eng.drop(columns=drop_cols)

# One-hot encode categoricals
cat_cols = X_eng.select_dtypes(exclude=[np.number]).columns.tolist()
X_eng = pd.get_dummies(X_eng, columns=cat_cols)

y_raw = df["min_threshold"].astype(int).values
groups = df["file"].astype(str).values

# Encode labels
le = LabelEncoder()
y = le.fit_transform(y_raw)
threshold_classes = le.classes_  # [1, 2, 4, 8, 16, 64]

print("=" * 70)
print("FINAL MODEL: RandomForest + Top 10 Features + Conservative Prediction")
print("=" * 70)
print(f"Threshold classes: {list(threshold_classes)}")
print()

# -----------------------------------------------------------------------------
# 2. FEATURE SELECTION (Top 10 by CatBoost importance)
# -----------------------------------------------------------------------------
X_arr = X_eng.values.astype(np.float32)
X_arr = np.nan_to_num(X_arr, nan=0.0, posinf=0.0, neginf=0.0)

clf_importance = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.05, random_seed=42, verbose=False)
clf_importance.fit(X_arr, y)

feature_importance = clf_importance.get_feature_importance()
importance_df = pd.DataFrame({
    'feature': X_eng.columns.tolist(),
    'importance': feature_importance
}).sort_values('importance', ascending=False)

top_k = 10
top_features = importance_df.head(top_k)['feature'].tolist()
X_top = X_eng[top_features].values.astype(np.float32)
X_top = np.nan_to_num(X_top, nan=0.0, posinf=0.0, neginf=0.0)

print(f"Top {top_k} features (by CatBoost importance):")
for i, feat in enumerate(top_features, 1):
    print(f"  {i:2d}. {feat}")
print()

# -----------------------------------------------------------------------------
# 3. COMPETITION SCORING FUNCTION
# -----------------------------------------------------------------------------
def competition_score(y_true, y_pred):
    """Calculate competition score (underpredict=0, exact=1, overpredict=partial)"""
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    scores = np.zeros(len(y_true))
    scores[y_pred == y_true] = 1.0  # Exact match
    over = y_pred > y_true
    scores[over] = y_true[over] / y_pred[over]  # Partial credit
    return scores

def evaluate_strategy(y_true, y_pred, name=""):
    """Evaluate with competition scoring"""
    scores = competition_score(y_true, y_pred)
    n = len(y_true)
    exact = np.sum(y_pred == y_true)
    under = np.sum(y_pred < y_true)
    over = np.sum(y_pred > y_true)
    
    print(f"{name}")
    print(f"  Score: {scores.sum():.1f}/{n} = {scores.mean():.4f} per sample")
    print(f"  Exact: {exact:3d} ({100*exact/n:5.1f}%) â†’ {exact:.0f} pts")
    print(f"  Under: {under:3d} ({100*under/n:5.1f}%) â†’ 0 pts [RISK!]")
    print(f"  Over:  {over:3d} ({100*over/n:5.1f}%) â†’ {scores[y_pred > y_true].sum():.1f} pts")
    return scores.mean(), under

# -----------------------------------------------------------------------------
# 4. CONSERVATIVE PREDICTION FUNCTION
# -----------------------------------------------------------------------------
def conservative_predict(clf, X, threshold_classes, confidence_threshold=0.6, bump_steps=1):
    """
    Make conservative predictions that minimize underprediction risk.
    
    When model confidence is low, bump prediction up to higher threshold.
    This trades off some exact matches for avoiding catastrophic underpredictions.
    """
    proba = clf.predict_proba(X)
    classes = clf.classes_
    
    predictions = []
    for p in proba:
        max_prob = p.max()
        max_class_idx = p.argmax()
        max_class = classes[max_class_idx]
        
        if max_prob < confidence_threshold:
            # Not confident - bump up by bump_steps to be safe
            new_idx = min(max_class_idx + bump_steps, len(classes) - 1)
            predictions.append(classes[new_idx])
        else:
            predictions.append(max_class)
    
    return np.array(predictions)

# -----------------------------------------------------------------------------
# 5. K-FOLD EVALUATION WITH MULTIPLE STRATEGIES
# -----------------------------------------------------------------------------
min_class = min(np.bincount(y)[np.bincount(y) > 0])
n_splits = min(3, min_class)
sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)

print(f"Evaluation: {n_splits}-fold StratifiedGroupKFold (grouped by file)")
print()

# Track results for different strategies
strategies = {
    'raw': {'true': [], 'pred': []},
    'conservative_60': {'true': [], 'pred': []},
    'conservative_50': {'true': [], 'pred': []},
    'conservative_40': {'true': [], 'pred': []},
}

for fold_idx, (train_idx, test_idx) in enumerate(sgkf.split(X_top, y, groups)):
    clf = RandomForestClassifier(
        n_estimators=500, max_depth=10, min_samples_leaf=2,
        class_weight='balanced', random_state=42, n_jobs=-1
    )
    clf.fit(X_top[train_idx], y[train_idx])
    
    y_test = y[test_idx]
    
    # Raw prediction
    y_raw_pred = clf.predict(X_top[test_idx])
    strategies['raw']['true'].extend(y_test)
    strategies['raw']['pred'].extend(y_raw_pred)
    
    # Conservative predictions with different confidence thresholds
    for thresh, key in [(0.6, 'conservative_60'), (0.5, 'conservative_50'), (0.4, 'conservative_40')]:
        y_cons = conservative_predict(clf, X_top[test_idx], threshold_classes, 
                                       confidence_threshold=thresh, bump_steps=1)
        strategies[key]['true'].extend(y_test)
        strategies[key]['pred'].extend(y_cons)


print("=" * 70)
print("STRATEGY COMPARISON (Minimizing Underprediction Risk)")
print("=" * 70)
print()

results = []
for name, data in strategies.items():
    y_true_orig = le.inverse_transform(data['true'])
    y_pred_orig = le.inverse_transform(data['pred'])
    score, underpred = evaluate_strategy(y_true_orig, y_pred_orig, name)
    results.append((name, score, underpred))
    print()

# Summary table
print("=" * 70)
print("SUMMARY - Sorted by Underprediction Risk (lowest first)")
print("=" * 70)
print(f"{'Strategy':<20} {'Score':>8} {'Underpred':>10} {'Risk Level':>15}")
print("-" * 53)
for name, score, underpred in sorted(results, key=lambda x: x[2]):
    risk = "ðŸŸ¢ LOW" if underpred <= 5 else ("ðŸŸ¡ MEDIUM" if underpred <= 15 else "ðŸ”´ HIGH")
    print(f"{name:<20} {score:>8.4f} {underpred:>10d} {risk:>15}")

# Find best strategy with minimal underpredictions
print()
print("=" * 70)
print("RECOMMENDATION")
print("=" * 70)

# Get strategy with lowest underprediction count that still has decent score
safe_strategies = [(n, s, u) for n, s, u in results if u <= 10]
if safe_strategies:
    best = max(safe_strategies, key=lambda x: x[1])
    print(f"Best safe strategy: {best[0]}")
    print(f"  Competition Score: {best[1]:.4f} per sample")
    print(f"  Underpredictions: {best[2]} (goal: as close to 0 as possible)")
    print()
    print("For a SINGLE test case, this minimizes the chance of getting 0 points.")
else:
    print("No strategy achieved sufficiently low underprediction risk.")
    print("Consider using 'always_plus_2' for maximum safety.")

# -----------------------------------------------------------------------------
# 6. FINAL MODEL TRAINING (for production use)
# -----------------------------------------------------------------------------
print()
print("=" * 70)
print("FINAL TRAINED MODEL")
print("=" * 70)

# Train on ALL data for final model
final_clf = RandomForestClassifier(
    n_estimators=500, max_depth=10, min_samples_leaf=2,
    class_weight='balanced', random_state=42, n_jobs=-1
)
final_clf.fit(X_top, y)

print("Model trained on full dataset.")
print(f"Classes: {list(threshold_classes)}")
print()
print("Usage for prediction:")
print("  y_pred = conservative_predict(final_clf, X_new, threshold_classes, ")
print("                                confidence_threshold=0.5, bump_steps=1)")
print()
print("Or for maximum safety (minimize underprediction to near 0):")
print("  y_pred_safe = np.minimum(final_clf.predict(X_new) + 1, 5)  # Bump everything up")


FINAL MODEL: RandomForest + Top 10 Features + Conservative Prediction
Threshold classes: [np.int64(1), np.int64(2), np.int64(4), np.int64(8), np.int64(16), np.int64(64)]

Top 10 features (by CatBoost importance):
   1. n_h
   2. avg_gate_span
   3. cx_ratio
   4. n_cx
   5. n_u2
   6. crude_depth
   7. std_gate_span
   8. ratio_1q_gates
   9. 1q_gates_per_qubit
  10. depth_squared

Evaluation: 2-fold StratifiedGroupKFold (grouped by file)

STRATEGY COMPARISON (Minimizing Underprediction Risk)

raw
  Score: 99.0/137 = 0.7226 per sample
  Exact:  94 ( 68.6%) â†’ 94 pts
  Under:  28 ( 20.4%) â†’ 0 pts [RISK!]
  Over:   15 ( 10.9%) â†’ 5.0 pts

conservative_60
  Score: 96.0/137 = 0.7007 per sample
  Exact:  78 ( 56.9%) â†’ 78 pts
  Under:  14 ( 10.2%) â†’ 0 pts [RISK!]
  Over:   45 ( 32.8%) â†’ 18.0 pts

conservative_50
  Score: 94.0/137 = 0.6861 per sample
  Exact:  82 ( 59.9%) â†’ 82 pts
  Under:  24 ( 17.5%) â†’ 0 pts [RISK!]
  Over:   31 ( 22.6%) â†’ 12.0 pts

conservative_40
  Score: 