In [None]:
import numpy as np
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score
import joblib
import json

print("=" * 80)
print("üéØ ENSEMBLE & SUBMISSION GENERATION")
print("=" * 80)

# Load data.
# Train and validation embeddings come from the same file and are separated
# below by protein id, so the embedding matrix is read from disk only once.
X_all_emb = np.load("../data/gold/X_train_esm2.npy")
y_train = np.load("../data/gold/y_train_labels.npy")
y_val = np.load("../data/gold/y_val_labels.npy")
X_test = np.load("../data/gold/X_test_esm2.npy")
y_pred_nn = np.load("../data/gold/y_pred_nn.npy")

train_ids_labels = np.load("../data/gold/train_protein_ids.npy", allow_pickle=True)
val_ids_labels = np.load("../data/gold/val_protein_ids.npy", allow_pickle=True)
all_ids_emb = np.load("../data/gold/train_ids.npy", allow_pickle=True)
test_ids = np.load("../data/gold/test_protein_ids.npy", allow_pickle=True)

# Mapping: protein id -> row of the embedding matrix.
# A dict gives O(1) lookups instead of scanning all_ids_emb once per protein.
# setdefault keeps the FIRST row of a duplicated id, which is what
# np.where(all_ids_emb == pid)[0][0] returned.
# NOTE(review): assumes the id arrays are 1-D and hold hashable ids (strings) - confirm.
emb_row_by_id = {}
for emb_row, emb_pid in enumerate(all_ids_emb.tolist()):
    emb_row_by_id.setdefault(emb_pid, emb_row)

# An id missing from the embedding file raises KeyError naming that id.
train_idx_emb = [emb_row_by_id[pid] for pid in train_ids_labels.tolist()]
val_idx_emb = [emb_row_by_id[pid] for pid in val_ids_labels.tolist()]
X_train = X_all_emb[train_idx_emb]
X_val = X_all_emb[val_idx_emb]

print(f"\nüìä Dataset sizes:")
print(f"  Train: X={X_train.shape}, y={y_train.shape}")
print(f"  Val: X={X_val.shape}, y={y_val.shape}")
print(f"  Test: {X_test.shape}")

# ========================================
# PART 1: XGBoost training
# ========================================
print("\n" + "=" * 80)
print("üå≤ Trenowanie XGBoost...")
print("=" * 80)

# Base estimator; MultiOutputClassifier fits one copy of it per label column.
base_xgb = xgb.XGBClassifier(
    tree_method='hist',   # CHANGED: 'hist' instead of 'gpu_hist'
    device='cuda',        # the GPU is selected here
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)
clf = MultiOutputClassifier(base_xgb)
clf.fit(X_train, y_train)
print("‚úÖ XGBoost wytrenowany")

# Prediction on validation: predict_proba yields one array per label;
# column 1 of each is stacked into a (samples, labels) matrix.
print("\nüß† Predykcja XGBoost na validation...")
val_proba_per_label = clf.predict_proba(X_val)
y_val_pred_xgb = np.column_stack([proba[:, 1] for proba in val_proba_per_label])

# Prediction on test, same layout as above.
print("üß† Predykcja XGBoost na test...")
test_proba_per_label = clf.predict_proba(X_test)
y_test_pred_xgb = np.column_stack([proba[:, 1] for proba in test_proba_per_label])

print(f"‚úÖ XGBoost predictions: val={y_val_pred_xgb.shape}, test={y_test_pred_xgb.shape}")

# ========================================
# PART 2: NN predictions on validation
# ========================================
# Prints the section banner for the neural-network prediction step.
print("\n" + "=" * 80)
print("üß† Wczytywanie NN predictions dla validation...")
print("=" * 80)

import torch
import torch.nn as nn

class ProteinClassifier(nn.Module):
    """Feed-forward multi-label classifier over fixed-size embeddings.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout, followed
    by a final Linear layer that emits one raw logit per label (no sigmoid
    is applied inside the module).

    The defaults reproduce the original hard-coded architecture
    (320 -> 512 -> 1024 -> 1500, dropout 0.3), so ``ProteinClassifier()``
    builds the same layers with the same ``state_dict`` keys as before.

    Args:
        input_dim: Size of each input embedding vector.
        hidden_dims: Widths of the hidden layers, in order.
        num_labels: Number of output logits.
        dropout: Dropout probability used after every hidden layer.
    """

    def __init__(self, input_dim=320, hidden_dims=(512, 1024), num_labels=1500, dropout=0.3):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            # Modules are appended in the original order so that the
            # nn.Sequential indices (net.0, net.1, ...) stay unchanged.
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, num_labels))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw logits produced by the network for ``x``."""
        return self.net(x)

# Run the trained network on the validation embeddings.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ProteinClassifier().to(device)

# map_location remaps the checkpoint's tensors onto the selected device, so a
# checkpoint saved from CUDA tensors also loads when the CPU fallback is used.
state_dict = torch.load("../models/protein_nn.pth", map_location=device)
model.load_state_dict(state_dict)
model.eval()  # inference mode for the BatchNorm and Dropout layers

with torch.no_grad():
    val_inputs = torch.FloatTensor(X_val).to(device)
    # The model emits logits; sigmoid turns them into per-label scores.
    y_val_pred_nn = torch.sigmoid(model(val_inputs)).cpu().numpy()

y_test_pred_nn = y_pred_nn  # Already computed

print(f"‚úÖ NN predictions: val={y_val_pred_nn.shape}, test={y_test_pred_nn.shape}")

# ========================================
# PART 3: Ensemble weight optimisation
# ========================================
print("\n" + "=" * 80)
print("‚öñÔ∏è Optymalizacja wag ensemble na validation...")
print("=" * 80)

# Candidate (NN weight, XGBoost weight) pairs to try.
weight_combinations = [
    (0.5, 0.5), (0.6, 0.4), (0.7, 0.3), (0.8, 0.2), (0.9, 0.1),
    (0.4, 0.6), (0.3, 0.7),
]

# Candidate decision thresholds to try.
thresholds = [0.01, 0.03, 0.05, 0.08, 0.1, 0.15, 0.2]

# Fallback configuration, kept if no candidate scores above zero.
best_f1, best_weights, best_threshold = 0, (0.7, 0.3), 0.05

results = []

for nn_weight, xgb_weight in weight_combinations:
    # Weighted blend of the two models' validation scores.
    y_val_ensemble = nn_weight * y_val_pred_nn + xgb_weight * y_val_pred_xgb

    for cutoff in thresholds:
        hard_labels = (y_val_ensemble > cutoff).astype(int)
        score = f1_score(y_val, hard_labels, average='samples', zero_division=0)

        results.append(
            dict(w_nn=nn_weight, w_xgb=xgb_weight, threshold=cutoff, f1=score)
        )

        # Strict comparison: on ties the earliest candidate is kept.
        if score > best_f1:
            best_f1, best_weights, best_threshold = score, (nn_weight, xgb_weight), cutoff

# Print top 10
print("\nüèÜ Top 10 konfiguracji:")
sorted_results = sorted(results, key=lambda entry: entry['f1'], reverse=True)
for rank, entry in enumerate(sorted_results[:10], start=1):
    print(f"  {rank:2d}. NN={entry['w_nn']:.1f}, XGB={entry['w_xgb']:.1f}, "
          f"T={entry['threshold']:.2f} ‚Üí F1={entry['f1']:.4f}")

print(f"\n‚úÖ BEST CONFIG:")
print(f"  Weights: NN={best_weights[0]:.1f}, XGB={best_weights[1]:.1f}")
print(f"  Threshold: {best_threshold}")
print(f"  Val F1: {best_f1:.4f}")

# Save config
config = {}
config['w_nn'], config['w_xgb'] = best_weights[0], best_weights[1]
config['threshold'] = best_threshold
config['val_f1'] = best_f1

with open('../models/ensemble_config.json', 'w') as config_file:
    json.dump(config, config_file, indent=2)

# ========================================
# PART 4: Final ensemble on the test set
# ========================================
print("\n" + "=" * 80)
print("üéØ Generowanie finalnego ensemble dla test set...")
print("=" * 80)

# Blend the two models' test scores with the weights chosen on validation.
y_test_final = (best_weights[0] * y_test_pred_nn) + (best_weights[1] * y_test_pred_xgb)

print(f"‚úÖ Final predictions: {y_test_final.shape}")

# ========================================
# PART 5: Writing the submission
# ========================================
print("\n" + "=" * 80)
print("üíæ Zapisywanie submission...")
print("=" * 80)

# Check the statistics before writing
print(f"\nüìä Statystyki predykcji:")
print(f"  Min score: {y_test_final.min():.4f}")
print(f"  Max score: {y_test_final.max():.4f}")
print(f"  Mean score: {y_test_final.mean():.4f}")
print(f"  Scores > {best_threshold}: {(y_test_final > best_threshold).sum():,}")

# Write the submission.
# Lines are streamed to the file one protein at a time instead of being
# collected in a single list first, so memory use stays bounded by one
# protein's predictions rather than the whole submission.
# NOTE(review): top_terms is not defined in this cell; presumably it comes
# from an earlier cell and maps column index -> term id - confirm.
total_predictions = 0

with open("submission_final.tsv", "w") as f:
    for i, prot_id in enumerate(test_ids):
        row = y_test_final[i]
        indices = np.where(row > best_threshold)[0]

        # Sort by score (descending)
        sorted_indices = indices[np.argsort(-row[indices])]

        # Limit to top 1500 per protein (competition requirement)
        sorted_indices = sorted_indices[:1500]

        f.writelines(
            f"{prot_id}\t{top_terms[idx]}\t{row[idx]:.3f}\n"
            for idx in sorted_indices
        )
        total_predictions += len(sorted_indices)

# Guard the average against an empty test set (avoids ZeroDivisionError).
avg_predictions = total_predictions / len(test_ids) if len(test_ids) else 0.0

print(f"\n‚úÖ Submission zapisany: submission_final.tsv")
print(f"  Total lines: {total_predictions:,}")
print(f"  Avg predictions per protein: {avg_predictions:.1f}")

# ========================================
# PART 6: Validation summary
# ========================================
summary_rule = "=" * 80

print("\n" + summary_rule)
print("üìà PODSUMOWANIE")
print(summary_rule)

# Best configuration found by the validation search.
best_config_lines = [
    f"\nüéØ Best Configuration:",
    f"  NN Weight: {best_weights[0]}",
    f"  XGBoost Weight: {best_weights[1]}",
    f"  Threshold: {best_threshold}",
    f"  Validation F1: {best_f1:.4f}",
]
for summary_line in best_config_lines:
    print(summary_line)

# Files produced by this cell.
generated_file_lines = [
    f"\nüìÅ Pliki wygenerowane:",
    f"  submission_final.tsv - Gotowy do uploadu na Kaggle!",
    f"  ../models/ensemble_config.json - Najlepsza konfiguracja",
]
for summary_line in generated_file_lines:
    print(summary_line)

print("\n" + summary_rule)
print("‚úÖ WSZYSTKO GOTOWE! MO≈ªESZ SUBMITOWAƒÜ!")
print(summary_rule)

🎯 ENSEMBLE & SUBMISSION GENERATION

📊 Dataset sizes:
  Train: X=(70043, 320), y=(70043, 1500)
  Val: X=(12361, 320), y=(12361, 1500)
  Test: (224309, 320)

🌲 Trenowanie XGBoost...


XGBoostError: Invalid Input: 'gpu_hist', valid values are: {'approx', 'auto', 'exact', 'hist'}