In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the competition inputs from disk (point the paths at your local copies).
X_train, X_test = (
    pd.read_parquet(path)
    for path in ('X_train.parquet', 'X_test_reduced.parquet')
)
# Labels as a plain array; assumes the target column is named 'break'.
y_train = pd.read_parquet('y_train.parquet')['break'].values

# Function to extract features from time series
def extract_features(df, boundary_point=None):
    """Compute per-series summary features before and after a split point.

    Each distinct value of the ``id`` index level is treated as one series.
    The series is cut into a "before" and an "after" segment and summary
    statistics of the ``value`` column are computed for both segments.

    Args:
        df: DataFrame whose index is a MultiIndex containing an ``id`` level
            and which has ``value`` and ``period`` columns.
        boundary_point: Optional index label at which every series is split
            (labels strictly below it form the "before" segment). When
            ``None`` the split is the smallest index label of the rows whose
            ``period`` equals 1.

    Returns:
        DataFrame with one row per id, in order of first appearance, indexed
        by ``id``, with the columns ``mean_before``, ``var_before``,
        ``skew_before``, ``mean_after``, ``var_after``, ``skew_after``,
        ``mean_diff``, ``var_diff`` and ``length_diff``. Statistics that are
        undefined for a segment (e.g. too few points) are NaN.
    """
    columns = [
        'mean_before', 'var_before', 'skew_before',
        'mean_after', 'var_after', 'skew_after',
        'mean_diff', 'var_diff', 'length_diff',
    ]
    ids = []
    rows = []

    # A single groupby pass replaces one cross-section lookup per id;
    # sort=False keeps the ids in their order of first appearance.
    for id_val, group in df.groupby(level='id', sort=False):
        series = group.droplevel('id')
        values = series['value']

        if boundary_point is not None:
            boundary = boundary_point
        else:
            in_second_period = (series['period'] == 1).to_numpy()
            # min() of an empty index is NaN/NaT, which triggers the fallback.
            boundary = series.index[in_second_period].min()

        if pd.isna(boundary):
            # No boundary available: split by position so the halves are
            # correct even when the index labels do not start at zero.
            half = len(series) // 2
            before, after = values.iloc[:half], values.iloc[half:]
        else:
            before = values[series.index < boundary]
            after = values[series.index >= boundary]

        mean_before, mean_after = before.mean(), after.mean()
        var_before, var_after = before.var(), after.var()

        ids.append(id_val)
        rows.append({
            'mean_before': mean_before,
            'var_before': var_before,
            'skew_before': before.skew(),
            'mean_after': mean_after,
            'var_after': var_after,
            'skew_after': after.skew(),
            'mean_diff': mean_after - mean_before,
            'var_diff': var_after - var_before,
            'length_diff': len(after) - len(before),
        })

    # Passing columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, index=pd.Index(ids, name='id'), columns=columns)

# Build one feature row per series for both sets; with boundary_point=None
# the split comes from the 'period' column.
X_train_features, X_test_features = (
    extract_features(frame, boundary_point=None)
    for frame in (X_train, X_test)
)

# Standardise with statistics learned from the training features only
scaler = StandardScaler().fit(X_train_features)
X_train_scaled = scaler.transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features)

# Fit the gradient-boosting classifier on the scaled training features
model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
).fit(X_train_scaled, y_train)

# In-sample ROC AUC: scored on the same rows the model was fitted on
train_probs = model.predict_proba(X_train_scaled)[:, 1]
train_roc_auc = roc_auc_score(y_train, train_probs)
print(f'Training ROC AUC: {train_roc_auc:.4f}')

# Second-column class probabilities for the test series
test_probs = model.predict_proba(X_test_scaled)[:, 1]

# Function to prepare submission (as per competition requirements)
def infer(X_test):
    """Return the model's second-column class probabilities for ``X_test``.

    Features are extracted with ``extract_features``, scaled with the
    module-level ``scaler`` and scored with the module-level ``model``.
    """
    features = extract_features(X_test, boundary_point=None)
    scaled = scaler.transform(features)
    probabilities = model.predict_proba(scaled)
    return probabilities[:, 1]

# Pair each test id (in order of first appearance) with its predicted
# probability and write the result to disk for submission.
submission = pd.DataFrame(
    {'id': X_test.index.get_level_values('id').unique()}
).assign(prediction=test_probs)
submission.to_csv('submission.csv', index=False)

print('Submission file created as submission.csv')

# Optional hold-out check: refit the same configuration on 80% of the scaled
# training rows and score the remaining 20%.
(
    X_train_split,
    X_val_split,
    y_train_split,
    y_val_split,
) = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

model_val = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
).fit(X_train_split, y_train_split)

val_probs = model_val.predict_proba(X_val_split)[:, 1]
val_roc_auc = roc_auc_score(y_val_split, val_probs)
print(f'Validation ROC AUC: {val_roc_auc:.4f}')
   