# In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error

# 1. Read the raw catechol yield table from the Kaggle input directory
file_path = '/kaggle/input/catechol-benchmark-hackathon/catechol_full_data_yields.csv'
df = pd.read_csv(file_path)

# 2. Column roles
# Model inputs: the two solvent-name columns, then the process settings,
# then the two solvent-ratio columns.
feature_cols = ['SOLVENT A NAME', 'SOLVENT B NAME']
feature_cols += ['SolventB%', 'Residence Time', 'Temperature']
feature_cols += ['SOLVENT A Ratio', 'SOLVENT B Ratio']

# Columns the model is trained to predict.
target_cols = ['SM', 'Product 2', 'Product 3']

# 3. Data Processing and Cleaning (Resolving ValueError)
def _clean_ratio(series):
    """Return *series* as numbers, with missing or unparseable entries as 0.

    Entries that pandas can parse directly are kept verbatim, so a float whose
    text form uses scientific notation (e.g. ``1e-05``) or a sign is not
    mangled. Only entries that fail to parse are stripped of every character
    other than digits and ``.`` and parsed a second time.
    """
    numeric = pd.to_numeric(series, errors='coerce')
    # Present in the input but not parseable as-is (e.g. "50%").
    unparsed = numeric.isna() & series.notna()
    if unparsed.any():
        stripped = series[unparsed].astype(str).str.replace(r'[^0-9.]', '', regex=True)
        numeric.loc[unparsed] = pd.to_numeric(stripped, errors='coerce')
    return numeric.fillna(0)


def final_preprocess(data, features, targets):
    """Split *data* into a numeric feature frame and a target frame.

    Args:
        data: DataFrame holding every column named in *features* and *targets*.
        features: Feature column names. Must include 'SOLVENT A NAME',
            'SOLVENT B NAME', 'SOLVENT A Ratio', 'SOLVENT B Ratio',
            'Temperature', 'Residence Time' and 'SolventB%', which are
            referenced by name below.
        targets: Target column names.

    Returns:
        A tuple ``(X, y)``. ``X`` is a copy of the feature columns with the
        solvent names replaced by integer codes, the ratio columns coerced to
        numbers, and two extra columns ('Reaction_Energy', 'B_Conc_Temp')
        appended. ``y`` is a copy of the target columns. *data* itself is
        not modified.

    Raises:
        KeyError: If a required column is absent.
    """
    X = data[features].copy()
    y = data[targets].copy()

    # Integer-encode each solvent-name column independently: codes follow the
    # sorted order of that column's distinct names (missing names become the
    # string produced by ``astype(str)`` and get a code like any other name).
    for col in ['SOLVENT A NAME', 'SOLVENT B NAME']:
        codes, _ = pd.factorize(X[col].astype(str), sort=True)
        X[col] = codes

    # Coerce the ratio columns to numbers; anything unusable becomes 0.
    for col in ['SOLVENT A Ratio', 'SOLVENT B Ratio']:
        X[col] = _clean_ratio(X[col])

    # Interaction features: products of existing numeric columns.
    X['Reaction_Energy'] = X['Temperature'] * X['Residence Time']
    X['B_Conc_Temp'] = X['SolventB%'] * X['Temperature']

    return X, y

# Build the model-ready feature frame and the target frame from the raw table
X_final, y_final = final_preprocess(data=df, features=feature_cols, targets=target_cols)

# 4. Model setup: a single XGBoost regressor configuration, wrapped so that
#    it is fitted separately for each target column.
# NOTE(review): `enable_categorical=True` is passed, but final_preprocess
# turns the solvent names into integer codes, so this flag presumably has no
# effect unless those columns are given a pandas `category` dtype — confirm.
base_regressor = XGBRegressor(
    tree_method='hist',
    enable_categorical=True,
    n_estimators=1500,
    learning_rate=0.015,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.9,
    random_state=42,
)
model = MultiOutputRegressor(base_regressor)

# 5. Training and K-Fold Validation
# Five shuffled folds with a fixed seed; each fold refits `model` on the
# training split and scores it on the held-out split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
submission_rows = []  # one dict per held-out row, accumulated over all folds
cv_scores = []        # mean absolute error of each fold

print("🚀 Training with corrected numerical features...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_final)):
    X_train, X_val = X_final.iloc[train_idx], X_final.iloc[val_idx]
    y_train, y_val = y_final.iloc[train_idx], y_final.iloc[val_idx]

    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    # Post-processing: force every prediction into [0, 1] ...
    preds = np.clip(preds, 0, 1)

    # ... then rescale each row so its three values sum to 1. Rows that are
    # all zero after clipping are left as zeros (divisor set to 1).
    # NOTE(review): this assumes the three targets are fractions that sum to
    # 1 for every sample — confirm against the data.
    row_sums = preds.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1
    preds = preds / row_sums

    # The score is computed on the post-processed predictions.
    score = mean_absolute_error(y_val, preds)
    cv_scores.append(score)
    print(f"Fold {fold} MAE: {score:.6f}")

    # `row` is the positional index of the sample in X_final.
    submission_rows.extend(
        {
            'fold': fold,
            'row': idx,
            'target_1': p1,
            'target_2': p2,
            'target_3': p3,
        }
        for idx, (p1, p2, p3) in zip(val_idx, preds)
    )

print(f"✨ Success! Final Corrected MAE: {np.mean(cv_scores):.6f}")
# 6. Formatting for Submission
# Per-fold prediction table, ordered by fold and then by row position.
sub_df = pd.DataFrame(submission_rows).sort_values(['fold', 'row'])

# Submission table: one line per sample, ordered by row position, with the
# row position used as 'id' and the three predictions renamed to the target
# column names.
# NOTE(review): the exact column layout the competition expects is not
# visible in this file — confirm 'id' / 'SM' / 'Product 2' / 'Product 3'.
by_row = sub_df.sort_values('row')
final_submission = pd.DataFrame({
    'id': by_row['row'],
    'SM': by_row['target_1'],
    'Product 2': by_row['target_2'],
    'Product 3': by_row['target_3'],
})

# Write the file exactly once. Previously a second `to_csv` to the same path
# replaced this table with the raw fold/row/target_* table, so the message
# below did not describe what was actually on disk.
final_submission.to_csv('submission.csv', index=False)
print("✅ Submission file saved successfully with 'id' column!")