In [None]:
# 📦 Imports
!pip install optuna catboost
!pip install category_encoders
import pandas as pd
import numpy as np
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Read the train and test CSV files from the current working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:

# 📊 Columns
# Columns coerced to numeric, imputed, expanded into interaction terms and
# scaled by the cells below. The order is significant: the positional
# "poly_<i>" feature names generated later depend on it.
num_cols = [
    'temperature',
    'irradiance',
    'humidity',
    'panel_age',
    'maintenance_count',
    'soiling_ratio',
    'voltage',
    'current',
    'module_temperature',
    'cloud_coverage',
    'wind_speed',
    'pressure',
]

# Columns treated as categorical (cast to strings, then target-encoded below).
cat_cols = [
    'string_id',
    'error_code',
    'installation_type',
]


In [None]:
# Coerce every numeric column to a numeric dtype; values that cannot be parsed
# become NaN so they can be imputed below.
for col in num_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

# Impute missing numeric values with statistics learned from the TRAINING set
# only, and reuse those same values for the test set. Every other transformer
# in this notebook is fitted on train and applied to test; imputing the test
# set with its own means would make the two frames go through different
# transforms.
train_num_means = train_df[num_cols].mean()
train_df[num_cols] = train_df[num_cols].fillna(train_num_means)
test_df[num_cols] = test_df[num_cols].fillna(train_num_means)

# Cast categoricals to strings; missing values stringify to "nan" and are
# relabelled as an explicit "missing" category.
for col in cat_cols:
    train_df[col] = train_df[col].astype(str).replace("nan", "missing")
    test_df[col] = test_df[col].astype(str).replace("nan", "missing")


In [None]:
# 🔍 Outlier Removal
# Keep only the training rows whose target lies strictly between its 1st and
# 99th percentiles. Note that train_df is reassigned from itself, so running
# this cell again trims the frame a second time.
efficiency = train_df['efficiency']
q_low = efficiency.quantile(0.01)
q_high = efficiency.quantile(0.99)
inside_band = efficiency.gt(q_low) & efficiency.lt(q_high)
train_df = train_df[inside_band]

In [None]:
def create_features(df):
    """Return a copy of ``df`` with ten hand-crafted product/difference features appended.

    The input frame is not modified. New columns are appended in this order:
    power_output, degradation_rate, temp_diff, irradiance_adjusted,
    temp_irradiance_interaction, age_soiling_temp, wind_cooling_effect,
    humidity_pressure, humidity_x_irradiance, wind_x_cloud.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns voltage, current, panel_age, soiling_ratio,
        module_temperature, temperature, irradiance, cloud_coverage,
        wind_speed, humidity and pressure.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the engineered ones.
    """
    # Sub-expressions shared by more than one engineered column.
    module_minus_ambient = df['module_temperature'] - df['temperature']
    age_times_soiling = df['panel_age'] * df['soiling_ratio']

    # assign() works on a copy and appends keyword columns in the order given.
    return df.assign(
        power_output=df['voltage'] * df['current'],
        degradation_rate=age_times_soiling,
        temp_diff=module_minus_ambient,
        # cloud_coverage is divided by 100 before being used as a fraction
        irradiance_adjusted=df['irradiance'] * (1 - df['cloud_coverage'] / 100),
        temp_irradiance_interaction=df['module_temperature'] * df['irradiance'],
        age_soiling_temp=age_times_soiling * df['module_temperature'],
        wind_cooling_effect=df['wind_speed'] * module_minus_ambient,
        humidity_pressure=df['humidity'] * df['pressure'],
        humidity_x_irradiance=df['humidity'] * df['irradiance'],
        wind_x_cloud=df['wind_speed'] * df['cloud_coverage'],
    )

# Run the same feature engineering over both splits (train first, then test).
train_df, test_df = map(create_features, (train_df, test_df))

In [None]:
# Degree-2 expansion of the numeric columns, fitted on train and reused for test.
# Generated columns are named positionally ("poly_0", "poly_1", ...).
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_feats = poly.fit_transform(train_df[num_cols])
poly_feats_test = poly.transform(test_df[num_cols])

n_poly_features = poly_feats.shape[1]
poly_cols = [f"poly_{i}" for i in range(n_poly_features)]
poly_df, poly_df_test = (
    pd.DataFrame(matrix, columns=poly_cols)
    for matrix in (poly_feats, poly_feats_test)
)

# The new frames carry a fresh 0..n-1 index, so the existing frames are
# re-indexed the same way before being glued together column-wise
# (train_df was filtered earlier and no longer has a contiguous index).
train_df = pd.concat([train_df.reset_index(drop=True), poly_df], axis=1)
test_df = pd.concat([test_df.reset_index(drop=True), poly_df_test], axis=1)

In [None]:
# 🔠 Target Encoding
# Replace each categorical column with a numeric encoding derived from the
# raw (untransformed) 'efficiency' target. The encoder is fitted on the
# training frame and then applied unchanged to the test frame.
# NOTE(review): the encoder is fitted on the full training set, before the
# K-fold split performed inside objective_cat, so every validation fold's
# encodings were computed with that fold's own targets included. Confirm the
# resulting optimism in the CV score is acceptable.
target_enc = TargetEncoder(cols=cat_cols)
train_df[cat_cols] = target_enc.fit_transform(train_df[cat_cols], train_df['efficiency'])
test_df[cat_cols] = target_enc.transform(test_df[cat_cols])


In [None]:
# 📊 Scaling
# Standardise the raw numeric columns together with the generated poly_*
# columns. Statistics are learned from the training frame only and then
# applied to both frames.
scaled_cols = num_cols + poly_cols
scaler = StandardScaler().fit(train_df[scaled_cols])
train_df[scaled_cols] = scaler.transform(train_df[scaled_cols])
test_df[scaled_cols] = scaler.transform(test_df[scaled_cols])


In [None]:
# 🧪 Prepare Data
# Features: everything except the row identifier and the target.
X = train_df.drop(columns=['id', 'efficiency'])
# Target: modelled in log1p space; predictions must be mapped back with expm1.
y = train_df['efficiency'].pipe(np.log1p)
# The test frame has no target column, so only the identifier is removed.
X_test = test_df.drop(columns=['id'])

In [None]:
# 🎯 Custom Score
def custom_score(y_true, y_pred):
    """Score log1p-space predictions as ``100 * (1 - RMSE)`` on the original scale.

    Both arguments are passed through ``expm1`` before the error is computed,
    so callers must supply values in log1p space. Higher is better.
    """
    actual = np.expm1(y_true)
    predicted = np.expm1(y_pred)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    return 100 * (1 - rmse)

In [None]:
def objective_cat(trial):
    """Optuna objective: negative mean 5-fold ``custom_score`` of a CatBoost regressor.

    Hyper-parameters are drawn from ``trial``; the module-level ``X`` and ``y``
    (log1p target) are split with a fixed-seed shuffled KFold. The value is
    negated so that a study minimising it maximises the score.

    NOTE: each fold's validation split is used both as the early-stopping
    eval_set and for scoring, so reported scores are somewhat optimistic.
    """
    # The order of the suggest_* calls is kept fixed on purpose: it determines
    # which sampler draw is assigned to which parameter.
    bootstrap_type = trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"])

    params = dict(
        iterations=1000,
        depth=trial.suggest_int("depth", 4, 10),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1, 10),
        bootstrap_type=bootstrap_type,
        # NOTE(review): CatBoost documents min_data_in_leaf for the Lossguide /
        # Depthwise grow policies; no grow_policy is set here — confirm this
        # parameter actually influences training.
        min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 1, 20),
        random_strength=trial.suggest_float("random_strength", 0.1, 10),
        early_stopping_rounds=50,
        verbose=0,
        loss_function="RMSE",
        random_seed=42,
    )

    # Bootstrap-specific knob: the Bayesian bootstrap is tuned through
    # bagging_temperature, the other two choices through subsample.
    if bootstrap_type == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 1)
    else:  # "Bernoulli" or "MVS", the only remaining categorical choices
        params["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fit_rows, holdout_rows in splitter.split(X):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        fold_model = CatBoostRegressor(**params)
        fold_model.fit(X_fit, y_fit, eval_set=(X_holdout, y_holdout), use_best_model=True)

        holdout_preds = fold_model.predict(X_holdout)
        fold_scores.append(custom_score(y_holdout, holdout_preds))

    # Negate: the study minimises, while a higher custom_score is better.
    return -np.mean(fold_scores)

In [None]:
# Hyper-parameter search over objective_cat (30 trials, minimising the
# negated CV score). The sampler is seeded explicitly: without a seed the
# default TPE sampler draws different trials on every run, so the selected
# parameters would not be reproducible after a kernel restart.
study_cat = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_cat.optimize(objective_cat, n_trials=30)

# Only the parameters suggested inside objective_cat are stored here; the
# fixed settings (iterations, loss, seed, ...) must be re-supplied when the
# final model is built.
cat_best_params = study_cat.best_params

[I 2025-06-08 16:35:57,932] A new study created in memory with name: no-name-64efe2d8-c23b-4a61-bd7e-9e8c8ad71052
[I 2025-06-08 16:51:29,088] Trial 0 finished with value: -95.57473672294398 and parameters: {'bootstrap_type': 'Bayesian', 'depth': 9, 'learning_rate': 0.016578018799412876, 'l2_leaf_reg': 6.341991612240737, 'min_data_in_leaf': 5, 'random_strength': 8.02345632920249, 'bagging_temperature': 0.36244005965568216}. Best is trial 0 with value: -95.57473672294398.
[I 2025-06-08 16:52:41,659] Trial 1 finished with value: -95.58133122264924 and parameters: {'bootstrap_type': 'Bayesian', 'depth': 4, 'learning_rate': 0.029629214515242693, 'l2_leaf_reg': 2.478235111375282, 'min_data_in_leaf': 9, 'random_strength': 4.387682534625672, 'bagging_temperature': 0.810786370441289}. Best is trial 1 with value: -95.58133122264924.
[I 2025-06-08 16:54:28,809] Trial 2 finished with value: -95.6006446032985 and parameters: {'bootstrap_type': 'MVS', 'depth': 5, 'learning_rate': 0.01529143974981189

In [None]:
# Refit on the full training set with the tuned hyper-parameters plus the
# fixed settings that were used during the search.
# NOTE(review): no eval_set is passed to fit(), so early_stopping_rounds
# presumably has nothing to monitor and all 1000 iterations are trained —
# confirm this is intended.
final_model = CatBoostRegressor(
    iterations=1000,
    early_stopping_rounds=50,
    loss_function='RMSE',
    random_seed=42,
    verbose=0,
    **cat_best_params,
)
final_model.fit(X, y)


<catboost.core.CatBoostRegressor at 0x7e9fe3a03010>

In [None]:
# 🧪 Prediction
# The model was fitted on log1p(efficiency); expm1 maps its output back to
# the original target scale.
final_preds = np.expm1(final_model.predict(X_test))

In [None]:
# Pair each test id with its prediction and write the submission file.
submission = test_df[['id']].assign(efficiency=final_preds)
submission.to_csv("catboost_submission.csv", index=False)

print("✅ Submission file 'catboost_submission.csv' created.")

✅ Submission file 'catboost_submission.csv' created.


In [None]:
# Hand the submission file to Colab's download helper. This cell requires the
# google.colab package, so it is expected to work only when the notebook runs
# inside Google Colab.
from google.colab import files
files.download("catboost_submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Predict on the prepared test features. The notebook never defines
# X_test_processed; the preprocessed test matrix is X_test. The model was
# trained on log1p(efficiency), so the output is mapped back with expm1 to
# keep final_preds on the original target scale.
final_preds = np.expm1(final_model.predict(X_test))



In [None]:
# Predict on the preprocessed test matrix. The notebook builds it as X_test
# (X_test_processed is never defined). The model was trained on
# log1p(efficiency), so predictions are mapped back with expm1 before being
# written out; otherwise the submission would be in log space.
ensemble_test_pred = np.expm1(final_model.predict(X_test))

# Then create submission
# ids are re-read from disk; rows are matched to predictions by position.
test = pd.read_csv("test.csv")
submission_df = pd.DataFrame({
    "id": test["id"],
    "efficiency": ensemble_test_pred
})
submission_filename = "ensemble_submission.csv"
submission_df.to_csv(submission_filename, index=False)

# Requires the google.colab package (Colab runtime).
from google.colab import files
files.download(submission_filename)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# NOTE(review): this cell repeats the preceding submission cell verbatim and
# rewrites the same file — consider deleting one of the two.
# Predict on the preprocessed test matrix. The notebook builds it as X_test
# (X_test_processed is never defined). The model was trained on
# log1p(efficiency), so predictions are mapped back with expm1 before being
# written out; otherwise the submission would be in log space.
ensemble_test_pred = np.expm1(final_model.predict(X_test))

# Then create submission
# ids are re-read from disk; rows are matched to predictions by position.
test = pd.read_csv("test.csv")
submission_df = pd.DataFrame({
    "id": test["id"],
    "efficiency": ensemble_test_pred
})
submission_filename = "ensemble_submission.csv"
submission_df.to_csv(submission_filename, index=False)

# Requires the google.colab package (Colab runtime).
from google.colab import files
files.download(submission_filename)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>