In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.feature_selection import mutual_info_regression, RFE
from sklearn.inspection import permutation_importance
from packaging import version
from sklearn import __version__ as sklearn_version
import shap
import optuna
import sklearn

# -------- Step 1: Load Data --------
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

X_train = train_df.drop(columns=['efficiency'])
y_train = train_df['efficiency']
X_test = test_df.copy()

# -------- Step 2: Feature Engineering --------
class SolarFeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        numeric_cols = ['voltage', 'current', 'irradiance', 'temperature', 'wind_speed', 
                        'soiling_ratio', 'module_temperature', 'panel_age', 'maintenance_count',
                        'humidity', 'pressure', 'cloud_coverage']
        for col in numeric_cols:
            X[col] = pd.to_numeric(X[col], errors='coerce')
        for col in numeric_cols:
            if col == 'maintenance_count':
                X[col] = X[col].fillna(0)
            else:
                X[col] = X[col].fillna(X[col].mean())
        X['power_output'] = X['voltage'] * X['current']
        X['irradiance_temp_ratio'] = X['irradiance'] / (X['temperature'] + 1)
        X['cooling_effect'] = X['wind_speed'] / (X['temperature'] + 1)
        X['efficiency_loss_due_to_soiling'] = X['soiling_ratio'] * X['irradiance']
        X['temp_diff'] = X['module_temperature'] - X['temperature']
        X['age_per_maintenance'] = X['panel_age'] / (X['maintenance_count'] + 1)
        X['humidity_pressure_interaction'] = X['humidity'] / (X['pressure'] + 1)
        X['cloud_temp_impact'] = X['cloud_coverage'] * X['module_temperature']
        X['expected_efficiency'] = (X['irradiance'] * (1 - X['soiling_ratio'])) / (X['module_temperature'] + 1)
        X['is_soiled'] = (X['soiling_ratio'] > 0.5).astype(int)
        X['is_cloudy'] = (X['cloud_coverage'] > 70).astype(int)
        X['high_panel_age'] = (X['panel_age'] > 8).astype(int)
        return X

fe = SolarFeatureEngineer()
X_train_fe = fe.fit_transform(X_train)
X_test_fe = fe.transform(X_test)

# -------- Step 3: Preprocessing Pipelines --------
numeric_cols = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_cols = ['string_id', 'error_code', 'installation_type']

if version.parse(sklearn_version) >= version.parse("1.2"):
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
else:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', ohe)
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, numeric_cols),
    ('cat', cat_pipeline, categorical_cols)
])

X_train_processed = full_pipeline.fit_transform(X_train_fe)
X_test_processed = full_pipeline.transform(X_test_fe)

# -------- Step 4: Polynomial Features --------
top_feats = ['irradiance', 'temperature', 'humidity', 'soiling_ratio', 'module_temperature']
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly_train = poly.fit_transform(X_train_fe[top_feats])
X_poly_test = poly.transform(X_test_fe[top_feats])

X_train_combined = np.hstack([X_train_processed, X_poly_train])
X_test_combined = np.hstack([X_test_processed, X_poly_test])

# -------- Step 5: Enhanced Feature Selection --------
shap_model = XGBRegressor(n_estimators=200, learning_rate=0.1, verbosity=0, random_state=42)
shap_model.fit(X_train_combined, y_train)

# SHAP values
explainer = shap.Explainer(shap_model)
X_sample = X_train_combined[:100]
shap_values = explainer(X_sample)
shap_importance = np.abs(shap_values.values).mean(axis=0)

# Permutation Importance
perm = permutation_importance(shap_model, X_train_combined, y_train, scoring='neg_root_mean_squared_error', n_repeats=5, n_jobs=-1)
perm_importance = np.abs(perm.importances_mean)

# Mutual Information
mi_importance = mutual_info_regression(X_train_combined, y_train, random_state=42)

# LGBM Importance
lgb = LGBMRegressor(n_estimators=200, random_state=42)
lgb.fit(X_train_combined, y_train)
lgb_importance = lgb.feature_importances_

# CatBoost Importance
cat = CatBoostRegressor(iterations=200, verbose=0, random_state=42)
cat.fit(X_train_combined, y_train)
cat_importance = cat.get_feature_importance()

# SHAP Interaction
interaction_values = explainer.shap_interaction_values(X_sample)
interaction_importance = np.abs(interaction_values).mean(axis=(0, 1))

# Average all importances
combined_importance = (
    shap_importance + perm_importance + mi_importance + lgb_importance + cat_importance + interaction_importance
) / 6

# Select top features
top_k = 60
top_indices = np.argsort(combined_importance)[-top_k:]

X_train_selected = X_train_combined[:, top_indices]
X_test_selected = X_test_combined[:, top_indices]

# Optional: Refine with RFE
rfe = RFE(Ridge(alpha=1.0), n_features_to_select=40, step=10)
rfe.fit(X_train_selected, y_train)
X_train_selected = X_train_selected[:, rfe.support_]
X_test_selected = X_test_selected[:, rfe.support_]

# -------- Step 6: Optuna Tuning --------
def objective(trial):
    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    model = StackingRegressor(
        estimators=[
            ('xgb', XGBRegressor(n_estimators=500, learning_rate=0.05, verbosity=0, random_state=42)),
            ('lgb', LGBMRegressor(n_estimators=500, learning_rate=0.05, random_state=42)),
            ('cat', CatBoostRegressor(iterations=500, learning_rate=0.05, verbose=0, random_state=42)),
        ],
        final_estimator=Ridge(alpha=alpha),
        passthrough=True,
        n_jobs=-1
    )
    model.fit(X_train_selected, y_train)
    preds = model.predict(X_train_selected)
    rmse = np.sqrt(np.mean((y_train - preds) ** 2))
    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)
best_alpha = study.best_params['alpha']

# -------- Step 7: Final Model --------
final_model = StackingRegressor(
    estimators=[
        ('xgb', XGBRegressor(n_estimators=500, learning_rate=0.05, verbosity=0, random_state=42)),
        ('lgb', LGBMRegressor(n_estimators=500, learning_rate=0.05, random_state=42)),
        ('cat', CatBoostRegressor(iterations=500, learning_rate=0.05, verbose=0, random_state=42)),
    ],
    final_estimator=Ridge(alpha=best_alpha),
    passthrough=True,
    n_jobs=-1
)
final_model.fit(X_train_selected, y_train)

# -------- Step 8: Predict and Save --------
y_pred_test = final_model.predict(X_test_selected)
output_df = pd.DataFrame({
    'id': test_df['id'],
    'efficiency': y_pred_test
})
output_df.to_csv("final_predictions.csv", index=False)
print("✅ Ultra-optimized pipeline complete. Predictions saved to final_predictions.csv")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006535 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9221
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 50
[LightGBM] [Info] Start training from score 0.510260


[I 2025-06-25 19:13:40,586] A new study created in memory with name: no-name-f888e924-4b05-4522-94f7-d29fec539ff5
[I 2025-06-25 19:14:20,033] Trial 0 finished with value: 0.10151686121516494 and parameters: {'alpha': 3.8724086751382454}. Best is trial 0 with value: 0.10151686121516494.
[I 2025-06-25 19:14:58,270] Trial 1 finished with value: 0.10146375715685002 and parameters: {'alpha': 1.433045871631419}. Best is trial 1 with value: 0.10146375715685002.
[I 2025-06-25 19:15:35,394] Trial 2 finished with value: 0.10155181346517078 and parameters: {'alpha': 5.763086634012815}. Best is trial 1 with value: 0.10146375715685002.
[I 2025-06-25 19:16:14,671] Trial 3 finished with value: 0.10155651364396334 and parameters: {'alpha': 6.036615627937549}. Best is trial 1 with value: 0.10146375715685002.
[I 2025-06-25 19:16:51,055] Trial 4 finished with value: 0.1015808779556222 and parameters: {'alpha': 7.5340993175945385}. Best is trial 1 with value: 0.10146375715685002.
[I 2025-06-25 19:17:25,92

✅ Ultra-optimized pipeline complete. Predictions saved to final_predictions.csv
