In [1]:
# Colab-only step: `files.upload()` prompts for files interactively; the cell
# output below shows train.csv and test.csv being saved under those names.
# NOTE(review): this cell needs manual interaction, so the notebook cannot be
# re-run unattended; `uploaded` is not referenced again in the visible cells.
from google.colab import files
uploaded = files.upload()


Saving test.csv to test.csv
Saving train.csv to train.csv


In [2]:
!pip install -q catboost shap

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
import numpy as np
import pandas as pd
import shap
from catboost import CatBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# === Load Data ===
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Column layout of train.csv: the first 55 columns are the model inputs,
# every column after that is a prediction target.
train_columns = df_train.columns.tolist()
features, targets = train_columns[:55], train_columns[55:]

X_train_raw, Y_train = df_train[features], df_train[targets]
# Only the first 500 rows of the test file are scored.
X_test_raw = df_test[features].iloc[:500]

# === Impute + Scale ===
# Both transformers are fitted on the training rows only and then re-applied
# unchanged to the test rows (imputation first, scaling second).
imp = SimpleImputer()
sc = StandardScaler()
X_train = sc.fit_transform(imp.fit_transform(X_train_raw))
X_test = sc.transform(imp.transform(X_test_raw))

# DataFrame views of the scaled arrays so columns can be selected by name.
df_train_scaled = pd.DataFrame(data=X_train, columns=features)
df_test_scaled = pd.DataFrame(data=X_test, columns=features)

# === Tuned Parameters per BlendProperty ===
# One row per target, in order BlendProperty1 .. BlendProperty10; the columns
# of each row follow _PARAM_NAMES.
_PARAM_NAMES = ('depth', 'learning_rate', 'iterations', 'l2_leaf_reg')
_PARAM_ROWS = [
    (6, 0.05, 2700, 4),   # BlendProperty1
    (5, 0.06, 2600, 3),   # BlendProperty2
    (6, 0.045, 3000, 5),  # BlendProperty3
    (4, 0.05, 2500, 4),   # BlendProperty4
    (7, 0.035, 2800, 6),  # BlendProperty5
    (6, 0.04, 2900, 5),   # BlendProperty6
    (5, 0.06, 2500, 3),   # BlendProperty7
    (6, 0.045, 2700, 4),  # BlendProperty8
    (5, 0.05, 2600, 4),   # BlendProperty9
    (6, 0.04, 3000, 5),   # BlendProperty10
]
param_grid = {
    f'BlendProperty{number}': dict(zip(_PARAM_NAMES, row))
    for number, row in enumerate(_PARAM_ROWS, start=1)
}


# === Ensemble configuration ===
BASE_SEED = 42                  # seed of the feature-selection model and of the ANN
SEED_OFFSETS = (0, 11, 27)      # one CatBoost ensemble member per offset (BASE_SEED + offset)
IMPORTANCE_THRESHOLD = 0.1      # features at or below this importance are dropped
CATBOOST_WEIGHT = 0.8           # blend weight of the averaged CatBoost predictions
ANN_WEIGHT = 0.2                # blend weight of the ANN predictions


def _make_catboost(params, seed):
    """Build an unfitted CatBoostRegressor from one `param_grid` entry.

    Args:
        params: dict providing 'iterations', 'learning_rate', 'depth' and
            'l2_leaf_reg'.
        seed: value passed to CatBoost as `random_seed`.

    Returns:
        A CatBoostRegressor with RMSE loss and logging disabled.
    """
    return CatBoostRegressor(
        iterations=params['iterations'],
        learning_rate=params['learning_rate'],
        depth=params['depth'],
        l2_leaf_reg=params['l2_leaf_reg'],
        bagging_temperature=0.5,
        random_seed=seed,
        loss_function='RMSE',
        # NOTE(review): no eval_set is passed to fit() anywhere in this cell;
        # the overfitting detector presumably has nothing to monitor, so
        # od_type/od_wait likely have no effect here - confirm against the
        # CatBoost documentation.
        od_type="Iter",
        od_wait=50,
        verbose=0
    )


def _build_ann(n_inputs):
    """Build and compile the dense regression network used in the blend.

    Args:
        n_inputs: number of input features (columns of the training matrix).

    Returns:
        A compiled Sequential model with a single linear output, trained
        with Adam (learning rate 0.001) on mean squared error.
    """
    network = Sequential([
        Input(shape=(n_inputs,)),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(30, activation='relu'),
        Dense(1)
    ])
    network.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return network


final_preds = []   # one test-prediction array per target, in `targets` order
metrics_log = []   # (target name, R2, MAPE) tuples, in `targets` order

for col in targets:
    print(f"\n Processing {col}")
    y_col = Y_train[col].values
    p = param_grid[col]

    # === Feature Selection using a base CatBoost model ===
    # Fit once on all features and keep those whose importance exceeds the
    # threshold. The model is fitted on a plain array, so 'Feature Id' holds
    # column positions, which are mapped back to names via `features`.
    base_model = _make_catboost(p, BASE_SEED)
    base_model.fit(X_train, y_col)
    importance_df = base_model.get_feature_importance(prettified=True)
    is_important = importance_df['Importances'] > IMPORTANCE_THRESHOLD
    important_feature_ids = importance_df[is_important]['Feature Id'].astype(int).tolist()
    important_feature_names = [features[i] for i in important_feature_ids]

    if not important_feature_names:
        print(f"‚ö† No important features for {col}. Using all features.")
        important_feature_names = features

    X_train_sub = df_train_scaled[important_feature_names].values
    X_test_sub = df_test_scaled[important_feature_names].values

    # === Train 3 CatBoost models (same hyper-parameters, different seeds)
    cb_preds_test_all = []
    cb_preds_train_all = []

    for seed_offset in SEED_OFFSETS:
        model = _make_catboost(p, BASE_SEED + seed_offset)
        model.fit(X_train_sub, y_col)
        cb_preds_test_all.append(model.predict(X_test_sub))
        cb_preds_train_all.append(model.predict(X_train_sub))

    # === Train ANN on same features
    # Seed Python, NumPy and the Keras backend so weight initialisation,
    # dropout and batch shuffling are repeatable from run to run.
    set_random_seed(BASE_SEED)
    ann = _build_ann(X_train_sub.shape[1])
    es = EarlyStopping(patience=25, restore_best_weights=True, verbose=0)

    ann.fit(X_train_sub, y_col, validation_split=0.1, epochs=400, batch_size=64, verbose=0, callbacks=[es])

    # verbose=0 keeps the Keras progress bars out of the cell output.
    ann_pred_test = ann.predict(X_test_sub, verbose=0).flatten()
    ann_pred_train = ann.predict(X_train_sub, verbose=0).flatten()

    # === Final Ensemble: average the CatBoost members, then blend with the ANN
    cb_mean_test = np.mean(cb_preds_test_all, axis=0)
    cb_mean_train = np.mean(cb_preds_train_all, axis=0)

    final_pred_test = CATBOOST_WEIGHT * cb_mean_test + ANN_WEIGHT * ann_pred_test
    final_pred_train = CATBOOST_WEIGHT * cb_mean_train + ANN_WEIGHT * ann_pred_train

    final_preds.append(final_pred_test)

    # === Evaluation
    # NOTE(review): both metrics compare predictions for the training rows
    # with the training targets the CatBoost models were fitted on, so they
    # describe fit to the training data, not performance on unseen data.
    r2 = r2_score(y_col, final_pred_train)
    mape = mean_absolute_percentage_error(y_col, final_pred_train)
    metrics_log.append((col, r2, mape))
    print(f"{col} R¬≤: {r2:.4f}, MAPE: {mape:.4f}")

# === Save Final Predictions ===
# `final_preds` holds one prediction array per target; stacking and
# transposing gives one row per test sample and one column per target.
pred_df = pd.DataFrame(np.array(final_preds).T, columns=targets)
# IDs are 1-based row numbers. The count comes from the frame itself, so the
# insert cannot fail with a length mismatch when there are not exactly 500 rows.
pred_df.insert(0, "ID", range(1, len(pred_df) + 1))
pred_df.to_csv("catboost_ann_ensemble_predictions.csv", index=False)
print("\nüìÅ Saved: catboost_ann_ensemble_predictions.csv")

# === Summary ===
# Re-print the per-target metrics collected in `metrics_log` in one place.
print("\n Final Scores:")
for name, r2, mape in metrics_log:
    print(f"{name} - R¬≤: {r2:.4f}, MAPE: {mape:.4f}")


Processing BlendProperty1
[1m16/16[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 7ms/step
[1m63/63[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 2ms/step
BlendProperty1  R¬≤: 0.9998, MAPE: 0.0650

Processing BlendProperty2
[1m16/16[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 7ms/step
[1m63/63[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 1ms/step
BlendProperty2  R¬≤: 0.9998, MAPE: 0.0660

Processing BlendProperty3
[1m16/16[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 8ms/step
[1m63/63[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 2ms/step
BlendProperty3 R¬≤: 0.9996, MAPE: 0.1298

 Processing BlendProperty4
[1m16/16[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m