In [None]:
import pandas as pd
import numpy as np
import optuna
import lightgbm as lgb
import catboost as cb
import joblib
import os
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_percentage_error
import warnings
warnings.filterwarnings('ignore')

In [None]:
# --- 1. SETUP AND DATA LOADING ---
TARGET_TO_SOLVE = 'BlendProperty4'
MODEL_SAVE_DIR = f"saved_models_{TARGET_TO_SOLVE}"

data = pd.read_csv("train.csv")

# (The outlier removal and basic feature engineering code remains here)
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower_bound, upper_bound)
    return df
for column in data.columns: data = remove_outliers(data, column)

print("Creating base features...")
statistical_features = pd.DataFrame(index=data.index)
for j in range(1, 11):
    prop_cols = [f"Component{i}_Property{j}" for i in range(1, 6)]
    statistical_features[f'Prop{j}_mean'] = data[prop_cols].mean(axis=1)
    statistical_features[f'Prop{j}_std'] = data[prop_cols].std(axis=1)
    
weighted_data = {}
base_features_cols = [col for col in data.columns if 'Property' in col or 'fraction' in col]
for i in range(1, 6):
    for j in range(1, 11):
        prop_col = f"Component{i}_Property{j}"
        frac_col = f"Component{i}_fraction"
        weighted_col = f"Weighted_Component{i}_Property{j}"
        weighted_data[weighted_col] = data[prop_col] * data[frac_col]
base_features = pd.concat([data[base_features_cols], pd.DataFrame(weighted_data), statistical_features], axis=1).copy()

In [None]:
# --- 2. ADVANCED FEATURE ENGINEERING ---
print(f"Creating advanced interaction features for {TARGET_TO_SOLVE}...")
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(base_features)
X_scaled_df = pd.DataFrame(X_scaled, columns=base_features.columns)

y_target = data[[TARGET_TO_SOLVE]]
pt = PowerTransformer(method='yeo-johnson')
y_transformed = pt.fit_transform(y_target)

rf_temp = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
rf_temp.fit(X_scaled_df, y_transformed.ravel())
importances = pd.Series(rf_temp.feature_importances_, index=X_scaled_df.columns)
top_10_base_features = importances.nlargest(10).index

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_features = poly.fit_transform(X_scaled_df[top_10_base_features])
poly_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(top_10_base_features))

X_advanced = pd.concat([X_scaled_df, poly_df], axis=1)
X_advanced = X_advanced.loc[:,~X_advanced.columns.duplicated()].copy()


In [None]:
# --- 3. ADVANCED FEATURE SELECTION (RFECV) ---
print("Finding optimal number of features with RFECV...")
estimator_rfe = lgb.LGBMRegressor(random_state=42, n_jobs=-1)
selector = RFECV(estimator_rfe, step=1, cv=KFold(3), scoring='neg_root_mean_squared_error', n_jobs=-1)
selector.fit(X_advanced, y_transformed.ravel())
print(f"RFECV selected {selector.n_features_} features as optimal.")
X_final = selector.transform(X_advanced)

In [None]:

# --- 4. TRAIN / TEST SPLIT (used for all models) ---
X_train, X_test, y_train, y_test = train_test_split(X_final, y_transformed, test_size=0.2, random_state=42)

In [None]:



# --- 5. MODEL COMPARISON ---

# ## Model 1: Tuned Ridge Regression ##
# --------------------------------------------------------------------------------
print("\n--- Training Model 1: Ridge Regression ---")
param_grid_ridge = {'alpha': [1.0, 10.0, 50.0, 100.0, 200.0, 500.0]}
grid_search_ridge = GridSearchCV(Ridge(), param_grid_ridge, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search_ridge.fit(X_train, y_train.ravel())
best_ridge = grid_search_ridge.best_estimator_
y_pred_ridge = best_ridge.predict(X_test)
y_pred_ridge_orig = pt.inverse_transform(y_pred_ridge.reshape(-1, 1))
y_test_orig = pt.inverse_transform(y_test)
mape_ridge = mean_absolute_percentage_error(y_test_orig, y_pred_ridge_orig) * 100
print(f"Best Ridge Alpha: {grid_search_ridge.best_params_['alpha']}")
print(f"Final Test MAPE for Ridge: {mape_ridge:.2f}%")

# ## Model 2: Tuned LightGBM (with Optuna) ##
# --------------------------------------------------------------------------------
print("\n--- Training Model 2: LightGBM ---")
def objective_lgbm(trial):
    params = { 'objective': 'regression_l1', 'metric': 'rmse', 'n_estimators': 1000,
               'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
               'num_leaves': trial.suggest_int('num_leaves', 20, 300),
               'max_depth': trial.suggest_int('max_depth', 3, 12),
               'subsample': trial.suggest_float('subsample', 0.6, 1.0),
               'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
               'random_state': 42, 'n_jobs': 4 }
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train.ravel(), eval_set=[(X_test, y_test.ravel())], callbacks=[lgb.early_stopping(100, verbose=False)])
    preds = model.predict(X_test)
    return np.sqrt(np.mean((y_test - preds.reshape(-1,1))**2))

study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(objective_lgbm, n_trials=30, n_jobs=4)
best_params_lgbm = study_lgbm.best_params
final_lgbm = lgb.LGBMRegressor(objective='regression_l1', random_state=42, n_jobs=-1, **best_params_lgbm)
final_lgbm.fit(X_train, y_train.ravel())
y_pred_lgbm = final_lgbm.predict(X_test)
y_pred_lgbm_orig = pt.inverse_transform(y_pred_lgbm.reshape(-1, 1))
mape_lgbm = mean_absolute_percentage_error(y_test_orig, y_pred_lgbm_orig) * 100
print(f"Best LGBM Params: {best_params_lgbm}")
print(f"Final Test MAPE for LightGBM: {mape_lgbm:.2f}%")

# ## Model 3: Tuned CatBoost (with Optuna) ##
# --------------------------------------------------------------------------------
print("\n--- Training Model 3: CatBoost ---")
def objective_catboost(trial):
    params = { 'objective': 'MAE', 'eval_metric': 'RMSE', 'iterations': 1000,
               'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
               'depth': trial.suggest_int('depth', 4, 10),
               'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
               'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True),
               'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
               'border_count': trial.suggest_int('border_count', 32, 255),
               'random_state': 42, 'verbose': 0, 'thread_count': 4 }
    model = cb.CatBoostRegressor(**params)
    model.fit(X_train, y_train.ravel(), eval_set=[(X_test, y_test.ravel())], early_stopping_rounds=100, verbose=0)
    preds = model.predict(X_test)
    return np.sqrt(np.mean((y_test - preds.reshape(-1,1))**2))

study_catboost = optuna.create_study(direction='minimize')
study_catboost.optimize(objective_catboost, n_trials=30, n_jobs=4)
best_params_catboost = study_catboost.best_params
final_catboost = cb.CatBoostRegressor(objective='MAE', random_state=42, thread_count=-1, **best_params_catboost)
final_catboost.fit(X_train, y_train.ravel(), verbose=0)
y_pred_catboost = final_catboost.predict(X_test)
y_pred_catboost_orig = pt.inverse_transform(y_pred_catboost.reshape(-1, 1))
mape_catboost = mean_absolute_percentage_error(y_test_orig, y_pred_catboost_orig) * 100
print(f"Best CatBoost Params: {best_params_catboost}")
print(f"Final Test MAPE for CatBoost: {mape_catboost:.2f}%")

Creating base features...
Creating advanced interaction features for BlendProperty4...
Finding optimal number of features with RFECV...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008882 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 44852
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 180
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009686 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 44800
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 179
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [In

[I 2025-07-19 01:35:45,177] A new study created in memory with name: no-name-9e83b9a8-8be3-45f2-a534-4402868a5e3a


Best Ridge Alpha: 1.0
Final Test MAPE for Ridge: 29.28%

--- Training Model 2: LightGBM ---


[I 2025-07-19 01:35:46,497] Trial 0 finished with value: 0.044838768594467344 and parameters: {'learning_rate': 0.02739216024356818, 'num_leaves': 242, 'max_depth': 5, 'subsample': 0.6854040338161675, 'colsample_bytree': 0.8244523427879662}. Best is trial 0 with value: 0.044838768594467344.
[I 2025-07-19 01:35:46,587] Trial 1 finished with value: 0.31252588004000914 and parameters: {'learning_rate': 0.0020563194321981974, 'num_leaves': 27, 'max_depth': 8, 'subsample': 0.6429106461839705, 'colsample_bytree': 0.5658400280213931}. Best is trial 0 with value: 0.044838768594467344.
[I 2025-07-19 01:35:46,921] Trial 3 finished with value: 0.041263108552252674 and parameters: {'learning_rate': 0.035580614918240445, 'num_leaves': 181, 'max_depth': 12, 'subsample': 0.7057963060651161, 'colsample_bytree': 0.87271335363851}. Best is trial 3 with value: 0.041263108552252674.
[I 2025-07-19 01:35:47,214] Trial 2 finished with value: 0.20802160450919788 and parameters: {'learning_rate': 0.00282361292

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 1
[LightGBM] [Info] Start training from score 0.017062
Best LGBM Params: {'learning_rate': 0.04641788831220958, 'num_leaves': 116, 'max_depth': 11, 'subsample': 0.7732171710988691, 'colsample_bytree': 0.8586909418857228}
Final Test MAPE for LightGBM: 5.79%

--- Training Model 3: CatBoost ---


[I 2025-07-19 01:35:57,516] Trial 3 finished with value: 0.08271357714302481 and parameters: {'learning_rate': 0.04054944050561109, 'depth': 5, 'l2_leaf_reg': 5.735374599977004, 'random_strength': 2.8530750843592068, 'bagging_temperature': 0.08388515680403053, 'border_count': 35}. Best is trial 3 with value: 0.08271357714302481.
[I 2025-07-19 01:35:57,744] Trial 1 finished with value: 0.35378029953720214 and parameters: {'learning_rate': 0.001674275903188653, 'depth': 5, 'l2_leaf_reg': 0.1610351780025505, 'random_strength': 0.0333521195376103, 'bagging_temperature': 0.14430083863730525, 'border_count': 56}. Best is trial 3 with value: 0.08271357714302481.
[I 2025-07-19 01:35:57,986] Trial 0 finished with value: 0.035724022728896854 and parameters: {'learning_rate': 0.04557169074470192, 'depth': 5, 'l2_leaf_reg': 0.2939002547511921, 'random_strength': 3.551519236369881, 'bagging_temperature': 0.963918731855977, 'border_count': 181}. Best is trial 0 with value: 0.035724022728896854.
[I 2

Best CatBoost Params: {'learning_rate': 0.02062429279315531, 'depth': 4, 'l2_leaf_reg': 2.500894478311855, 'random_strength': 0.08020037666065981, 'bagging_temperature': 0.8119487022093321, 'border_count': 249}
Final Test MAPE for CatBoost: 2.67%


In [17]:
import joblib
import os
MODEL_SAVE_DIR = f"saved_models_{TARGET_TO_SOLVE}"
if not os.path.exists(MODEL_SAVE_DIR):
    os.makedirs(MODEL_SAVE_DIR)

In [18]:
# --- 5. SAVE BEST MODEL AND ARTIFACTS ---
print(f"\n--- Saving artifacts for {TARGET_TO_SOLVE} ---")
models = {'Ridge': (mape_ridge, best_ridge), 
          'LightGBM': (mape_lgbm, final_lgbm), 
          'CatBoost': (mape_catboost, final_catboost)}
best_model_name = min(models, key=lambda k: models[k][0])
best_model_obj = models[best_model_name][1]
print(f"Best model is {best_model_name} with MAPE: {models[best_model_name][0]:.2f}%")

# Save all necessary objects to reproduce the pipeline
joblib.dump(feature_scaler, os.path.join(MODEL_SAVE_DIR, 'feature_scaler.joblib'))
joblib.dump(pt, os.path.join(MODEL_SAVE_DIR, 'power_transformer.joblib'))
joblib.dump(top_10_base_features, os.path.join(MODEL_SAVE_DIR, 'top_10_features.joblib'))
joblib.dump(poly, os.path.join(MODEL_SAVE_DIR, 'polynomial_transformer.joblib'))
joblib.dump(selector, os.path.join(MODEL_SAVE_DIR, 'rfecv_selector.joblib'))
model_filename = os.path.join(MODEL_SAVE_DIR, 'best_model.joblib')
if isinstance(best_model_obj, cb.CatBoostRegressor):
    model_filename = os.path.join(MODEL_SAVE_DIR, 'best_model.cbm')
    best_model_obj.save_model(model_filename)
else:
    joblib.dump(best_model_obj, model_filename)
print(f"All artifacts saved to {MODEL_SAVE_DIR}")


--- Saving artifacts for BlendProperty4 ---
Best model is CatBoost with MAPE: 2.67%
All artifacts saved to saved_models_BlendProperty4


In [None]:
# --- 6. PREDICTION ON NEW DATA ---
print(f"\n--- Generating predictions for {TARGET_TO_SOLVE} on {TEST_FILE} ---")
# Load test data
test_data = pd.read_csv("train.csv")
if 'Id' not in test_data.columns:
    test_data['Id'] = test_data.index

# Load saved artifacts
scaler = joblib.load(os.path.join(MODEL_SAVE_DIR, 'feature_scaler.joblib'))
pt = joblib.load(os.path.join(MODEL_SAVE_DIR, 'power_transformer.joblib'))
top_10 = joblib.load(os.path.join(MODEL_SAVE_DIR, 'top_10_features.joblib'))
poly = joblib.load(os.path.join(MODEL_SAVE_DIR, 'polynomial_transformer.joblib'))
selector = joblib.load(os.path.join(MODEL_SAVE_DIR, 'rfecv_selector.joblib'))
try:
    model = joblib.load(os.path.join(MODEL_SAVE_DIR, 'best_model.joblib'))
except:
    model = cb.CatBoostRegressor()
    model.load_model(os.path.join(MODEL_SAVE_DIR, 'best_model.cbm'))

# Apply the exact same transformation pipeline
base_features_test = pd.concat([test_data[base_features_cols], pd.DataFrame(weighted_data), statistical_features], axis=1).copy()
X_scaled_test = scaler.transform(base_features_test)
X_scaled_df_test = pd.DataFrame(X_scaled_test, columns=base_features.columns)

poly_features_test = poly.transform(X_scaled_df_test[top_10])
poly_df_test = pd.DataFrame(poly_features_test, columns=poly.get_feature_names_out(top_10))
X_advanced_test = pd.concat([X_scaled_df_test, poly_df_test], axis=1)
X_advanced_test = X_advanced_test.loc[:,~X_advanced_test.columns.duplicated()].copy()

X_final_test = selector.transform(X_advanced_test)

# Predict and inverse transform
predictions_transformed = model.predict(X_final_test)
final_predictions = pt.inverse_transform(predictions_transformed.reshape(-1, 1))

# Create final DataFrame
prediction_df = pd.DataFrame({
    'Id': test_data['Id'],
    TARGET_TO_SOLVE: final_predictions.flatten()
})

print("\n--- Prediction DataFrame ---")
print(prediction_df.head())

# You can save this DataFrame to a CSV
# prediction_df.to_csv(f"predictions_{TARGET_TO_SOLVE}.csv", index=False)