In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler, PowerTransformer



In [None]:
# Load the data
data = pd.read_csv("train.csv")

#
base_features = [col for col in data.columns if col not in [f"BlendProperty{i}" for i in range(1, 11)]]
targets = [f"BlendProperty{i}" for i in range(1, 11)]

In [None]:
# Outlier handling using IQR method
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower_bound, upper_bound)
    return df

for column in data.columns:
    data = remove_outliers(data, column)

In [None]:
# Creating  weighted columns
weighted_data = {}
for i in range(1, 6):
    for j in range(1, 11):
        prop_col = f"Component{i}_Property{j}"
        frac_col = f"Component{i}_fraction"
        weighted_col = f"Weighted_Component{i}_Property{j}"
        weighted_data[weighted_col] = data[prop_col] * data[frac_col]

# Combine
data_with_weighted = pd.concat([data[base_features], pd.DataFrame(weighted_data)], axis=1)
data_with_weighted = data_with_weighted.copy()  # De-fragmented copy

In [None]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_with_weighted)
scaled_df_features = pd.DataFrame(scaled_features, columns=data_with_weighted.columns)


pt = PowerTransformer(method='yeo-johnson')
transformed_targets_df = pd.DataFrame(pt.fit_transform(data[targets]), columns=targets, index=data.index)

# Initial feature selection using Random Forest feature importance
top_features = {}
for target in targets:
    X = scaled_df_features
    y = transformed_targets_df[target]
    rf_initial = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_initial.fit(X, y)
    feature_importance = pd.Series(rf_initial.feature_importances_, index=scaled_df_features.columns).sort_values(ascending=False)
    top_features[target] = feature_importance.head(20).index.tolist()

# GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20,]

}


In [1]:

# Train and evaluate
for target in targets:
    X = scaled_df_features[top_features[target]]
    y = transformed_targets_df[target]


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # GridSearchCV for hyperparameter tuning
    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Best model
    best_rf = grid_search.best_estimator_
    print(f"Best params for {target}: {grid_search.best_params_}")


    cv_mape = -cross_val_score(best_rf, X_train, y_train, cv=5, scoring='neg_mean_absolute_percentage_error').mean() * 100
    print(f"Cross-validated MAPE for {target} (transformed): {cv_mape:.2f}%")


    y_pred = best_rf.predict(X_test)

    y_test_full = pd.DataFrame(np.column_stack([y_test] + [np.zeros(len(y_test))]*(len(targets)-1)), columns=targets, index=X_test.index)
    y_pred_full = pd.DataFrame(np.column_stack([y_pred] + [np.zeros(len(y_pred))]*(len(targets)-1)), columns=targets, index=X_test.index)
    y_test_inv = pt.inverse_transform(y_test_full)[targets.index(target)]
    y_pred_inv = pt.inverse_transform(y_pred_full)[targets.index(target)]

    # Handle zero/near-zero values in MAPE calculation
    mask = y_test_inv != 0
    test_mape = np.mean(np.abs((y_test_inv[mask] - y_pred_inv[mask]) / y_test_inv[mask]) * 100) if mask.any() else 0.0
    print(f"Test MAPE for {target} (original scale): {test_mape:.2f}%")


Best params for BlendProperty1: {'max_depth': 20, 'n_estimators': 200}
Cross-validated MAPE for BlendProperty1 (transformed): 129.99%
Test MAPE for BlendProperty1 (original scale): 5.77%
Best params for BlendProperty2: {'max_depth': 20, 'n_estimators': 100}
Cross-validated MAPE for BlendProperty2 (transformed): 138.59%
Test MAPE for BlendProperty2 (original scale): 2.37%
Best params for BlendProperty3: {'max_depth': 20, 'n_estimators': 100}
Cross-validated MAPE for BlendProperty3 (transformed): 127.84%
Test MAPE for BlendProperty3 (original scale): 2.71%
Best params for BlendProperty4: {'max_depth': 20, 'n_estimators': 100}
Cross-validated MAPE for BlendProperty4 (transformed): 87.94%
Test MAPE for BlendProperty4 (original scale): 48.63%
Best params for BlendProperty5: {'max_depth': 10, 'n_estimators': 100}
Cross-validated MAPE for BlendProperty5 (transformed): 7.63%
Test MAPE for BlendProperty5 (original scale): 0.01%
Best params for BlendProperty6: {'max_depth': 20, 'n_estimators': 2