In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer,QuantileTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score
from catboost import CatBoostRegressor,Pool

In [19]:
import os
import pandas as pd

# Locate the competition data.
# The directory defaults to the original external-drive location, but it can be
# redirected with the SHELLAI_DATA_DIR environment variable so the notebook
# also runs on machines where that drive is not mounted.
base_path = os.environ.get(
    "SHELLAI_DATA_DIR",
    os.path.join("/", "Volumes", "Extreme SSD", "ShellAi"),
)
train_path = os.path.join(base_path, "train.csv")
test_path = os.path.join(base_path, "test.csv")

# Fail early with an actionable message (same exception type pandas would raise).
for csv_path in (train_path, test_path):
    if not os.path.isfile(csv_path):
        raise FileNotFoundError(
            f"Data file not found: {csv_path}. "
            "Set SHELLAI_DATA_DIR to the folder containing train.csv and test.csv."
        )

# Load the CSV files
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [13]:
# Sanity check: compare the target names to be predicted with the columns that
# are actually present in the test file.
# `targets` is defined in this cell so it runs on a fresh kernel without relying
# on a later cell having been executed first.
targets = [f"BlendProperty{i}" for i in range(1, 11)]
print("Expected targets:", targets)
print("Actual test_df columns:", test_df.columns.tolist())

Expected targets: ['BlendProperty1', 'BlendProperty2', 'BlendProperty3', 'BlendProperty4', 'BlendProperty5', 'BlendProperty6', 'BlendProperty7', 'BlendProperty8', 'BlendProperty9', 'BlendProperty10']
Actual test_df columns: ['ID', 'Component1_fraction', 'Component2_fraction', 'Component3_fraction', 'Component4_fraction', 'Component5_fraction', 'Component1_Property1', 'Component2_Property1', 'Component3_Property1', 'Component4_Property1', 'Component5_Property1', 'Component1_Property2', 'Component2_Property2', 'Component3_Property2', 'Component4_Property2', 'Component5_Property2', 'Component1_Property3', 'Component2_Property3', 'Component3_Property3', 'Component4_Property3', 'Component5_Property3', 'Component1_Property4', 'Component2_Property4', 'Component3_Property4', 'Component4_Property4', 'Component5_Property4', 'Component1_Property5', 'Component2_Property5', 'Component3_Property5', 'Component4_Property5', 'Component5_Property5', 'Component1_Property6', 'Component2_Property6', 'Compo

In [8]:
# Build fraction-weighted component properties from the test set.
# Target names are defined once and reused to pick out the non-target columns.
targets = [f"BlendProperty{i}" for i in range(1, 11)]
base_features = [col for col in test_df.columns if col not in targets]

# One weighted column per (component, property) pair:
#   Weighted_Component{c}_Property{p} = Component{c}_Property{p} * Component{c}_fraction
weighted_data = {
    f"Weighted_Component{comp}_Property{prop}": (
        test_df[f"Component{comp}_Property{prop}"] * test_df[f"Component{comp}_fraction"]
    )
    for comp in range(1, 6)
    for prop in range(1, 11)
}

# Join the original non-target columns with the weighted ones; the trailing
# copy yields a de-fragmented frame.
weighted_frame = pd.DataFrame(weighted_data)
data_with_weighted = pd.concat([test_df[base_features], weighted_frame], axis=1).copy()

In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

# STEP 0: Configuration, target and feature columns
N_TOP_FEATURES = 20     # features kept per target after importance ranking
RF_N_ESTIMATORS = 100   # trees in the feature-ranking random forest
SEED = 42               # shared seed so reruns reproduce the same submission
ID_COLUMN = "ID"        # row identifier; copied to the submission, never a predictor

targets = [f"BlendProperty{i}" for i in range(1, 11)]
# Exclude the identifier (when the training frame has one) so it cannot be
# picked up as a feature.
base_features = [
    col for col in train_df.columns
    if col not in targets and col != ID_COLUMN
]

# Fail before the expensive training steps if the test set lacks a feature
# column (indexing test_df with the missing names would raise KeyError later anyway).
missing_in_test = [col for col in base_features if col not in test_df.columns]
if missing_in_test:
    raise KeyError(f"test_df is missing feature columns: {missing_in_test}")

# STEP 1: Scale training features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(train_df[base_features])
scaled_df_features = pd.DataFrame(scaled_features, columns=base_features, index=train_df.index)

# STEP 2: Apply PowerTransformer to training targets
pt = PowerTransformer(method='yeo-johnson')
transformed_targets_df = pd.DataFrame(
    pt.fit_transform(train_df[targets]),
    columns=targets,
    index=train_df.index
)


# STEP 3: Feature selection using Random Forest
def select_top_features(X, y, n_top=N_TOP_FEATURES):
    """Return the names of the `n_top` most important columns of `X` for `y`.

    A RandomForestRegressor is fitted on (X, y) and the columns are ranked by
    its `feature_importances_`, highest first.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names are what gets returned.
    y : pandas.Series
        Target values aligned with the rows of `X`.
    n_top : int
        Number of column names to keep.

    Returns
    -------
    list of str
        Column names ordered by descending importance.
    """
    forest = RandomForestRegressor(n_estimators=RF_N_ESTIMATORS, random_state=SEED)
    forest.fit(X, y)
    importance = pd.Series(forest.feature_importances_, index=X.columns)
    return importance.sort_values(ascending=False).head(n_top).index.tolist()


top_features = {
    target: select_top_features(scaled_df_features, transformed_targets_df[target])
    for target in targets
}

# STEP 4: Prepare scaled test features (reuse the scaler fitted on the training data)
scaled_test_features = scaler.transform(test_df[base_features])
scaled_df_test = pd.DataFrame(scaled_test_features, columns=base_features, index=test_df.index)

# STEP 5: Train CatBoost and Predict (one independent model per target)
all_preds_transformed = []

for target in targets:
    print(f"🔹 Training and predicting for: {target}")

    X_train = scaled_df_features[top_features[target]]
    y_train = transformed_targets_df[target]
    X_test = scaled_df_test[top_features[target]]

    model = CatBoostRegressor(verbose=0, random_state=SEED)
    model.fit(X_train, y_train)

    y_pred_transformed = model.predict(X_test)
    all_preds_transformed.append(y_pred_transformed)

# STEP 6: Inverse transform all predictions together
# The transformer was fitted on all targets at once, so the columns must be
# stacked in the same order as `targets` before inverting.
all_preds_transformed = np.column_stack(all_preds_transformed)  # Shape: (n_samples, 10)
all_preds_original = pt.inverse_transform(all_preds_transformed)

# The Yeo-Johnson inverse may yield NaN for values outside the range it can map
# back; report such cells explicitly instead of letting them reach the CSV unnoticed.
non_finite_counts = (~np.isfinite(all_preds_original)).sum(axis=0)
if non_finite_counts.any():
    non_finite_by_target = {
        name: int(count) for name, count in zip(targets, non_finite_counts) if count
    }
    print(f"⚠️ Non-finite predictions after inverse transform: {non_finite_by_target}")

# STEP 7: Prepare submission DataFrame
submission = pd.DataFrame(all_preds_original, columns=targets, index=test_df.index)

# Include ID column if available
if ID_COLUMN in test_df.columns:
    submission.insert(0, ID_COLUMN, test_df[ID_COLUMN].values)

# STEP 8: Save to CSV
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv saved successfully!")

🔹 Training and predicting for: BlendProperty1
🔹 Training and predicting for: BlendProperty2
🔹 Training and predicting for: BlendProperty3
🔹 Training and predicting for: BlendProperty4
🔹 Training and predicting for: BlendProperty5
🔹 Training and predicting for: BlendProperty6
🔹 Training and predicting for: BlendProperty7
🔹 Training and predicting for: BlendProperty8
🔹 Training and predicting for: BlendProperty9
🔹 Training and predicting for: BlendProperty10
✅ submission.csv saved successfully!
