In [3]:
import os
import pandas as pd

# Construct the path
# The dataset folder defaults to the external drive used originally. Set the
# SHELLAI_DATA_DIR environment variable to point at a different folder so the
# notebook can run on another machine without editing this cell.
base_path = os.environ.get(
    "SHELLAI_DATA_DIR",
    os.path.join("/", "Volumes", "Extreme SSD", "ShellAi"),
)
train_path = os.path.join(base_path, "train.csv")
test_path = os.path.join(base_path, "test.csv")

# Load the CSV files
# train_df supplies both predictors and targets; test_df is what gets scored.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# STEP 0: Define target and feature columns
targets = [f"BlendProperty{i}" for i in range(1, 11)]
# Every non-target column is a predictor, except the row identifier: 'ID' is
# only carried into the submission file, so it must never be scaled or fed to
# a model if train.csv happens to contain it. (No-op when the column is absent.)
non_feature_cols = set(targets) | {"ID"}
base_features = [col for col in train_df.columns if col not in non_feature_cols]

# STEP 1: Scale training features
# The scaler is fitted on the training rows only and reused for the test set.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(train_df[base_features])
scaled_df_features = pd.DataFrame(scaled_features, columns=base_features, index=train_df.index)

# STEP 2: Apply PowerTransformer to training targets
# `pt` is kept so predictions can be mapped back to the original target scale.
pt = PowerTransformer(method='yeo-johnson')
transformed_targets_df = pd.DataFrame(
    pt.fit_transform(train_df[targets]),
    columns=targets,
    index=train_df.index
)

# STEP 3: Feature selection using RandomForest
# For each target, fit a forest on all scaled features and keep the names of
# the 20 columns ranked highest by the forest's `feature_importances_`.
top_features = {}
for target in targets:
    X = scaled_df_features
    y = transformed_targets_df[target]

    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X, y)

    # Label each importance with its column name, then rank descending.
    feature_importance = pd.Series(rf.feature_importances_, index=scaled_df_features.columns)
    feature_importance = feature_importance.sort_values(ascending=False)

    top_features[target] = feature_importance.index[:20].tolist()

# STEP 4: Prepare scaled test features
# Reuse the scaler fitted on the training data (transform only, no refit) and
# keep the test frame's own index and the training column order.
scaled_test_features = scaler.transform(test_df[base_features])
scaled_df_test = pd.DataFrame(
    data=scaled_test_features,
    index=test_df.index,
    columns=base_features,
)

# STEP 5: Train XGBoost and Predict
# One independent regressor per target, each restricted to that target's
# selected feature subset. Predictions stay in the power-transformed space
# and are collected in the same order as `targets`.
all_preds_transformed = []

for target in targets:
    print(f"🔹 Training XGBoost for: {target}")

    selected_cols = top_features[target]
    X_train = scaled_df_features[selected_cols]
    y_train = transformed_targets_df[target]
    X_test = scaled_df_test[selected_cols]

    model = XGBRegressor(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.05,
        random_state=42,
    )
    model.fit(X_train, y_train)

    y_pred_transformed = model.predict(X_test)
    all_preds_transformed.append(y_pred_transformed)

# STEP 6: Inverse transform all predictions together (with correct feature names)
# Stack the per-target prediction vectors side by side (one column per target,
# in `targets` order) and label the columns before undoing the power transform.
all_preds_transformed = np.column_stack(all_preds_transformed)
all_preds_df = pd.DataFrame(data=all_preds_transformed, columns=targets)
all_preds_original = pt.inverse_transform(all_preds_df)

# STEP 7: Prepare submission DataFrame
submission_3 = pd.DataFrame(
    data=all_preds_original,
    index=test_df.index,
    columns=targets,
)

# Put the identifier first when the test set provides one.
if 'ID' in test_df.columns:
    submission_3.insert(loc=0, column='ID', value=test_df['ID'].values)

# STEP 8: Save to CSV
# index=False: the row index is not part of the submission.
submission_3.to_csv("submission_3.csv", index=False)
print("✅ XGBoost submission_3.csv saved successfully!")

🔹 Training XGBoost for: BlendProperty1
🔹 Training XGBoost for: BlendProperty2
🔹 Training XGBoost for: BlendProperty3
🔹 Training XGBoost for: BlendProperty4
🔹 Training XGBoost for: BlendProperty5
🔹 Training XGBoost for: BlendProperty6
🔹 Training XGBoost for: BlendProperty7
🔹 Training XGBoost for: BlendProperty8
🔹 Training XGBoost for: BlendProperty9
🔹 Training XGBoost for: BlendProperty10
✅ XGBoost submission_3.csv saved successfully!
