In [None]:
# ---------------------------------------------
# MERCEDES BENZ TEST BENCH REDUCTION PROJECT
# ---------------------------------------------

import numpy as np  # Import numpy for sqrt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor

# Read the training and test tables from the working directory
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Separate the regression target "y" from the predictor columns
y = train["y"]
X = train.drop(columns=["y"])

# Report the total number of missing cells in each feature table
train_null_total = X.isna().sum().sum()
test_null_total = test.isna().sum().sum()
print("Nulls in Train:\n", train_null_total)
print("Nulls in Test:\n", test_null_total)

# Stack train and test rows so every categorical column gets one shared
# label -> integer mapping across both sets.
n_train = len(X)  # captured once; X is rebound below
combined = pd.concat([X, test], axis=0)

# Label-encode every text column. Testing only `dtype == "object"` skips
# columns stored with pandas' dedicated string dtypes (e.g. the `str` dtype
# newer pandas versions infer for text), which would leave raw strings in the
# feature matrix and make the later numeric steps fail.
for col in combined.columns:
    column = combined[col]
    is_text = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if is_text:
        lbl = LabelEncoder()
        # astype(str) gives the encoder a uniform type to sort and map
        combined[col] = lbl.fit_transform(column.astype(str))

# Split back by position (train rows first, then test rows). Explicit copies
# avoid SettingWithCopyWarning when the frames are modified later.
X = combined.iloc[:n_train, :].copy()
test = combined.iloc[n_train:, :].copy()

# Drop features that take a single distinct value across the training rows;
# the same columns are removed from the test table to keep both aligned.
distinct_counts = X.nunique()
zero_var_cols = distinct_counts[distinct_counts == 1].index.tolist()
X = X.drop(columns=zero_var_cols)
test = test.drop(columns=zero_var_cols)

print(f"Removed {len(zero_var_cols)} zero-variance features.")

# PCA for Dimensionality Reduction
#
# PCA ranks directions by raw variance, so a single wide-ranging column can
# absorb almost all of it (the recorded run collapsed 365 features into one
# component). Two things are done before fitting:
#   1. The row-identifier column is left out of the PCA input; it is an index,
#      not a measurement. Assumes the identifier is named "ID" -- confirm
#      against train.csv. errors="ignore" makes this a no-op if it is absent.
#   2. Every remaining feature is standardized so that integer-coded
#      categoricals do not outweigh the 0/1 flags purely because of scale.
# X and test themselves are not modified, so later code can still read them.
X_features = X.drop(columns=["ID"], errors="ignore")
test_features = test.drop(columns=["ID"], errors="ignore")

# Fit the scaler on training rows only, then reuse its statistics for test
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_features)
test_scaled = scaler.transform(test_features)

pca = PCA(n_components=0.95)   # retain 95% variance
X_pca = pca.fit_transform(X_scaled)
test_pca = pca.transform(test_scaled)

print("Reduced from", X_features.shape[1], "to", X_pca.shape[1], "features using PCA.")

# Hold out 20% of the PCA-transformed training rows for validation
# (fixed seed so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=42,
)

# XGBoost regressor; hyperparameters gathered in one mapping for readability
xgb_params = {
    "n_estimators": 350,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
}
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows: RMSE is the square root of
# the mean squared error between true and predicted targets.
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"Validation RMSE: {rmse}")

# Run the fitted model over the PCA-transformed test rows
test_predictions = model.predict(test_pca)

# Write the predictions as a one-column CSV (header "y", no index column)
output = pd.Series(test_predictions, name="y").to_frame()
output.to_csv(path_or_buf="predictions.csv", index=False)
print("✅ Predictions saved to predictions.csv")

Nulls in Train:
 0
Nulls in Test:
 0
Removed 12 zero-variance features.
Reduced from 365 to 1 features using PCA.
Validation RMSE: 13.057165516354381
✅ Predictions saved to predictions.csv
