In [1]:
# ============================================
# 📌 Project 1: Mercedes-Benz Greener Manufacturing
# Goal: Reduce the time cars spend on the test bench
# ============================================


In [2]:
import io
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor



In [4]:
# Colab-only helper: opens a browser file picker and returns the chosen files
# as a {filename: raw bytes} mapping. Kept in this cell (not with the other
# imports) because the module only exists inside Google Colab.
from google.colab import files

uploaded = files.upload()

# Echo each uploaded file with its size so a wrong or truncated upload is
# noticed before any parsing happens.
for filename, payload in uploaded.items():
    print(f'User uploaded file "{filename}" with length {len(payload)} bytes')

Saving test.zip to test.zip
Saving train.zip to train.zip
User uploaded file "test.zip" with length 225217 bytes
User uploaded file "train.zip" with length 245705 bytes


In [6]:
def read_csv_from_zip(zip_bytes: bytes, member: str) -> pd.DataFrame:
    """Read one CSV member out of an in-memory zip archive.

    Parameters
    ----------
    zip_bytes : bytes
        Raw content of the zip archive (e.g. one value of the Colab
        ``uploaded`` mapping).
    member : str
        Name of the CSV file inside the archive.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV.

    Raises
    ------
    KeyError
        If ``member`` is not present in the archive.
    zipfile.BadZipFile
        If ``zip_bytes`` is not a valid zip archive.
    """
    # Both the archive and the member handle are context-managed so neither
    # is left open after the frame has been parsed.
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        with archive.open(member) as handle:
            return pd.read_csv(handle)


# Parse the two uploaded archives straight from memory (nothing is extracted
# to disk).
train_df = read_csv_from_zip(uploaded['train.zip'], 'train.csv')
test_df = read_csv_from_zip(uploaded['test.zip'], 'test.csv')

print("Shape of train_df:", train_df.shape)
print("Shape of test_df:", test_df.shape)

Shape of train_df: (4209, 378)
Shape of test_df: (4209, 377)


In [8]:
# Separate the regression target from the predictors. 'ID' is left out of the
# feature matrix together with the target column 'y'.
y = train_df['y']
X = train_df.drop(columns=['ID', 'y'])

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (4209, 376)
Shape of y: (4209,)


In [11]:
# --- Label-encode categorical features, then reduce dimensions with PCA ----
N_PCA_COMPONENTS = 50  # number of principal components kept; tune as needed
PCA_SEED = 42          # pins PCA's randomized SVD solver (when selected) so reruns match

# Encode every string-valued column. The encoder is fitted on the training
# data only; the test data reuses the training codes.
for col in X.select_dtypes(include="object").columns:
    # Hidden-state guard: if X was rebuilt but test_df still holds the integer
    # codes from an earlier run of this cell, re-encoding would silently map
    # every test row to the "unknown" code.
    if pd.api.types.is_numeric_dtype(test_df[col]):
        raise RuntimeError(
            f"test_df[{col!r}] is already numeric; reload test_df before "
            "re-running the encoding cell."
        )

    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

    # Explicit category -> code table mirroring the fitted encoder. Categories
    # seen only in the test set (and missing values) all receive one extra
    # code placed right after the last training code. This avoids overwriting
    # the encoder's fitted `classes_` and replaces the per-row membership scan
    # with a single vectorised lookup.
    code_of = {category: code for code, category in enumerate(le.classes_)}
    unknown_code = len(le.classes_)
    test_df[col] = test_df[col].map(code_of).fillna(unknown_code).astype("int64")


# NOTE(review): the features are not scaled before PCA, and the recorded output
# shows the first five components explaining ~94% of the variance. Presumably
# the wide-ranged label codes dominate the binary columns; consider scaling or
# one-hot encoding -- TODO confirm.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_SEED)
X_pca = pca.fit_transform(X)

# Select the test features by the training column list (this also leaves out
# 'ID'), so the projection always sees the same columns in the same order as
# the fit and a missing column fails loudly here.
test_df_processed = test_df[X.columns]
test_pca = pca.transform(test_df_processed)

print("Explained variance ratio (first 10):", pca.explained_variance_ratio_[:10])

Explained variance ratio (first 10): [0.38334782 0.21388033 0.13261866 0.11826642 0.09206008 0.01590604
 0.0074454  0.00433701 0.00294021 0.00241796]


In [12]:
# Gradient-boosted tree regressor trained on the PCA components.
# Hyper-parameters are gathered in one mapping so they can be read and tuned
# in a single place.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,         # fraction of rows sampled per tree
    colsample_bytree=0.8,  # fraction of columns sampled per tree
    random_state=42,
)
xgb_model = XGBRegressor(**xgb_params)

# Fit on every training row; the fitted estimator is also the cell's output.
xgb_model.fit(X_pca, y)


In [14]:
# Predict the target for the PCA-projected test rows.
y_pred = xgb_model.predict(test_pca)

# Build the submission table: the test IDs next to their predictions,
# written without the index column.
submission = test_df[['ID']].assign(y=y_pred)
submission.to_csv("submission.csv", index=False)

print("✅ Submission file created: submission.csv")
submission.head()

✅ Submission file created: submission.csv


Unnamed: 0,ID,y
0,1,77.625755
1,2,93.909332
2,3,75.374985
3,4,78.367088
4,5,119.996651


In [15]:
# --- Hold-out validation -----------------------------------------------------
# 80/20 split of the PCA-transformed training data with a fixed seed.
# NOTE(review): `pca` was fitted on all training rows, validation rows
# included, so this estimate may be slightly optimistic -- TODO confirm whether
# PCA should be refitted on the training split only.
X_train, X_val, y_train, y_val = train_test_split(
    X_pca, y, test_size=0.2, random_state=42
)

# Fit an unfitted copy with the same hyper-parameters. Refitting `xgb_model`
# itself would replace the full-data model with one trained on 80% of the
# rows, and re-running the submission cell afterwards would silently use it.
val_model = clone(xgb_model)
val_model.fit(X_train, y_train)
y_val_pred = val_model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Validation RMSE:", rmse)


Validation RMSE: 9.253336077412452
