# ==============================================
# MERCEDES-BENZ GREENER MANUFACTURING PROJECT
# ==============================================


In [90]:
# -----------------------------------------
# Step 1: Import Required Libraries
# -----------------------------------------

import numpy as np
import pandas as pd



In [91]:
# -----------------------------------------
# Step 2: Load Train and Test Data
# -----------------------------------------

train = pd.read_csv("train.csv")

print("Train shape:", train.shape)

print("-"*100)
print(train.head())
print("-"*100)
train.info()
print("-"*100)
print(train.describe())
print("-"*100)



Train shape: (4209, 378)
----------------------------------------------------------------------------------------------------
   ID       y  X0 X1  X2 X3 X4 X5 X6 X8  ...  X375  X376  X377  X378  X379  \
0   0  130.81   k  v  at  a  d  u  j  o  ...     0     0     1     0     0   
1   6   88.53   k  t  av  e  d  y  l  o  ...     1     0     0     0     0   
2   7   76.26  az  w   n  c  d  x  j  x  ...     0     0     0     0     0   
3   9   80.62  az  t   n  f  d  x  l  e  ...     0     0     0     0     0   
4  13   78.02  az  v   n  f  d  h  d  n  ...     0     0     0     0     0   

   X380  X382  X383  X384  X385  
0     0     0     0     0     0  
1     0     0     0     0     0  
2     0     1     0     0     0  
3     0     0     0     0     0  
4     0     0     0     0     0  

[5 rows x 378 columns]
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Co

In [92]:
test = pd.read_csv("test.csv")
print("Test shape:", test.shape)
print("-"*100)
print(test.head())
print("-"*100)
test.info()
print("-"*100)
print(test.describe())
print("-"*100)

Test shape: (4209, 377)
----------------------------------------------------------------------------------------------------
   ID  X0 X1  X2 X3 X4 X5 X6 X8  X10  ...  X375  X376  X377  X378  X379  X380  \
0   1  az  v   n  f  d  t  a  w    0  ...     0     0     0     1     0     0   
1   2   t  b  ai  a  d  b  g  y    0  ...     0     0     1     0     0     0   
2   3  az  v  as  f  d  a  j  j    0  ...     0     0     0     1     0     0   
3   4  az  l   n  f  d  z  l  n    0  ...     0     0     0     1     0     0   
4   5   w  s  as  c  d  y  i  m    0  ...     1     0     0     0     0     0   

   X382  X383  X384  X385  
0     0     0     0     0  
1     0     0     0     0  
2     0     0     0     0  
3     0     0     0     0  
4     0     0     0     0  

[5 rows x 377 columns]
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 377 entries,

In [93]:
# -----------------------------------------
# Step 3: Data Understanding
# -----------------------------------------
target_col ='y'
id_col ='ID'
y = train[target_col].copy()
X = train.drop(columns = [target_col])
test_ids = test[id_col] if id_col in test.columns else None

In [94]:
X.columns

Index(['ID', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=377)

In [95]:
test.columns

Index(['ID', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=377)

In [96]:
# -----------------------------------------
# Step 4: Encode Categorical Features
# -----------------------------------------

num_cols = X.select_dtypes(include = ['int64','float64']).columns
cat_cols = X.select_dtypes(include = ['object','category']).columns

print("Numeric Columns: ",len(num_cols))
print("Categorical Columns: ",len(cat_cols))

# simple imputations
X[num_cols] = X[num_cols].fillna(X[num_cols].median())
test[num_cols] = test[num_cols].fillna(X[num_cols].median()) # use train medians
X[cat_cols] =X[cat_cols].fillna('MISSING')
test[cat_cols] = test[cat_cols].fillna('MISSING')

Numeric Columns:  369
Categorical Columns:  8


In [97]:
# -----------------------------------------
# Step 5: Remove Zero-Variance Columns
# -----------------------------------------

from sklearn.feature_selection import VarianceThreshold

# combine train/test if we will fit encoders on both; but for pure variance on train:
sel = VarianceThreshold(threshold = 0)
sel.fit(X[num_cols])      # fits column mask
mask = sel.get_support() # boolean mask of columns with variance>0
cols_kept = num_cols[mask]
cols_removed = num_cols[~mask].tolist()

print("Kept columns: 3",cols_kept)
print("Removed (zero variance): ",cols_removed)

# reduce X and test (important: remove same colums in test too)
X = X[cols_kept]
test = test[cols_kept]

Kept columns: 3 Index(['ID', 'X10', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=357)
Removed (zero variance):  ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']


In [98]:
# Null counts
print(X.isnull().sum().sum(),"total missing values")
print(X.isnull().sum().sort_values(ascending = False).head(20))

# Unique values per column
unique_counts = X.nunique().sort_values()
print(unique_counts.head(30)) # low unique cols
print(unique_counts.tail(30)) # high-cardinality cols

0 total missing values
ID     0
X10    0
X12    0
X13    0
X14    0
X15    0
X16    0
X17    0
X18    0
X19    0
X20    0
X21    0
X22    0
X23    0
X24    0
X26    0
X27    0
X28    0
X29    0
X30    0
dtype: int64
X42     2
X365    2
X366    2
X367    2
X368    2
X369    2
X370    2
X371    2
X372    2
X373    2
X374    2
X375    2
X376    2
X377    2
X378    2
X383    2
X382    2
X10     2
X12     2
X13     2
X14     2
X15     2
X16     2
X17     2
X18     2
X19     2
X20     2
X21     2
X22     2
X23     2
dtype: int64
X355       2
X356       2
X357       2
X358       2
X359       2
X360       2
X361       2
X362       2
X363       2
X384       2
X28        2
X29        2
X30        2
X31        2
X32        2
X33        2
X34        2
X35        2
X36        2
X37        2
X38        2
X39        2
X40        2
X41        2
X380       2
X364       2
X379       2
X27        2
X26        2
ID      4209
dtype: int64


In [99]:
for col in X.columns:
    if X[col].dtype == 'object' or X[col].nunique() <30:
        print(col,X[col].nunique())

X10 2
X12 2
X13 2
X14 2
X15 2
X16 2
X17 2
X18 2
X19 2
X20 2
X21 2
X22 2
X23 2
X24 2
X26 2
X27 2
X28 2
X29 2
X30 2
X31 2
X32 2
X33 2
X34 2
X35 2
X36 2
X37 2
X38 2
X39 2
X40 2
X41 2
X42 2
X43 2
X44 2
X45 2
X46 2
X47 2
X48 2
X49 2
X50 2
X51 2
X52 2
X53 2
X54 2
X55 2
X56 2
X57 2
X58 2
X59 2
X60 2
X61 2
X62 2
X63 2
X64 2
X65 2
X66 2
X67 2
X68 2
X69 2
X70 2
X71 2
X73 2
X74 2
X75 2
X76 2
X77 2
X78 2
X79 2
X80 2
X81 2
X82 2
X83 2
X84 2
X85 2
X86 2
X87 2
X88 2
X89 2
X90 2
X91 2
X92 2
X94 2
X95 2
X96 2
X97 2
X98 2
X99 2
X100 2
X101 2
X102 2
X103 2
X104 2
X105 2
X106 2
X108 2
X109 2
X110 2
X111 2
X112 2
X113 2
X114 2
X115 2
X116 2
X117 2
X118 2
X119 2
X120 2
X122 2
X123 2
X124 2
X125 2
X126 2
X127 2
X128 2
X129 2
X130 2
X131 2
X132 2
X133 2
X134 2
X135 2
X136 2
X137 2
X138 2
X139 2
X140 2
X141 2
X142 2
X143 2
X144 2
X145 2
X146 2
X147 2
X148 2
X150 2
X151 2
X152 2
X153 2
X154 2
X155 2
X156 2
X157 2
X158 2
X159 2
X160 2
X161 2
X162 2
X163 2
X164 2
X165 2
X166 2
X167 2
X168 2
X169 2
X170 2
X171 2
X

In [100]:
from sklearn.preprocessing import LabelEncoder

# identify categorical cols to label-encode
cat_cols = [c for c in X.columns if X[c].dtype == 'object' or X[c].nunique()<50] # tweak threshold

label_encoders ={}
for col in cat_cols:
    le = LabelEncoder()
    # fit on combined to get consistent mapping
    combined = pd.concat([X[col], test[col]], axis =0).astype(str)
    le.fit(combined)
    X[col] = le.transform(X[col].astype(str))
    test[col] = le.transform(test[col].astype(str))
    label_encoders[col] =le

In [101]:
from sklearn.preprocessing import StandardScaler
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
scaler = StandardScaler()
# fit scaler on training numeric features
X_num_scaled = scaler.fit_transform(X[num_cols])
test_num_scaled = scaler.transform(test[num_cols])

# replace numeric columns with scaled arrays (keep categorical ints as-is)
X_scaled = X.copy()
X_scaled[num_cols] = X_num_scaled
test_scaled = test.copy()
test_scaled[num_cols] = test_num_scaled

In [102]:
# -----------------------------------------
# Step 6: Dimensionality Reduction using PCA
# -----------------------------------------

from sklearn.decomposition import PCA

pca = PCA(n_components=0.95, random_state=42)   # keep 95% variance
X_pca = pca.fit_transform(X_scaled)
test_pca = pca.transform(test_scaled)

print("Original features:", X_scaled.shape[1])
print("PCA components:", X_pca.shape[1])   # number retained

Original features: 357
PCA components: 144


In [103]:
# -----------------------------------------
# Step 7: Train-Test Split
# -----------------------------------------

import xgboost as xgb
from xgboost.callback import EarlyStopping
from sklearn.model_selection import cross_val_score, KFold, train_test_split

# Split data into train and validation
X_train_split, X_val, y_train_split, y_val = train_test_split(X_pca, y, test_size=0.1, random_state=42)

# Convert data to DMatrix(required for xgb.train)
dtrain = xgb.DMatrix(X_train_split, label = y_train_split)
dval = xgb.DMatrix(X_val, label = y_val)

# Define parameters
params = {
    'objective': 'reg:squarederror',
    'learning_rate':0.05,
    'max_depth': 6,
    'subsample' :0.8,
    'colsample_bytree' :0.8,
    'eval_metric' : 'rmse',
    'seed' : 42,
   
}

# Create early stopping callback
early_stop = EarlyStopping(
                                rounds=10,
                                save_best=True
                                )

# -----------------------------------------
# Step 8: Model Training with XGBoost
# -----------------------------------------


# Train using xgb.train(fully supports callbacks)
model = xgb.train(
            params = params,
            dtrain = dtrain,
            num_boost_round = 1000,
            evals = [(dtrain, 'train'), (dval, 'validation_0')],
            callbacks=[early_stop]
            )



[0]	train-rmse:12.46565	validation_0-rmse:11.75374
[1]	train-rmse:12.19811	validation_0-rmse:11.53225
[2]	train-rmse:11.94216	validation_0-rmse:11.30487
[3]	train-rmse:11.71012	validation_0-rmse:11.10524
[4]	train-rmse:11.48679	validation_0-rmse:10.92997
[5]	train-rmse:11.26617	validation_0-rmse:10.74949
[6]	train-rmse:11.08330	validation_0-rmse:10.61105
[7]	train-rmse:10.89146	validation_0-rmse:10.45821
[8]	train-rmse:10.70311	validation_0-rmse:10.32219
[9]	train-rmse:10.52729	validation_0-rmse:10.16031
[10]	train-rmse:10.35027	validation_0-rmse:10.01988
[11]	train-rmse:10.21460	validation_0-rmse:9.89250
[12]	train-rmse:10.07086	validation_0-rmse:9.80661
[13]	train-rmse:9.93876	validation_0-rmse:9.69065
[14]	train-rmse:9.80418	validation_0-rmse:9.59469
[15]	train-rmse:9.67780	validation_0-rmse:9.50682
[16]	train-rmse:9.55619	validation_0-rmse:9.43450
[17]	train-rmse:9.43424	validation_0-rmse:9.33091
[18]	train-rmse:9.33848	validation_0-rmse:9.26405
[19]	train-rmse:9.23679	validation_0

In [104]:
# XGBoost cross-validation (built-in)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = xgb.cv(
    params = params,
    dtrain = dtrain,
    num_boost_round =1000,
    nfold =5,
    metrics ='rmse',
    early_stopping_rounds =10,
    seed =42
)

print(cv_scores.tail())
print("Mean CV RMSE: ", cv_scores['test-rmse-mean'].min())

    train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
67         6.157106        0.112441        9.151646       1.006077
68         6.119637        0.108246        9.151733       1.008200
69         6.084865        0.104352        9.151495       1.006994
70         6.047606        0.099071        9.149402       1.008061
71         6.014158        0.096300        9.147589       1.008571
Mean CV RMSE:  9.14758882026153


In [105]:
# -----------------------------------------
# Step 9: Model Evaluation (Train Data)
# -----------------------------------------

# Compute metrics:
#Evaluate the model
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

# Convert your PCA features to DMatrix format
dtrain_pca = xgb.DMatrix(X_pca)

# Make predictions
y_train_pred = model.predict(dtrain_pca)


# predictions on training(or a held-out validation/test split)
r2 =r2_score(y,y_train_pred)
rmse = np.sqrt(mean_squared_error(y,y_train_pred) )
mae = mean_absolute_error(y,y_train_pred)
print("Train R2: ",r2,"RMSE: ",rmse,"MAE: ",mae)
'''Interpretation:

RÂ² (Coefficient of Determination) â†’ closer to 1 means the model fits well. 

RMSE (Root Mean Squared Error) â†’ lower values indicate more accurate predictions.

MAE (Mean Absolute Error) â†’ average absolute difference between actual & predicted.'''

Train R2:  0.7339689896680683 RMSE:  6.5390189554795155 MAE:  4.517277325435102


'Interpretation:\n\nRÂ² (Coefficient of Determination) â†’ closer to 1 means the model fits well. \n\nRMSE (Root Mean Squared Error) â†’ lower values indicate more accurate predictions.\n\nMAE (Mean Absolute Error) â†’ average absolute difference between actual & predicted.'

In [106]:
# Convert your PCA features to DMatrix format
dtest_pca = xgb.DMatrix(test_pca)


# Predict on Test Data
test_pred = model.predict(dtest_pca)

# Save output
submission = pd.DataFrame({'ID': test['ID'],'y': test_pred})
submission.to_csv('submission.csv', index=False)
print("âœ… Submission file 'submission.csv' created successfully!")

âœ… Submission file 'submission.csv' created successfully!


In [107]:
# -----------------------------------------
# Step 11: Observations and Conclusion
# -----------------------------------------

print("""
Observations:
1. Data had many constant columns â€” removing them improved model efficiency.
2. PCA reduced dimensionality from hundreds of features to a smaller set (95% variance kept).
3. XGBoost provided strong predictive power with minimal overfitting.
4. RMSE and MAE indicate reasonable performance, showing that the model generalizes well.

Conclusion:
By optimizing the testing process with this predictive model, Mercedes-Benz can estimate test times
for various configurations more efficiently â€” reducing physical testing time, costs, and COâ‚‚ emissions.
""")



Observations:
1. Data had many constant columns â€” removing them improved model efficiency.
2. PCA reduced dimensionality from hundreds of features to a smaller set (95% variance kept).
3. XGBoost provided strong predictive power with minimal overfitting.
4. RMSE and MAE indicate reasonable performance, showing that the model generalizes well.

Conclusion:
By optimizing the testing process with this predictive model, Mercedes-Benz can estimate test times
for various configurations more efficiently â€” reducing physical testing time, costs, and COâ‚‚ emissions.



# ðŸ“˜ Project Summary â€“ Mercedes-Benz Greener Manufacturing

## ðŸš— Business Objective
Mercedes-Benz wants to reduce the time each customized car spends on the test bench before delivery.  
The goal is to build a machine learning model that predicts test duration for each car configuration.  
This allows engineers to streamline testing, improve efficiency, and reduce COâ‚‚ emissions.

---

## ðŸ“Š Dataset Overview
- **Train Dataset:** 4209 rows Ã— 378 columns  
- **Test Dataset:** 4209 rows Ã— 377 columns  
- **Target Variable:** `y` â†’ time (in seconds) to pass testing

---

## ðŸ§¹ Data Preparation
1. Checked missing and unique values.  
2. Converted categorical columns using Label Encoding.  
3. Removed zero-variance columns.  
4. Applied **PCA** (95% variance retained) for dimensionality reduction.

---

## ðŸ§  Model Building
- Algorithm: **XGBoost Regressor**
- Parameters:
  - `n_estimators=1000`
  - `learning_rate=0.05`
  - `max_depth=5`
  - `subsample=0.8`
  - `colsample_bytree=0.8`
- Early stopping with validation set to prevent overfitting.

---

## ðŸ“ˆ Model Evaluation
| Metric | Result (Example) |
|---------|------------------|
| RÂ² | ~0.73 |
| RMSE | ~6.53 |
| MAE | ~4.51 |

- **RÂ²** â†’ model explains ~73% of data variance.  
- **RMSE & MAE** â†’ indicate accurate and stable performance.

---

## ðŸ§¾ Predictions
Model predictions saved to `submission.csv`:
