### Mercedes-Benz Greener Manufacturing

**Problem Statement** - You are required to reduce the time that cars spend on the test bench. Others will work with a dataset representing different permutations of features in a Mercedes-Benz car to predict the time it takes to pass testing. Optimal algorithms will contribute to faster testing, resulting in lower carbon dioxide emissions without reducing Mercedes-Benz’s standards.

**Tasks** -
- If for any column(s), the variance is equal to zero, then you need to remove those variable(s).
- Check for null and unique values for test and train sets.
- Apply label encoder.
- Perform dimensionality reduction.
- Predict your test_df values using XGBoost.

In [136]:
# Libraries

import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error as MSE

import warnings
warnings.filterwarnings('ignore')

In [137]:
# Importing data into the python environment

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [138]:
print("Train data:", train_df.shape)
train_df.head()

Train data: (4209, 378)


Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [139]:
print("Test data:", test_df.shape)
test_df.head()

Test data: (4209, 377)


Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [140]:
# Dropping 0 variance columns

zero_var_columns = train_df.var().where(lambda x: x==0).dropna().index.to_list()
print("Zero variance columns:", zero_var_columns)

train_df = train_df.drop(zero_var_columns, axis=1)
test_df = test_df.drop(zero_var_columns, axis=1)


Zero variance columns: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']


In [141]:
# Checking null values in the data

print("Null values in train_df:", train_df.isnull().sum().where(lambda x: x!=0).dropna().index.to_list())
print("Null values in test_df:", test_df.isnull().sum().where(lambda x: x!=0).dropna().index.to_list())

Null values in train_df: []
Null values in test_df: []


In [142]:
# Checking unique values in the data

print("Unique values in train_df: \n", pd.Series({col:train_df[col].unique() for col in train_df}))
print("Unique values in test_df: \n", pd.Series({col:test_df[col].unique() for col in test_df}))

Unique values in train_df: 
 ID      [0, 6, 7, 9, 13, 18, 24, 25, 27, 30, 31, 32, 3...
y       [130.81, 88.53, 76.26, 80.62, 78.02, 92.93, 12...
X0      [k, az, t, al, o, w, j, h, s, n, ay, f, x, y, ...
X1      [v, t, w, b, r, l, s, aa, c, a, e, h, z, j, o,...
X2      [at, av, n, e, as, aq, r, ai, ak, m, a, k, ae,...
                              ...                        
X380                                               [0, 1]
X382                                               [0, 1]
X383                                               [0, 1]
X384                                               [0, 1]
X385                                               [0, 1]
Length: 366, dtype: object
Unique values in test_df: 
 ID      [1, 2, 3, 4, 5, 8, 10, 11, 12, 14, 15, 16, 17,...
X0      [az, t, w, y, x, f, ap, o, ay, al, h, z, aj, d...
X1      [v, b, l, s, aa, r, a, i, p, c, o, m, z, e, h,...
X2      [n, ai, as, ae, s, b, e, ak, m, a, aq, ag, r, ...
X3                                  [f, a, c, 

In [143]:
# Separating the target variable from the rest of the train data

train_data = train_df.copy()
train_data_X = train_data.drop(['ID', 'y'], axis=1)
train_data_y = train_data['y']

print("Target Variable - Training data")
print(train_data_y.shape)

print("\nFeature Variables - Training data")
print(train_data_X.shape)

test_data = test_df.copy()
test_data_X = test_data.drop(['ID'], axis=1)

print("\nFeature Variables - Test data")
print(test_data_X.shape)

Target Variable - Training data
(4209,)

Feature Variables - Training data
(4209, 364)

Feature Variables - Test data
(4209, 364)


In [144]:
# Label Encoder
# Applying label encoder to both train and test data

le = LabelEncoder()
test_data_X = test_data_X.apply(le.fit_transform)

train_data_X = train_data_X.apply(le.fit_transform)
train_data_X

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,32,23,17,0,3,24,9,14,0,0,...,0,0,1,0,0,0,0,0,0,0
1,32,21,19,4,3,28,11,14,0,0,...,1,0,0,0,0,0,0,0,0,0
2,20,24,34,2,3,27,9,23,0,0,...,0,0,0,0,0,0,1,0,0,0
3,20,21,34,5,3,27,11,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,23,34,5,3,12,3,13,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8,20,16,2,3,0,3,16,0,0,...,1,0,0,0,0,0,0,0,0,0
4205,31,16,40,3,3,0,7,7,0,0,...,0,1,0,0,0,0,0,0,0,0
4206,8,23,38,0,3,0,6,4,0,1,...,0,0,1,0,0,0,0,0,0,0
4207,9,19,25,5,3,0,11,20,0,0,...,0,0,0,0,0,0,0,0,0,0


In [145]:
# Performing Dimensionality Reduction

pca = PCA(n_components=30)

train_data_X_pca = pd.DataFrame(pca.fit_transform(train_data_X))
print("Training data shape after dimensionality reduction: ", train_data_X_pca.shape)

test_data_X_pca = pd.DataFrame(pca.transform(train_data_X))
print("Testing data shape after dimensionality reduction: ", test_data_X_pca.shape)

Training data shape after dimensionality reduction:  (4209, 30)
Testing data shape after dimensionality reduction:  (4209, 30)


In [146]:
# Splitting the data into training and validation set
train_X, validation_X, train_y, validation_y = train_test_split(train_data_X_pca, train_data_y, test_size=0.2, random_state=42)

# Train and test set are converted to DMatrix objects,
# as it is required by learning API.
train_dmatrix = xgb.DMatrix(data = train_X, label = train_y)
validation_dmatrix = xgb.DMatrix(data = validation_X, label = validation_y)

In [147]:
# Model Training - XGBoost

# Parameter dictionary specifying base learner
param = {"booster":"gblinear", "objective":"reg:squarederror"}

# Training the model
xgb_r = xgb.train(params = param, dtrain = train_dmatrix, num_boost_round = 10)
pred = xgb_r.predict(validation_dmatrix)
  
# RMSE Computation
rmse = np.sqrt(MSE(validation_y, pred))
print("Root Mean Squared Error (RMSE) : % f" %(rmse))


Root Mean Squared Error (RMSE) :  8.513366


In [148]:
# Predicting test_df with the model

test_dmatrix = xgb.DMatrix(data = test_data_X_pca)
test_preds = xgb_r.predict(test_dmatrix)
print("Predicted rows: ", test_preds.shape)

test_df['y'] = test_preds
test_df.head()

Predicted rows:  (4209,)


Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X376,X377,X378,X379,X380,X382,X383,X384,X385,y
0,1,az,v,n,f,d,t,a,w,0,...,0,0,1,0,0,0,0,0,0,100.913612
1,2,t,b,ai,a,d,b,g,y,0,...,0,1,0,0,0,0,0,0,0,95.306137
2,3,az,v,as,f,d,a,j,j,0,...,0,0,1,0,0,0,0,0,0,82.258675
3,4,az,l,n,f,d,z,l,n,0,...,0,0,1,0,0,0,0,0,0,77.352768
4,5,w,s,as,c,d,y,i,m,0,...,0,0,0,0,0,0,0,0,0,77.151947


In [149]:
# saving the predictions to a csv file
test_df.to_csv('submission.csv', index=False)