# Mercedes-Benz project

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
benz_train = pd.read_csv('train.csv')

benz_test = pd.read_csv('test.csv')

In [4]:
train_df = benz_train.copy()
test_df = benz_test.copy()

In [5]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [6]:
test_df.head(2)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


1. float - 1
2. int - 369
3. object - 8

In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 377 entries, ID to X385
dtypes: int64(369), object(8)
memory usage: 12.1+ MB


1. int - 369
2. object - 8

In [9]:
# Get the column names as sets
train_cols = set(train_df.columns)
test_cols = set(test_df.columns)

# Find the columns that are in df1 but not in df2
target = train_cols - test_cols

target

{'y'}

**'y' is the target variable**

In [10]:
# remove the traget variable from training df
y_train = train_df['y']
y_train

0       130.81
1        88.53
2        76.26
3        80.62
4        78.02
         ...  
4204    107.39
4205    108.77
4206    109.22
4207     87.48
4208    110.85
Name: y, Length: 4209, dtype: float64

In [11]:
# Remove 'y' from train_df
train_df.drop(columns='y', axis=1, inplace=True)
train_df.columns

Index(['ID', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=377)

In [12]:
# rechek - set must be empty
target = set(train_df.columns) - set(test_df.columns)
target

set()

#### NA values

In [13]:
# checking null values
train_df.isna().sum()[train_df.isna().sum()>0]

Series([], dtype: int64)

**No null values**

In [14]:
# get numeric variables columns- int and float
train_num_df = train_df.select_dtypes(include='number').copy()

#### Remove where varaince is 0

In [15]:
zero_var_cols = train_num_df.columns[train_num_df.var()==0]
zero_var_cols

Index(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293',
       'X297', 'X330', 'X347'],
      dtype='object')

In [16]:
print(train_df.shape)
print(test_df.shape)

train_df.drop(columns=zero_var_cols, axis=1, inplace=True)

test_df.drop(columns=zero_var_cols, axis=1, inplace=True)

print('#########################################################')
print(train_df.shape)
print(test_df.shape)

(4209, 377)
(4209, 377)
#########################################################
(4209, 365)
(4209, 365)


#### Check the unique values in each column

###### In Object variable

?? convert it to dataframe without na. Is there simple method, without for loop

#### Object variable

In [17]:
unique_val_dict = {}
uniqueVal_num = []
for cols in train_df.columns:
    unique_val_dict[cols] = train_df[cols].unique()
    uniqueVal_num.append(len(train_df[cols].unique()))

In [18]:
# check if there is only one entry (aredy checked before for variance)
min(uniqueVal_num)

2

In [19]:
train_df_dm = pd.get_dummies(train_df,drop_first=True,dtype=int).copy()

In [20]:
print(train_df.shape)
print(train_df_dm.shape)

(4209, 365)
(4209, 544)


In [21]:
test_df_dm = pd.get_dummies(test_df,drop_first=True,dtype=int).copy()

In [22]:
print(test_df.shape)
print(test_df_dm.shape)

(4209, 365)
(4209, 550)


In [23]:
print(train_df_dm.shape)
print(test_df_dm.shape)

(4209, 544)
(4209, 550)


There are uneven number of columns. <br>
check the columns which are unique to data frames. <br>
add the unique column to other dataframe.

In [24]:
# Columns present in train but not in test
train_dm_cols = set(train_df_dm.columns)
test_dm_cols = set(test_df_dm.columns)

# Find the columns that are in df1 but not in df2
cols_notInTest = train_dm_cols - test_dm_cols
cols_NotInTrain = test_dm_cols - train_dm_cols 

print(cols_notInTest)
print(cols_NotInTrain)

{'X2_o', 'X0_q', 'X0_aa', 'X2_l', 'X2_ar', 'X2_c', 'X5_u', 'X0_ac', 'X2_aa', 'X0_ab'}
{'X5_z', 'X0_p', 'X2_ax', 'X0_ag', 'X5_t', 'X0_bb', 'X2_w', 'X5_b', 'X0_av', 'X2_aj', 'X0_ae', 'X2_ad', 'X2_ab', 'X5_aa', 'X2_u', 'X0_an'}


In [25]:
# making train and test df same
test_df_dm[list(cols_notInTest)] = 0 
train_df_dm[list(cols_NotInTrain)] = 0

In [26]:
# Recheck - set must be empty
train_dm_cols = set(train_df_dm.columns)
test_dm_cols = set(test_df_dm.columns)

cols_notInTest = train_dm_cols - test_dm_cols

cols_notInTest

set()

In [27]:
print(train_df_dm.shape)
print(test_df_dm.shape)

(4209, 560)
(4209, 560)


Now I have same columns, but not in same order

##### Seperate the object variables 

In [28]:
train_df_dm.columns

Index(['ID', 'X10', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19',
       ...
       'X2_w', 'X5_b', 'X0_av', 'X2_aj', 'X0_ae', 'X2_ad', 'X2_ab', 'X5_aa',
       'X2_u', 'X0_an'],
      dtype='object', length=560)

In [29]:
# x_train, removing unncessary ID column
x_train = train_df_dm.drop(['ID'], axis=1)
x_test = test_df_dm.drop(['ID'], axis=1)

print(x_train.shape)
print(x_test.shape)

(4209, 559)
(4209, 559)


In [30]:
# x_test, removing unncessary ID column
test_ID = test_df['ID']

In [31]:
# object_cols_trn = train_df.select_dtypes(include='O').columns
# print(object_cols_trn)


# object_cosl_tst = test_df.select_dtypes(include='O').columns
# print(object_cosl_tst)

### Dimentionality reduction

In [32]:
from sklearn.decomposition import PCA

###### Need to get the columns in same order

In [33]:
# need to reindex the column so that there will not be mismatch between the column name.
x_train = x_train.reindex(sorted(x_train.columns), axis=1)
x_test = x_test.reindex(sorted(x_test.columns),axis=1)

In [34]:
# PCA
n = 10
pca = PCA(n_components=n, random_state=420)

pca_train = pca.fit_transform(x_train)
pca_test = pca.transform(x_test)

In [35]:
pca_train

array([[ 0.8170922 , -1.35926451,  1.93980338, ..., -0.99616517,
         0.20548763, -0.28926406],
       [-0.10149853, -1.29439531, -0.09148895, ..., -0.72926317,
        -0.02133291,  0.61680647],
       [-0.66852392, -2.43478081,  1.69808554, ...,  0.67068979,
        -0.93299104, -0.21619101],
       ...,
       [-1.03441763, -0.48418073,  1.80068791, ..., -2.01213795,
         1.69191594,  0.13462591],
       [ 0.39026624, -1.17253956, -3.10414196, ..., -0.89381021,
        -0.1981668 ,  0.5881941 ],
       [ 0.96356487, -0.9003372 , -0.90648918, ...,  1.98087962,
        -0.27203155, -0.01176265]])

In [36]:
pca_test

array([[-0.401256  , -2.85732946,  0.24490776, ...,  0.04480954,
        -1.72439204, -0.29399195],
       [ 3.8307679 ,  0.39083024,  1.47659678, ..., -1.23199923,
        -1.06581093,  0.16215339],
       [-1.31917026, -0.73310014,  0.47122988, ...,  0.68990007,
        -1.44161518,  0.36926825],
       ...,
       [-1.44146861,  0.55557245, -2.91831286, ...,  0.50473256,
        -0.475243  ,  0.44424245],
       [-2.33387597,  1.278664  ,  1.59549699, ..., -0.23753291,
        -0.09081795, -0.88683734],
       [ 2.02367305, -0.92685731, -0.54364858, ..., -0.71547076,
        -0.71010002, -0.07067852]])

In [37]:
import xgboost as xgb
from sklearn.metrics import r2_score

In [39]:
x_train, x_testSub, y_train, y_testSub = train_test_split(
        pca_train, 
        y_train, test_size=0.25, 
        random_state=100)

testSub - sub section of the same dataframe, selected for test

In [40]:
# input matrix
dmat_train = xgb.DMatrix(x_train, label=y_train) # check how lebel works (y is target variable)
dmat_testSub = xgb.DMatrix(x_testSub, label=y_testSub)

In [41]:
print(dmat_train)
print(dmat_testSub)

<xgboost.core.DMatrix object at 0x000002646277D010>
<xgboost.core.DMatrix object at 0x0000026453FF2E90>


In [43]:
dmat_test = xgb.DMatrix(pca_test) # without lebles, need to add the lebels later

In [53]:
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.02
params['max_depth'] = 4


In [57]:
def xgboost_r2(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)

In [59]:
watchlist = [(dmat_train, 'train'), (dmat_testSub, 'test1')]

clf = xgb.train(params, dmat_train, 
                1000, watchlist, early_stopping_rounds=50, feval=xgboost_r2,
                 maximize=True, verbose_eval=10)

[0]	train-rmse:98.75160	train-r2:-64.47981	test1-rmse:99.65705	test1-r2:-49.73367
[10]	train-rmse:80.90589	train-r2:-42.95208	test1-rmse:81.92900	test1-r2:-33.28905
[20]	train-rmse:66.35451	train-r2:-28.56378	test1-rmse:67.48379	test1-r2:-22.26372
[30]	train-rmse:54.49760	train-r2:-18.94224	test1-rmse:55.73750	test1-r2:-14.86994
[40]	train-rmse:44.84662	train-r2:-12.50450	test1-rmse:46.21290	test1-r2:-9.90955
[50]	train-rmse:36.99991	train-r2:-8.19222	test1-rmse:38.49123	test1-r2:-6.56840
[60]	train-rmse:30.64375	train-r2:-5.30526	test1-rmse:32.26935	test1-r2:-4.31937
[70]	train-rmse:25.51448	train-r2:-3.37112	test1-rmse:27.29177	test1-r2:-2.80490
[80]	train-rmse:21.39706	train-r2:-2.07417	test1-rmse:23.34246	test1-r2:-1.78338
[90]	train-rmse:18.11201	train-r2:-1.20268	test1-rmse:20.23912	test1-r2:-1.09249
[100]	train-rmse:15.51288	train-r2:-0.61586	test1-rmse:17.82374	test1-r2:-0.62285
[110]	train-rmse:13.47757	train-r2:-0.21967	test1-rmse:15.97359	test1-r2:-0.30342
[120]	train-rmse:1

In [62]:
#Predict test_df values - xgboost

p_test = clf.predict(dmat_test)

sub = pd.DataFrame()
sub['ID'] = test_ID
sub['y'] = p_test


In [63]:
sub

Unnamed: 0,ID,y
0,1,82.082756
1,2,93.808975
2,3,78.811989
3,4,83.335892
4,5,109.577301
...,...,...
4204,8410,105.182632
4205,8411,92.493462
4206,8413,95.453529
4207,8414,113.283005
