In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
test_df.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


### Checking the variance of the columns and removing the variables

In [5]:
train_df.var()

ID      5.941936e+06
y       1.607667e+02
X10     1.313092e-02
X11     0.000000e+00
X12     6.945713e-02
            ...     
X380    8.014579e-03
X382    7.546747e-03
X383    1.660732e-03
X384    4.750593e-04
X385    1.423823e-03
Length: 370, dtype: float64

In [6]:
test_df.var()==0

ID      False
X10     False
X11     False
X12     False
X13     False
        ...  
X380    False
X382    False
X383    False
X384    False
X385    False
Length: 369, dtype: bool

In [7]:
print(train_df.shape)
print(test_df.shape)

(4209, 378)
(4209, 377)


In [8]:
train_df.var()[train_df.var()==0].index

Index(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293',
       'X297', 'X330', 'X347'],
      dtype='object')

In [9]:
train_df_drop_col = train_df.drop(train_df.var()[train_df.var() == 0].index.values , axis=1)
test_df_drop_col = test_df.drop(test_df.var()[test_df.var() == 0].index.values , axis=1)

print(train_df_drop_col.shape)
print(test_df_drop_col.shape)

(4209, 366)
(4209, 372)


### Check for null and unique values for test and train sets.

In [10]:
features_with_na_train=[features for features in train_df_drop_col if train_df_drop_col[features].isnull().sum()>0]
features_with_na_train

[]

In [11]:
train_df_drop_col.isna().sum()

ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 366, dtype: int64

In [12]:
test_df_drop_col.isna().sum()

ID      0
X0      0
X1      0
X2      0
X3      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 372, dtype: int64

In [13]:
features_with_na_test=[features for features in test_df_drop_col if test_df_drop_col[features].isnull().sum()>0]
features_with_na_test

[]

In [14]:
for column_name in train_df_drop_col:
    print(train_df_drop_col[column_name].unique())


[   0    6    7 ... 8412 8415 8417]
[130.81  88.53  76.26 ...  85.71 108.77  87.48]
['k' 'az' 't' 'al' 'o' 'w' 'j' 'h' 's' 'n' 'ay' 'f' 'x' 'y' 'aj' 'ak' 'am'
 'z' 'q' 'at' 'ap' 'v' 'af' 'a' 'e' 'ai' 'd' 'aq' 'c' 'aa' 'ba' 'as' 'i'
 'r' 'b' 'ax' 'bc' 'u' 'ad' 'au' 'm' 'l' 'aw' 'ao' 'ac' 'g' 'ab']
['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']
['a' 'e' 'c' 'f' 'd' 'b' 'g']
['d' 'b' 'c' 'a']
['u' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac' 'ad' 'ae'
 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa']
['j' 'l' 'd' 'h' 'i' 'a' 'g' 'c' 'k' 'e' 'f' 'b']
['o' 'x' 'e' 'n' 's' 'a' 'h' 'p' 'm' 'k' 'd' 'i' 'v' 'j' 'b' 'q' 'w' 'g'
 'y' 'l' 'f' 'u' 'r' 't' 'c']
[0 1]
[0 1]
[1 0]
[0 1]
[0 1]
[0 1]
[0 1]
[1 0]
[0 1]
[0 1]


In [15]:
for column_name in test_df_drop_col:
    print(test_df_drop_col[column_name].unique())

[   1    2    3 ... 8413 8414 8416]
['az' 't' 'w' 'y' 'x' 'f' 'ap' 'o' 'ay' 'al' 'h' 'z' 'aj' 'd' 'v' 'ak'
 'ba' 'n' 'j' 's' 'af' 'ax' 'at' 'aq' 'av' 'm' 'k' 'a' 'e' 'ai' 'i' 'ag'
 'b' 'am' 'aw' 'as' 'r' 'ao' 'u' 'l' 'c' 'ad' 'au' 'bc' 'g' 'an' 'ae' 'p'
 'bb']
['v' 'b' 'l' 's' 'aa' 'r' 'a' 'i' 'p' 'c' 'o' 'm' 'z' 'e' 'h' 'w' 'g' 'k'
 'y' 't' 'u' 'd' 'j' 'q' 'n' 'f' 'ab']
['n' 'ai' 'as' 'ae' 's' 'b' 'e' 'ak' 'm' 'a' 'aq' 'ag' 'r' 'k' 'aj' 'ay'
 'ao' 'an' 'ac' 'af' 'ax' 'h' 'i' 'f' 'ap' 'p' 'au' 't' 'z' 'y' 'aw' 'd'
 'at' 'g' 'am' 'j' 'x' 'ab' 'w' 'q' 'ah' 'ad' 'al' 'av' 'u']
['f' 'a' 'c' 'e' 'd' 'g' 'b']
['d' 'b' 'a' 'c']
['t' 'b' 'a' 'z' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac'
 'ad' 'ae' 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa']
['a' 'g' 'j' 'l' 'i' 'd' 'f' 'h' 'c' 'k' 'e' 'b']
['w' 'y' 'j' 'n' 'm' 's' 'a' 'v' 'r' 'o' 't' 'h' 'c' 'k' 'p' 'u' 'd' 'g'
 'b' 'q' 'e' 'l' 'f' 'i' 'x']
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[

### Applying Label Encoder

In [16]:
train_df_drop_col

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8405,107.39,ak,s,as,c,d,aa,d,q,...,1,0,0,0,0,0,0,0,0,0
4205,8406,108.77,j,o,t,d,d,aa,h,h,...,0,1,0,0,0,0,0,0,0,0
4206,8412,109.22,ak,v,r,a,d,aa,g,e,...,0,0,1,0,0,0,0,0,0,0
4207,8415,87.48,al,r,e,f,d,aa,l,u,...,0,0,0,0,0,0,0,0,0,0


In [17]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

In [18]:
train_features = train_df_drop_col.drop("y",axis = 1)
y_train = train_df_drop_col["y"]

In [19]:
X_train_label_encoded=train_features.apply(label_encoder.fit_transform)
X_train_label_encoded

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,32,23,17,0,3,24,9,14,0,...,0,0,1,0,0,0,0,0,0,0
1,1,32,21,19,4,3,28,11,14,0,...,1,0,0,0,0,0,0,0,0,0
2,2,20,24,34,2,3,27,9,23,0,...,0,0,0,0,0,0,1,0,0,0
3,3,20,21,34,5,3,27,11,4,0,...,0,0,0,0,0,0,0,0,0,0
4,4,20,23,34,5,3,12,3,13,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,4204,8,20,16,2,3,0,3,16,0,...,1,0,0,0,0,0,0,0,0,0
4205,4205,31,16,40,3,3,0,7,7,0,...,0,1,0,0,0,0,0,0,0,0
4206,4206,8,23,38,0,3,0,6,4,0,...,0,0,1,0,0,0,0,0,0,0
4207,4207,9,19,25,5,3,0,11,20,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
X_test_label_encoded=test_df_drop_col.apply(label_encoder.fit_transform)
X_test_label_encoded

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,21,23,34,5,3,26,0,22,0,...,0,0,0,1,0,0,0,0,0,0
1,1,42,3,8,0,3,9,6,24,0,...,0,0,1,0,0,0,0,0,0,0
2,2,21,23,17,5,3,0,9,9,0,...,0,0,0,1,0,0,0,0,0,0
3,3,21,13,34,5,3,31,11,13,0,...,0,0,0,1,0,0,0,0,0,0
4,4,45,20,17,2,3,30,8,12,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,4204,6,9,17,5,3,1,9,4,0,...,0,0,0,0,0,0,0,0,0,0
4205,4205,42,1,8,3,3,1,9,24,0,...,0,1,0,0,0,0,0,0,0,0
4206,4206,47,23,17,5,3,1,3,22,0,...,0,0,0,0,0,0,0,0,0,0
4207,4207,7,23,17,0,3,1,2,16,0,...,0,0,1,0,0,0,0,0,0,0


### Applying Dimensionality Reduction Using PCA

In [21]:
from sklearn.decomposition import PCA 

pca = PCA()

X_train_pca = pca.fit_transform(X_train_label_encoded)
X_train_pca

array([[-2.10392518e+03,  8.62388945e-01, -1.84271114e-01, ...,
        -9.80458428e-15,  9.82483033e-16,  8.21357706e-18],
       [-2.10290920e+03,  7.84932754e-01,  1.62709020e+00, ...,
         9.50178056e-14,  6.59142507e-14,  8.07794210e-16],
       [-2.10191260e+03,  1.64356376e+01,  1.23258860e+01, ...,
         2.63712195e-14, -8.57225154e-15, -1.09042012e-15],
       ...,
       [ 2.10192644e+03,  2.87670119e+01,  1.49202369e+01, ...,
         1.23593934e-13, -1.03138715e-14, -2.01236841e-16],
       [ 2.10292655e+03,  2.27279761e+01,  1.72985538e+00, ...,
         9.41615547e-14, -2.09215152e-14,  4.70288632e-18],
       [ 2.10392601e+03, -1.75454605e+01, -9.82681801e+00, ...,
        -1.24661234e-14,  5.81783202e-14,  8.86285363e-17]])

In [22]:
print(X_train_pca.shape)

(4209, 365)


In [23]:
print("The varaince of the resulting PC components are : ")
print(pca.explained_variance_ratio_)

The varaince of the resulting PC components are : 
[9.99659608e-01 1.38083753e-04 7.69770342e-05 4.40107808e-05
 3.31849481e-05 2.66009200e-05 5.73061934e-06 2.67845282e-06
 1.56203752e-06 1.05802559e-06 8.69673184e-07 8.51175070e-07
 7.32194816e-07 6.01103751e-07 5.33242231e-07 4.63034402e-07
 3.72740073e-07 3.44879436e-07 3.21997035e-07 2.83068830e-07
 2.55585326e-07 2.31969263e-07 2.24009902e-07 2.10430778e-07
 1.84605148e-07 1.73951010e-07 1.55695102e-07 1.49112506e-07
 1.35988598e-07 1.32568757e-07 1.28164240e-07 1.18615786e-07
 1.13303372e-07 1.07956082e-07 1.01596937e-07 9.63933157e-08
 8.94341956e-08 7.97400199e-08 7.72270274e-08 7.44833379e-08
 7.07897886e-08 6.70931168e-08 6.42211701e-08 6.33394701e-08
 6.28443767e-08 6.08236279e-08 5.50354934e-08 5.38430316e-08
 5.17805622e-08 5.09377636e-08 4.97711458e-08 4.80779646e-08
 4.66612734e-08 4.53820263e-08 4.29839488e-08 4.12915804e-08
 3.88307962e-08 3.85021592e-08 3.70761370e-08 3.65157851e-08
 3.55486692e-08 3.34676588e-08 3.2

In [24]:
pca1 = PCA(n_components= 12 )

X_train_pca1 = pca1.fit_transform(X_train_label_encoded)
print(X_train_pca1.shape)

(4209, 12)


In [25]:
pca1 = PCA(n_components= 80 )

X_test_pca1 = pca1.fit_transform(X_test_label_encoded)
print(X_train_pca1.shape)

(4209, 12)


### Predict your test_df values using XGBoost.

In [30]:
import xgboost as xgb

In [32]:
D_train = xgb.DMatrix(X_train_pca1 , label = y_train)
D_test = xgb.DMatrix(X_test_pca1)

In [33]:
param = {'eta': 0.02, 'max_depth' : 4, 'objective' :'multi:softmax' , 'num_class' : 3  }

In [34]:
xgb_model  = xgb.train(param, D_train)

XGBoostError: [06:31:46] C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/multiclass_obj.cu:120: SoftmaxMultiClassObj: label must be in [0, num_class).