### 1. Loading Relevant Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
%matplotlib inline

### 2. Importing train & test dataset

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
#drop the ID column as index as it is not needed for prediction
df_train.drop('ID',inplace=True,axis=1)
df_test.drop('ID',inplace=True,axis=1)

In [4]:
print(df_train.columns)
print(df_test.columns)

Index(['y', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=377)
Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10', 'X11',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=376)


In [5]:
print(df_train.shape)
print(df_test.shape)

(4209, 377)
(4209, 376)


### 3. Columns having variance == zero, and remove those variable(s).

In [6]:
Zero_var_col = df_train.var()[df_train.var()==0].index.values
Zero_var_col

array(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290',
       'X293', 'X297', 'X330', 'X347'], dtype=object)

In [7]:
# Deleting the lables having var == 0
df_train.drop(Zero_var_col,axis = 1,inplace=True)
df_test.drop(Zero_var_col,axis = 1,inplace=True)

In [8]:
# Cross check
df_train.columns

Index(['y', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=365)

### 4. Check for null and unique values for test and train sets.

##### a. Checking null values

In [9]:
np.sum(df_train.isna().sum())

0

In [10]:
np.sum(df_test.isna().sum())

0

##### b. Checking unique values 

Unique value of all the columns are verified as under, it can be observed that except for first 10 columns all other columns have only 0 & 1

In [11]:
for i in df_train.columns:
    print(df_train[i].unique())

[130.81  88.53  76.26 ...  85.71 108.77  87.48]
['k' 'az' 't' 'al' 'o' 'w' 'j' 'h' 's' 'n' 'ay' 'f' 'x' 'y' 'aj' 'ak' 'am'
 'z' 'q' 'at' 'ap' 'v' 'af' 'a' 'e' 'ai' 'd' 'aq' 'c' 'aa' 'ba' 'as' 'i'
 'r' 'b' 'ax' 'bc' 'u' 'ad' 'au' 'm' 'l' 'aw' 'ao' 'ac' 'g' 'ab']
['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']
['a' 'e' 'c' 'f' 'd' 'b' 'g']
['d' 'b' 'c' 'a']
['u' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac' 'ad' 'ae'
 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa']
['j' 'l' 'd' 'h' 'i' 'a' 'g' 'c' 'k' 'e' 'f' 'b']
['o' 'x' 'e' 'n' 's' 'a' 'h' 'p' 'm' 'k' 'd' 'i' 'v' 'j' 'b' 'q' 'w' 'g'
 'y' 'l' 'f' 'u' 'r' 't' 'c']
[0 1]
[0 1]
[1 0]
[0 1]
[0 1]
[0 1]
[0 1]
[1 0]
[0 1]
[0 1]
[1 0]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]


### 4. Apply label encoder on train & test datasets

In [12]:
# identifying categorical columns
label_columns = df_train.describe(include=['object']).columns.values
label_columns

array(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype=object)

In [13]:
le = LabelEncoder()

for col in label_columns:
    le.fit(df_train[col].append(df_test[col]).values)
    df_train[col]= le.transform(df_train[col])
    df_test[col]= le.transform(df_test[col])

In [14]:
df_train.columns

Index(['y', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=365)

In [15]:
# cross checking if lables are encoded to numbers
df_train['X0'].unique()

array([37, 24, 46, 11, 41, 49, 36, 34, 45, 40, 23, 32, 50, 51,  9, 10, 12,
       52, 43, 18, 15, 48,  6,  0, 31,  8, 30, 16, 29,  1, 26, 17, 35, 44,
       25, 22, 28, 47,  4, 19, 39, 38, 21, 14,  3, 33,  2], dtype=int64)

### 5. Performing dimentionality reduction

##### Importing package & initializing the same

In [16]:
from sklearn.decomposition import PCA 
pca = PCA(n_components = 0.98,svd_solver='full')

X = df_train.drop('y',axis=1)
y = df_train['y']


In [17]:
# Performing train test & split
X_train , X_val , y_train , y_val = train_test_split(X,y,test_size=0.2,random_state=42)

In [18]:
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=0.98, random_state=None,
  svd_solver='full', tol=0.0, whiten=False)

In [19]:
pca.n_components_

12

In [20]:
pca.explained_variance_ratio_

array([0.40868988, 0.21758508, 0.13120081, 0.10783522, 0.08165248,
       0.0140934 , 0.00660951, 0.00384659, 0.00260289, 0.00214378,
       0.00209857, 0.00180388])

### 6. Predicting using XGBoost

In [21]:
pca_X_train = pd.DataFrame(pca.transform(X_train))
pca_X_val = pd.DataFrame(pca.transform(X_val)) 
pca_test = pd.DataFrame(pca.transform(df_test))

In [22]:
# Initialising the model
model = xgb.XGBRegressor(objective='reg:linear',learning_rate=0.1)

In [23]:
#Fitting the model
model.fit(pca_X_train,y_train)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [24]:
#predict on the validaiton set
pred_y_val = model.predict(pca_X_val)

In [25]:
#capture the MSE to get a measure of how accurate the predictions were
mse_score = mean_squared_error(y_val,pred_y_val)

In [26]:
#print the MSE score
print(mse_score)

84.22703928013917
