# Feature Engineering

## 1.) Import Necessary Packages

In [72]:
# Importing necessary packages

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import statsmodels.api as sm
import pylab as py
from sklearn.preprocessing import StandardScaler
from scipy.stats import pointbiserialr

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from tqdm import tqdm 
from prettytable import PrettyTable

## 2.) Load & Prepare The Data

In [73]:
# Load the train and test data

train_raw = pd.read_csv('Data/train_data/train.csv')
test_raw = pd.read_csv('Data/test_data/test.csv')

In [74]:
# Seperating target variable from the rest of the features

y_raw = train_raw['y']
x_raw = train_raw.drop('y', axis=1)

In [75]:
# Storing categorical features in a seperate variable

categorical_features = ['X0','X1','X2','X3','X4','X5','X6','X8']
train_categorical = train_raw[categorical_features]
test_categorical = test_raw[categorical_features]

In [76]:
# Perform one hot encoding for categorical variables

encoder = OneHotEncoder(handle_unknown='ignore')
encoder = encoder.fit(train_categorical)
train_categorical_ohe = encoder.transform(train_categorical)
test_categorical_ohe = encoder.transform(test_categorical)

In [77]:
# Check the shape of the one hot encoded categorical feature datframe

print('Train data shape',train_categorical_ohe.toarray().shape)
print('Train data shape',test_categorical_ohe.toarray().shape)

Train data shape (4209, 195)
Train data shape (4209, 195)


In [78]:
# store the binary features in a new variable

train_binary_features = train_raw.iloc[:,10:]

## 3.) Feature Set 1 : Original Features With Zero Tempering

### 3.1) Concatanate All the Features -  Binary, Categorical and ID

In [79]:
# Concatanate all the features

train_data_1 = np.hstack((np.array(train_raw.ID).reshape(-1,1),train_categorical_ohe.toarray(),train_binary_features))
print('shape of the training data for Feature set 1: ',train_data_1.shape)

shape of the training data for Feature set 1:  (4209, 564)


### 3.2) RF Model Using Feature Set 1

#### 3.2.1) Fit the Model

In [80]:
# Fit a Randomforestregressor on Feature set 1

rf = RandomForestRegressor(n_estimators=20, max_depth=3, random_state=3)
rf.fit(train_data_1,y_raw)

RandomForestRegressor(max_depth=3, n_estimators=20, random_state=3)

#### 3.2.2) Get R2 score

In [81]:
# Get R2 score and adjusted R2 score for training

r2_training1 =  round(r2_score(y_raw,rf.predict(train_data_1)),3)
adjusted_r2_training1 = round(1 - (((1 - r2_training)*(len(y_raw)-1))/(len(y_raw) - train_data_1.shape[1] -1)),3)


print('R2 for training data set 1: ', r2_training1)
print('Adjusted R2 for training data set 1: ', adjusted_r2_training1)

R2 for training data set 1:  0.576
Adjusted R2 for training data set 1:  0.544


#### 3.2.3) Get Cross_val R2 score

In [82]:
# Get Cross_val R2 score and adjusted R2 score

r2_cv1 = round(np.mean(cross_val_score(rf,train_data_1,y_raw,cv=4, scoring='r2')),3)
adjusted_r2_cv1 =  round(1 - (1 - r2_cv)*(len(y_raw)-1)/(len(y_raw) - train_data_1.shape[1] -1),3)

print('R-squared cross validation: ', r2_cv1)
print('Adjusted R-squared cross validation: ', adjusted_r2_cv1)

R-squared cross validation:  0.512
Adjusted R-squared cross validation:  0.46


## 4.) Feature Set 2 : Remove Features With Very Small Individual R2

### 4.1) Discover the Least Scoring features individually

#### 4.1.1)  Use RF to get R2 scores for individual features

In [83]:
# find the R2 score when using only all the binary features.
# find the individual R2 score for each binary feature.

rf = RandomForestRegressor(n_estimators=20, max_depth=3, random_state=3) 
rf.fit(train_binary_features,y_raw)
y_pred_train = rf.predict(train_binary_features)
train_score = r2_score(y_raw,y_pred_train)
print('R2 score when we use all the binary features: ',round(train_score,4))

R2_binary_fea = {} # store the individual R2 scores of binary features in R2_binary_fea
for i in train_binary_features.columns:
    tr_data = np.array(train_binary_features[i]).reshape(-1,1)
    rf = RandomForestRegressor(n_estimators=20, max_depth=3, random_state=3)
    rf.fit(tr_data,y_raw)
    y_pred_train = rf.predict(tr_data)
    train_score = r2_score(y_raw,y_pred_train)
    R2_binary_fea[i] = train_score

R2 score when we use all the binary features:  0.5745


#### 4.1.2) Collect the least scoring features names

In [84]:
# Get binary features with a negative or very small R-squared score
binary_features_small_r2 = []
for i in (sorted(R2_binary_fea.items(), key = lambda item:item[1])):
  if i[1] < 0.0005:                 # this threshold was chosen intuitively with some trial and error 
        binary_features_small_r2.append(i[0])

### 4.2) Prepare data for Feature Set 2

In [85]:
# Prepare dataset for set 2
# Remove 'X4' because it has very small variance as seen during EDA.

categorical_features_set_2 = ['X0','X1','X2','X3','X5','X6','X8']   # 'X4' not included
train_categorical_set_2 = train_raw[categorical_features_set_2]

# perform one hot encoding for remaining categorical features.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder = encoder.fit(train_categorical_set_2)
train_categorical_ohe_set_2 = encoder.transform(train_categorical_set_2)

# Drop the binary features having a very small individual R2.
train_binary_features_set_2 = train_binary_features.drop(binary_features_small_r2, axis=1)

#concatanate ID, Categorical Features, Binary Features
train_data_2 = np.hstack((np.array(train_raw.ID).reshape(-1,1),train_categorical_ohe_set_2.toarray(),train_binary_features_set_2))
print('shape of the set 2 train data: ',train_data_2.shape)

shape of the set 2 train data:  (4209, 460)


### 4.3) RF Model Using Feature Set 2

#### 4.3.1) Fit the RF

In [86]:
# Fit a RF regressor on Feature Set 2

rf = RandomForestRegressor(n_estimators=20, max_depth=3, random_state=3)
rf.fit(train_data_2,y_raw)

RandomForestRegressor(max_depth=3, n_estimators=20, random_state=3)

#### 4.3.2) Get R2 score for Feature set 2

In [87]:
# R2 score for training data set 2

r2_training2 =  round(r2_score(y_raw,rf.predict(train_data_2)),3)
adjusted_r2_training2 = round(1 - (((1 - r2_training)*(len(y_raw)-1))/(len(y_raw) - train_data_2.shape[1] -1)),3)

print('R-squared for training data set 2: ', r2_training2)
print('Adjusted R-squared for training data set 2: ', adjusted_r2_training2)

R-squared for training data set 2:  0.576
Adjusted R-squared for training data set 2:  0.557


#### 4.3.3) Get Cross_val R2 score for set 2

In [88]:
# corss_val R2 score for set 2

r2_cv2 = round(np.mean(cross_val_score(rf,train_data_2,y_raw,cv=4, scoring='r2')),3)
adjusted_r2_cv2 =  round(1 - (((1 - r2_cv)*(len(y_raw)-1))/(len(y_raw) - train_data_2.shape[1] -1)),3)

print('cross validation R-squared  for set 2: ', r2_cv2)
print('cross validation Adjusted R-squared  for set 2: ',adjusted_r2_cv2)

cross validation R-squared  for set 2:  0.511
cross validation Adjusted R-squared  for set 2:  0.475


## 5.) Feature Set 3 : Remove the outliers

### 5.1) Prepare the data for Model3

#### 5.1.1) Remove the Outlier

In [89]:
# Remove the extreme outlier.

train_categorical_ohe_set_3 = np.delete(train_categorical_ohe_set_2.toarray(),883,0)
train_binary_features_set_3 = train_binary_features_set_2.drop(labels=883, axis=0)
ID_set_3 = np.array(train_raw.ID.drop(labels=883, axis=0)).reshape(-1,1)
y_set_3 = train_raw['y'].drop(labels=883, axis=0)

#### 5.1.2) Concatanate all features

In [90]:
# Concatanate all categorical,ID, Binary features after removing the outlier

train_data_3 = np.hstack((ID_set_3,train_categorical_ohe_set_3,train_binary_features_set_3))
print('shape of the Set 3 train data: ',train_data_3.shape)

shape of the Set 3 train data:  (4208, 460)


### 5.2) RF Regression Using Set 3

#### 5.2.1) Build the RF Regressor

In [91]:
# Fit a RF regressor on Feature Set 3 

rf = RandomForestRegressor(n_estimators=20, max_depth=3, random_state=3)
rf.fit(train_data_3,y_set_3)

RandomForestRegressor(max_depth=3, n_estimators=20, random_state=3)

#### 5.2.2) Get R2 score for Feature Set 3

In [92]:
# R2 score for training Feature set 3

r2_training3 =  round(r2_score(y_set_3,rf.predict(train_data_3)),3)
adjusted_r2_training3 = round(1 - (((1 - r2_training)*(len(y_set_3)-1))/(len(y_set_3) - train_data_3.shape[1] -1)),3)

print('R-squared for training data set 3: ', r2_training3)
print('Adjusted R-squared for training data set 3: ', adjusted_r2_training3)

R-squared for training data set 3:  0.603
Adjusted R-squared for training data set 3:  0.557


#### 5.2.3) Get Cross_val R2 Score for Set 3

In [93]:
# corss_val R2 score for set 3

r2_cv3 = round(np.mean(cross_val_score(rf,train_data_3,y_set_3,cv=4, scoring='r2')),3)
adjusted_r2_cv3 =  round(1 - (((1 - r2_cv)*(len(y_set_3)-1))/(len(y_set_3) - train_data_3.shape[1] -1)),3)

print('R-squared cross validation  for data set 3: ', r2_cv3)
print('Adjusted R-squared cross validation  for data set 3: ', adjusted_r2_cv3)

R-squared cross validation  for data set 3:  0.523
Adjusted R-squared cross validation  for data set 3:  0.475


## 6.) Feature Set 4: Remove highly correlated features

### 6.1) Get correlated features

#### 6.1.1) Get correlation matrix

In [94]:
# Correlation matrix using corr()
corr_matrix = train_binary_features_set_3.corr()
corr_matrix.head(5)

Unnamed: 0,X10,X12,X13,X14,X15,X16,X17,X19,X20,X21,...,X372,X373,X375,X376,X377,X378,X379,X380,X382,X383
X10,1.0,-0.033092,-0.028813,-0.100507,-0.002532,-0.005946,-0.010166,-0.038569,-0.047406,-0.005946,...,-0.002532,-0.01627,0.165268,-0.028625,-0.074267,-0.016874,-0.011377,-0.010482,-0.010166,-0.004741
X12,-0.033092,1.0,0.214812,-0.246596,-0.006214,-0.014588,-0.024943,-0.094629,-0.116311,-0.014588,...,-0.006214,0.176652,-0.107917,-0.070232,0.03009,-0.016053,-0.027914,-0.005572,-0.024943,-0.011631
X13,-0.028813,0.214812,1.0,-0.0832,-0.00541,-0.012701,-0.021718,-0.082394,-0.043152,0.007212,...,-0.00541,-0.034758,-0.169772,-0.061151,0.357211,-0.036048,-0.024305,0.02304,-0.021718,-0.010127
X14,-0.100507,-0.246596,-0.0832,1.0,-0.018872,-0.044305,0.012696,-0.287408,-0.353259,-0.044305,...,-0.018872,-0.054831,0.118827,0.026448,-0.097617,-0.037991,0.103299,0.007726,0.012696,0.023598
X15,-0.002532,-0.006214,-0.00541,-0.018872,1.0,-0.001116,-0.001909,-0.007242,-0.008901,-0.001116,...,-0.000476,-0.003055,-0.014922,-0.005375,0.032166,-0.003168,-0.002136,-0.001968,-0.001909,-0.00089


#### 6.1.2) Remove highly correlated features

In [95]:
# Filter out features with correlation higher than 0.95

corr_fea = []
for row_number in range (corr_matrix.shape[0]):
    for column_number in range (row_number):
        if corr_matrix.iloc[row_number,column_number]>0.95:   # this correlation threshold has been chosen intuitively
            corr_fea.append(corr_matrix.index[row_number])
corr_fea = set(corr_fea)
train_binary_features_set_4 = train_binary_features_set_3.drop(corr_fea,axis=1)

# Concatanate the ID, Categorical, Binary features
train_data_4 = np.hstack((ID_set_3,train_categorical_ohe_set_3,train_binary_features_set_4))
print('shape of the Set 4 train data: ',train_data_4.shape)

shape of the Set 4 train data:  (4208, 406)


### 6.2) RF Regression Using Set 4

 #### 6.2.1) Build the RF Regressor

In [96]:
# Fit a RF Regressor on Set 4
rf = RandomForestRegressor(n_estimators=20, max_depth=3, random_state=3)
rf.fit(train_data_4,y_set_3)

RandomForestRegressor(max_depth=3, n_estimators=20, random_state=3)

#### 6.2.2) Get R2 score

In [97]:
# Get the R2 Score for Feature Set 4

r2_training4 =  round(r2_score(y_set_3,rf.predict(train_data_4)),3)
adjusted_r2_training4 = round(1 - (((1 - r2_training)*(len(y_set_3))-1)/(len(y_set_3) - train_data_4.shape[1] -1)),3)

print('R-squared for training data set 4: ', r2_training4)
print('Adjusted R-squared for training data set 4: ', adjusted_r2_training4)

R-squared for training data set 4:  0.597
Adjusted R-squared for training data set 4:  0.563


#### 6.2.2) Get cross_val R2 Score

In [98]:
# Get the cross-val R2 Score for Feature Set 4

r2_cv4 = round(np.mean(cross_val_score(rf,train_data_4,y_set_3,cv=4, scoring='r2')),3)
adjusted_r2_cv4 =  round(1 - (((1 - r2_cv)*(len(y_set_3)-1))/(len(y_set_3) - train_data_4.shape[1] -1)),3)

print('R-squared cross validation for set 4: ', r2_cv4)
print('Adjusted R-squared cross validation  for set 4: ',adjusted_r2_cv4)

R-squared cross validation for set 4:  0.505
Adjusted R-squared cross validation  for set 4:  0.483


## 7.) Feature Set 5 - Create Interaction Features

### 7.1) Get Interaction Features

In [99]:
# Get two way and three way interactions for binary features
# Get point biserial correlation scores for all interactions

R2_int_fea = {}
for i in tqdm(range(train_binary_features_set_4.shape[1])):
    for j in range(i+1, train_binary_features_set_4.shape[1]):
        int_fea = train_binary_features_set_4[train_binary_features_set_4.columns[i]]+train_binary_features_set_4[train_binary_features_set_4.columns[j]]
        pbc = pointbiserialr(int_fea,y_set_3)
        R2_int_fea[train_binary_features_set_4.columns[i]+'+'+train_binary_features_set_4.columns[j]] = pbc[0]
        for k in range(j+1, train_binary_features_set_4.shape[1]):
            int_fea = train_binary_features_set_4[train_binary_features_set_4.columns[i]]+\
                      train_binary_features_set_4[train_binary_features_set_4.columns[j]]+\
                      train_binary_features_set_4[train_binary_features_set_4.columns[k]]
            pbc = pointbiserialr(int_fea,y_set_3)
            R2_int_fea[train_binary_features_set_4.columns[i]+'+'+train_binary_features_set_4.columns[j]+\
                     '+'+train_binary_features_set_4.columns[k]] = pbc[0]

100%|████████████████████████████████████████████████████████████████████████████████| 214/214 [10:22<00:00,  2.91s/it]


### 7.2) Seperate Best interaction features

#### 7.2.1) Sort the interaction features

In [100]:
# Point biserial ranges from -1 to +1, 0 means no correlation.
Int_fea_best = []
for i in (sorted(R2_int_fea.items(), key = lambda item:item[1])):
  if abs(i[1]) > 0.70:  # this threshold value was chosen intuitively with trial and eror
        Int_fea_best.append(i[0])
print("Interaction features that have highest point biserial scores: ", Int_fea_best[-3:])

Interaction features that have highest point biserial scores:  ['X127+X166+X276', 'X127+X166+X272', 'X136+X261+X315']


#### 7.2.2) Concatanate the Features

In [101]:
# Include the interaction featueres in the data set
train_binary_features_set_5 = train_binary_features_set_4
train_binary_features_set_5['X130+X261+X315'] = train_binary_features_set_4['X130'] + train_binary_features_set_4['X261']+train_binary_features_set_4['X315']
train_binary_features_set_5['X136+X179+X261'] = train_binary_features_set_4['X136'] + train_binary_features_set_4['X179']+train_binary_features_set_4['X261']
train_binary_features_set_5['X136+X261+X315'] = train_binary_features_set_4['X136'] + train_binary_features_set_4['X261']+train_binary_features_set_4['X315']

train_data_5 = np.hstack((ID_set_3,train_categorical_ohe_set_3,train_binary_features_set_5))
print('shape of the Set 5 train data: ',train_data_5.shape)

shape of the Set 5 train data:  (4208, 409)


### 7.3) RF Using Set 5

 #### 7.3.1) Build the RF Regressor

In [102]:
# Fit a RF Regressor on Set 5

rf = RandomForestRegressor(n_estimators=20, max_depth=3, random_state=3)
rf.fit(train_data_5,y_set_3)

RandomForestRegressor(max_depth=3, n_estimators=20, random_state=3)

#### 7.3.2) Get R2 score

In [103]:
# Get the R2 Score for Feature Set 5

r2_training5 =  round(r2_score(y_set_3,rf.predict(train_data_5)),3)
adjusted_r2_training5 = round(1 - (((1 - r2_training)*(len(y_set_3)-1))/(len(y_set_3) - train_data_5.shape[1] -1)),3)

print('R-squared for training data set 5: ', r2_training5)
print('Adjusted R-squared for training data set 5: ', adjusted_r2_training5)


R-squared for training data set 5:  0.605
Adjusted R-squared for training data set 5:  0.563


#### 7.3.2) Get cross_val R2 Score

In [104]:
# Get the cross_val R2 Score for Feature Set 5

r2_cv5 = round(np.mean(cross_val_score(rf,train_data_5,y_set_3,cv=4, scoring='r2')),3)
adjusted_r2_cv5 =  round(1 - (((1 - r2_cv)*(len(y_set_3)-1))/(len(y_set_3) - train_data_5.shape[1] -1)),3)

print('R-squared cross validation  for set 5: ', r2_cv5)
print('Adjusted R-squared cross validation for set 5: ', adjusted_r2_cv5)

R-squared cross validation  for set 5:  0.531
Adjusted R-squared cross validation for set 5:  0.482


## 8.) Feature Set 6 - Create PCA Features

### 8.1) Perform PCA

In [105]:
# Create a new PCA features: PCA Features. 

pca = PCA(n_components=5, random_state=3)
pca_fea = pca.fit_transform(train_binary_features.drop(labels=883,axis=0))
print('shape of the pca features array: ', pca_fea.shape)

shape of the pca features array:  (4208, 5)


### 8.2) Prepare the train data set 6

In [106]:
# Concatanate the features
train_data_6 = np.hstack((ID_set_3,train_categorical_ohe_set_3,train_binary_features_set_5,pca_fea))
print("shape of the train data set 6: ", train_data_6.shape)

shape of the train data set 6:  (4208, 414)


### 8.3) RandomForest On Set 6

 #### 8.3.1) Build the RF Regressor

In [107]:
# Fit a RF Regressor on Set 6

rf = RandomForestRegressor(n_estimators=20, max_depth=3, random_state=3)
rf.fit(train_data_6,y_set_3)

RandomForestRegressor(max_depth=3, n_estimators=20, random_state=3)

#### 8.3.2) Get R2 score

In [108]:
# Get the R2 Score for Feature Set 6

r2_training6 =  round(r2_score(y_set_3,rf.predict(train_data_6)),3)
adjusted_r2_training6 = round(1 - (((1 - r2_training)*(len(y_set_3)-1))/(len(y_set_3) - train_data_6.shape[1] -1)),3)

print('R-squared for training data for set 6: ',r2_training6)
print('Adjusted R-squared for training data set 6: ', adjusted_r2_training6)

R-squared for training data for set 6:  0.605
Adjusted R-squared for training data set 6:  0.562


#### 8.3.2) Get cross_val R2 Score

In [109]:
# Get the cross_val R2 Score for Feature Set 6

r2_cv6 = round(np.mean(cross_val_score(rf,train_data_6,y_set_3,cv=4, scoring='r2')),3)
adjusted_r2_cv6 =  round(1 - (((1 - r2_cv)*(len(y_set_3)-1))/(len(y_set_3) - train_data_6.shape[1] -1)),3)

print('R-squared cross validation  for set 6: ', r2_cv6)
print('Adjusted R-squared cross validation  for set 6: ', adjusted_r2_cv6)

R-squared cross validation  for set 6:  0.533
Adjusted R-squared cross validation  for set 6:  0.482


## 9.) Results of Feature Engineering

In [110]:
# Colate the R-squared scores for all feature-sets in table

Fe_Results = PrettyTable(["S.No.", "Feature Set", "R2", "Cross_val R2", "Adjusted R2", "Cross_val Adjusted R2"])
Fe_Results.add_row(["1", "With Original Features", r2_training1, adjusted_r2_training1, r2_cv1, adjusted_r2_cv1])
Fe_Results.add_row(["2", "Remove Small R2 Features",  r2_training2, adjusted_r2_training2, r2_cv2, adjusted_r2_cv2])
Fe_Results.add_row(["3", "Remove Outliers",  r2_training3, adjusted_r2_training3, r2_cv3, adjusted_r2_cv3])
Fe_Results.add_row(["4", "Remove Correlated Features",  r2_training4, adjusted_r2_training4, r2_cv4, adjusted_r2_cv4])
Fe_Results.add_row(["5", "Add Interaction Features",  r2_training5, adjusted_r2_training5, r2_cv5, adjusted_r2_cv5])
Fe_Results.add_row(["6", "Add PCA Features", r2_training6, adjusted_r2_training6, r2_cv6, adjusted_r2_cv6])

In [111]:
# Show the result table
print(Fe_Results)

+-------+----------------------------+-------+--------------+-------------+-----------------------+
| S.No. |        Feature Set         |   R2  | Cross_val R2 | Adjusted R2 | Cross_val Adjusted R2 |
+-------+----------------------------+-------+--------------+-------------+-----------------------+
|   1   |   With Original Features   | 0.576 |    0.544     |    0.512    |          0.46         |
|   2   |  Remove Small R2 Features  | 0.576 |    0.557     |    0.511    |         0.475         |
|   3   |      Remove Outliers       | 0.603 |    0.557     |    0.523    |         0.475         |
|   4   | Remove Correlated Featyres | 0.597 |    0.563     |    0.505    |         0.483         |
|   5   |  Add Interaction Features  | 0.605 |    0.563     |    0.531    |         0.482         |
|   6   |      Add PCA Features      | 0.605 |    0.562     |    0.533    |         0.482         |
+-------+----------------------------+-------+--------------+-------------+-----------------------+
