# 1. Import libraries

In [1]:
# Library for pickling
import joblib

# pandas and numpy are also needed for some tasks
import pandas as pd
import numpy as np

# Library for splitting the data into train and test sets
from sklearn.model_selection import train_test_split 

# Libraries to select k best features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# 2. Load the cleaned and pickled data

In [2]:
# Restore the cleaned DataFrame that was pickled to disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load a .pkl file that comes from a trusted source.
personal_loan_df = joblib.load('personal_loan_cleaned.pkl')

# Preview the first rows to confirm the data loaded as expected
personal_loan_df.head()

Unnamed: 0,customer_id,age,yrs_experience,family_size,education_level,income,mortgage_amt,credit_card_acct,credit_card_spend,share_trading_acct,fixed_deposit_acct,online_acct,personal_loan
0,3815.0,40.0,9.0,3.0,Graduate,60.0,0.0,yes,1.7,0,0,yes,0
1,5737.0,65.0,27.0,4.0,Advanced or Professional,36.0,90.0,no,0.0,0,0,yes,0
3,4734.0,29.0,14.0,1.0,Undergraduate,52.0,0.0,no,0.0,1,0,yes,0
4,2551.0,43.0,23.0,4.0,Graduate,33.0,0.0,no,0.0,0,0,yes,0
5,4491.0,44.0,18.0,3.0,Advanced or Professional,49.0,0.0,no,0.0,0,0,yes,0


In [3]:
# Show the (rows, columns) shape followed by the column labels, one per line
print(personal_loan_df.shape, personal_loan_df.columns, sep='\n')

(5363, 13)
Index(['customer_id', 'age', 'yrs_experience', 'family_size',
       'education_level', 'income', 'mortgage_amt', 'credit_card_acct',
       'credit_card_spend', 'share_trading_acct', 'fixed_deposit_acct',
       'online_acct', 'personal_loan'],
      dtype='object')


In [4]:
# Candidate predictors for the feature-selection experiment.
# Four of these eight features were found to have a strong correlation;
# all eight are kept here to see which ones the score function actually selects.
# NOTE(review): customer_id looks like an identifier rather than a real
# predictor — confirm it is included on purpose.
feature_cols = [
    'customer_id', 'age', 'yrs_experience', 'family_size',
    'income', 'mortgage_amt', 'credit_card_spend', 'fixed_deposit_acct',
]

# Feature matrix — uppercase X by convention because it holds several columns
X = personal_loan_df[feature_cols]

# Target vector — lowercase y by convention because it holds a single column
y = personal_loan_df['personal_loan']

# Hold out 20% of the rows for testing; the fixed integer random_state
# makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# Experiment with feature selection

# Attempt 1

In [5]:
X_train.shape

(4290, 8)

In [6]:
# Attempt 1 of a trial-and-error search over k (the number of best features to keep).
# X_train has 8 features, so start with k=8 as a baseline that keeps all of them.
k_best = SelectKBest(score_func=chi2, k=8)

# Fit the selector on the training split and keep only the selected columns
X_selected = k_best.fit_transform(X_train, y_train)

# Report the result's type and its shape, one per line
print(type(X_selected), X_selected.shape, sep='\n')

# Suppress scientific notation so the array preview is readable
np.set_printoptions(suppress=True)

# Preview the first 5 rows of the selected features
print(X_selected[:5, :])

# Show the first 5 rows of all the features for comparison
X_train.head(5)

<class 'numpy.ndarray'>
(4290, 8)
[[1832.    56.     4.     3.   118.   131.     3.2    0. ]
 [1150.    30.    29.     3.    23.     0.     3.9    0. ]
 [5610.    58.     0.     4.   120.     0.     0.     0. ]
 [4833.    55.    41.     3.    69.   107.     2.5    0. ]
 [2661.    40.    36.     2.    89.    90.     3.7    0. ]]


Unnamed: 0,customer_id,age,yrs_experience,family_size,income,mortgage_amt,credit_card_spend,fixed_deposit_acct
5152,1832.0,56.0,4.0,3.0,118.0,131.0,3.2,0
2820,1150.0,30.0,29.0,3.0,23.0,0.0,3.9,0
4450,5610.0,58.0,0.0,4.0,120.0,0.0,0.0,0
2715,4833.0,55.0,41.0,3.0,69.0,107.0,2.5,0
210,2661.0,40.0,36.0,2.0,89.0,90.0,3.7,0


# Attempt 2

In [7]:
# Attempt 2: keep the 7 highest-scoring features to see which one is dropped first
k_best = SelectKBest(score_func=chi2, k=7)

# Fit the selector on the training split and keep only the selected columns
X_selected = k_best.fit_transform(X_train, y_train)

# Report the result's type and its shape, one per line
print(type(X_selected), X_selected.shape, sep='\n')

# Preview the first 5 rows of the selected features
print(X_selected[:5, :])

# Show the first 5 rows of all the features for comparison
X_train.head(5)

<class 'numpy.ndarray'>
(4290, 7)
[[1832.     4.     3.   118.   131.     3.2    0. ]
 [1150.    29.     3.    23.     0.     3.9    0. ]
 [5610.     0.     4.   120.     0.     0.     0. ]
 [4833.    41.     3.    69.   107.     2.5    0. ]
 [2661.    36.     2.    89.    90.     3.7    0. ]]


Unnamed: 0,customer_id,age,yrs_experience,family_size,income,mortgage_amt,credit_card_spend,fixed_deposit_acct
5152,1832.0,56.0,4.0,3.0,118.0,131.0,3.2,0
2820,1150.0,30.0,29.0,3.0,23.0,0.0,3.9,0
4450,5610.0,58.0,0.0,4.0,120.0,0.0,0.0,0
2715,4833.0,55.0,41.0,3.0,69.0,107.0,2.5,0
210,2661.0,40.0,36.0,2.0,89.0,90.0,3.7,0


# Attempt 3

In [8]:
# Attempt 3: keep the 6 highest-scoring features
k_best = SelectKBest(score_func=chi2, k=6)

# Fit the selector on the training split and keep only the selected columns
X_selected = k_best.fit_transform(X_train, y_train)

# Report the result's type and its shape, one per line
print(type(X_selected), X_selected.shape, sep='\n')

# Preview the first 5 rows of the selected features
print(X_selected[:5, :])

# Show the first 5 rows of all the features for comparison
X_train.head(5)

<class 'numpy.ndarray'>
(4290, 6)
[[  4.    3.  118.  131.    3.2   0. ]
 [ 29.    3.   23.    0.    3.9   0. ]
 [  0.    4.  120.    0.    0.    0. ]
 [ 41.    3.   69.  107.    2.5   0. ]
 [ 36.    2.   89.   90.    3.7   0. ]]


Unnamed: 0,customer_id,age,yrs_experience,family_size,income,mortgage_amt,credit_card_spend,fixed_deposit_acct
5152,1832.0,56.0,4.0,3.0,118.0,131.0,3.2,0
2820,1150.0,30.0,29.0,3.0,23.0,0.0,3.9,0
4450,5610.0,58.0,0.0,4.0,120.0,0.0,0.0,0
2715,4833.0,55.0,41.0,3.0,69.0,107.0,2.5,0
210,2661.0,40.0,36.0,2.0,89.0,90.0,3.7,0


# Attempt 4

In [9]:
# Attempt 4: keep the 5 highest-scoring features
k_best = SelectKBest(score_func=chi2, k=5)

# Fit the selector on the training split and keep only the selected columns
X_selected = k_best.fit_transform(X_train, y_train)

# Report the result's type and its shape, one per line
print(type(X_selected), X_selected.shape, sep='\n')

# Preview the first 5 rows of the selected features
print(X_selected[:5, :])

# Show the first 5 rows of all the features for comparison
X_train.head(5)

<class 'numpy.ndarray'>
(4290, 5)
[[  4.  118.  131.    3.2   0. ]
 [ 29.   23.    0.    3.9   0. ]
 [  0.  120.    0.    0.    0. ]
 [ 41.   69.  107.    2.5   0. ]
 [ 36.   89.   90.    3.7   0. ]]


Unnamed: 0,customer_id,age,yrs_experience,family_size,income,mortgage_amt,credit_card_spend,fixed_deposit_acct
5152,1832.0,56.0,4.0,3.0,118.0,131.0,3.2,0
2820,1150.0,30.0,29.0,3.0,23.0,0.0,3.9,0
4450,5610.0,58.0,0.0,4.0,120.0,0.0,0.0,0
2715,4833.0,55.0,41.0,3.0,69.0,107.0,2.5,0
210,2661.0,40.0,36.0,2.0,89.0,90.0,3.7,0


# Attempt 5

In [10]:
# Attempt 5: keep the 4 highest-scoring features
k_best = SelectKBest(score_func=chi2, k=4)

# Fit the selector on the training split and keep only the selected columns
X_selected = k_best.fit_transform(X_train, y_train)

# Report the result's type and its shape, one per line
print(type(X_selected), X_selected.shape, sep='\n')

# Preview the first 5 rows of the selected features
print(X_selected[:5, :])

# Show the first 5 rows of all the features for comparison
X_train.head(5)

<class 'numpy.ndarray'>
(4290, 4)
[[118.  131.    3.2   0. ]
 [ 23.    0.    3.9   0. ]
 [120.    0.    0.    0. ]
 [ 69.  107.    2.5   0. ]
 [ 89.   90.    3.7   0. ]]


Unnamed: 0,customer_id,age,yrs_experience,family_size,income,mortgage_amt,credit_card_spend,fixed_deposit_acct
5152,1832.0,56.0,4.0,3.0,118.0,131.0,3.2,0
2820,1150.0,30.0,29.0,3.0,23.0,0.0,3.9,0
4450,5610.0,58.0,0.0,4.0,120.0,0.0,0.0,0
2715,4833.0,55.0,41.0,3.0,69.0,107.0,2.5,0
210,2661.0,40.0,36.0,2.0,89.0,90.0,3.7,0


In the 5th attempt, with k=4, all the other features were removed and only the four features with the strongest correlation remained: `income`, `mortgage_amt`, `credit_card_spend` and `fixed_deposit_acct`.

# Attempt 6

In [11]:
# Attempt 6: keep only the 3 highest-scoring features
k_best = SelectKBest(score_func=chi2, k=3)

# Fit the selector on the training split and keep only the selected columns
X_selected = k_best.fit_transform(X_train, y_train)

# Report the result's type and its shape, one per line
print(type(X_selected), X_selected.shape, sep='\n')

# Preview the first 5 rows of the selected features
print(X_selected[:5, :])

# Show the first 5 rows of all the features for comparison
X_train.head(5)

<class 'numpy.ndarray'>
(4290, 3)
[[118.  131.    3.2]
 [ 23.    0.    3.9]
 [120.    0.    0. ]
 [ 69.  107.    2.5]
 [ 89.   90.    3.7]]


Unnamed: 0,customer_id,age,yrs_experience,family_size,income,mortgage_amt,credit_card_spend,fixed_deposit_acct
5152,1832.0,56.0,4.0,3.0,118.0,131.0,3.2,0
2820,1150.0,30.0,29.0,3.0,23.0,0.0,3.9,0
4450,5610.0,58.0,0.0,4.0,120.0,0.0,0.0,0
2715,4833.0,55.0,41.0,3.0,69.0,107.0,2.5,0
210,2661.0,40.0,36.0,2.0,89.0,90.0,3.7,0
