# ISYS2407 Information Systems Solutions & Design

# Feature Selection

###### © France and Christopher Cheong 2020

# 1 Import libraries

In [None]:
# Library for pickling
import joblib

# Also need pandas and nympy for some tasks
import pandas as pd
import numpy as np

# Library for splitting the data into train and test sets
from sklearn.model_selection import train_test_split 

# Libraries to select k best features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# 2 Load the cleaned data set

#### Pickled file must exist in your folder

In [None]:
# Load the cleaned DataFrame from the pickle file (path is relative to the
# notebook's working directory).
# NOTE: joblib.load unpickles the file, so only load files you created or trust.
diabetes_df = joblib.load('diabetes-cleaned.pkl')   

In [None]:
# Sanity check: preview the first rows of the loaded DataFrame
diabetes_df.head()

In [None]:
# Sanity check: report the dimensions (rows, columns), then the column labels
print(diabetes_df.shape, diabetes_df.columns, sep='\n')

# 3 Split the data into training and testing set

In [None]:
# X (uppercase: a table of several features) holds the inputs that
# influence the target/label, i.e. every column except the target itself
feature_cols = [
    'num_pregnancies', 'glucose', 'blood_pressure', 'skin_thickness',
    'insulin', 'bmi', 'pedigree', 'age',
]
X = diabetes_df[feature_cols]
#print('X:\n', X)

# y (lowercase: a single column) holds the label/target to predict
y = diabetes_df['outcome']
#print('y:\n', y)

# Hold out 20% of the rows for testing; passing an int as random_state
# makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# 4 Feature selection - trial-and-error process

#### Must be done after the data is split into training and testing sets and before scaling and on the training set only

In [None]:
# Expected (614, 8) ==> 614 training rows, 8 features
X_train.shape

## 4.1 Attempt 1

In [None]:
# This is a trial and error process
# There are 8 features in this data set
# First let's try k=7
# Keep the fitted selector so we can ask it which columns it retained
selector = SelectKBest(chi2, k=7)
X_selected = selector.fit_transform(X_train, y_train)

# Print the type and shape of the selected features
print(type(X_selected))
print(X_selected.shape)

# Name the selected features directly rather than matching values by eye
# against X_train.head(): get_support() returns a boolean mask over the
# input columns (True = kept)
print('Selected features:', list(X_train.columns[selector.get_support()]))

# Print the first 5 rows of the selected features
#np.set_printoptions(suppress=True) # Suppress scientific notation when printing
print(X_selected[:5])

# Print the first 5 rows of all the features in the dataset
X_train.head()

## 4.2 Attempt 2

In [None]:
# Try k=6
# Keep the fitted selector so we can ask it which columns it retained
selector = SelectKBest(chi2, k=6)
X_selected = selector.fit_transform(X_train, y_train)

# Print the type and shape of the selected features
print(type(X_selected))
print(X_selected.shape)

# Name the selected features directly rather than matching values by eye
# against X_train.head(): get_support() returns a boolean mask over the
# input columns (True = kept)
print('Selected features:', list(X_train.columns[selector.get_support()]))

# Print the first 5 rows of the selected features
print(X_selected[:5])

# Print the first 5 rows of all the features in the dataset
X_train.head()

## 4.3 Attempt 3

In [None]:
# Try k=5
# Keep the fitted selector so we can ask it which columns it retained
selector = SelectKBest(chi2, k=5)
X_selected = selector.fit_transform(X_train, y_train)

# Print the type and shape of the selected features
print(type(X_selected))
print(X_selected.shape)

# Name the selected features directly rather than matching values by eye
# against X_train.head(): get_support() returns a boolean mask over the
# input columns (True = kept)
print('Selected features:', list(X_train.columns[selector.get_support()]))

# Print the first 5 rows of the selected features
print(X_selected[:5])

# Print the first 5 rows of all the features in the dataset
X_train.head()

## 4.4 Attempt 4

In [None]:
# Try k=4
# Keep the fitted selector so we can ask it which columns it retained
selector = SelectKBest(chi2, k=4)
X_selected = selector.fit_transform(X_train, y_train)

# Print the type and shape of the selected features
print(type(X_selected))
print(X_selected.shape)

# Name the selected features directly rather than matching values by eye
# against X_train.head(): get_support() returns a boolean mask over the
# input columns (True = kept)
print('Selected features:', list(X_train.columns[selector.get_support()]))

# Print the first 5 rows of the selected features
print(X_selected[:5])

# Print the first 5 rows of all the features in the dataset
X_train.head()

#### 4.1 What are the best features for building predictive models?

From the analysis performed, it seems that the best features are: glucose, skin_thickness, bmi, age.
    
However, when performing the modelling part, you may experiment with models built using both the full set of features as well as the best features.    