# ISYS2407 Information Systems Solutions & Design

# Discretization, One-Hot encoding and model building


###### © France and Christopher Cheong 2020

# 1 Import libraries

In [None]:
# Import the library for pickling
import joblib

# Library needed for counting categorical values
import collections

# Also need pandas here
import pandas as pd

# Library for replacing labels with numbers
from sklearn.preprocessing import LabelEncoder

# Libraries needed for one-hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
# Just use pandas get_dummies which is a lot easier to use

# Library for splitting the data into train and test sets
from sklearn.model_selection import train_test_split 

# Import the model library
from sklearn.linear_model import LogisticRegression

# Import the libraries for computing the metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

# Import the plotting libraries
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns

# 2 Load the cleaned data

#### Pickled file must exist in your folder

In [None]:
# Restore the cleaned DataFrame from its pickle file.
# The relative path means the file must be in the same folder as this notebook.
# NOTE: unpickling can execute arbitrary code - only load pickle files that you
# created yourself or that come from a source you trust.
diabetes_df = joblib.load(filename='diabetes-cleaned.pkl')

# Preview the first rows to confirm the data loaded as expected
diabetes_df.head()

# 3 Split the data into training and testing set

In [None]:
# Build the feature matrix X (uppercase by convention: it holds several columns)
#
# Features are the variables that influence the target/label, i.e. every column
# except the target column. A subset of previously identified "best" features
# can be used instead - it is worth experimenting with both the full set and
# the best subset.
feature_cols = [
    'num_pregnancies', 'glucose', 'blood_pressure', 'skin_thickness',
    'insulin', 'bmi', 'pedigree', 'age',
]
X = diabetes_df[feature_cols]
#print('X:\n', X)

# The label/target vector y (lowercase by convention: it is a single column)
y = diabetes_df['outcome']
#print('y:\n', y)

# Hold out 20% of the rows for testing; passing a fixed integer as
# random_state makes the split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# 4 Data Transformations to improve modelling performance

#### Should be performed after the data has been split into training and testing sets to prevent information leakage (train-test contamination)

## 4.1 Discretization of continuous variables

In [None]:
# When checking the number of unique values, the blood pressure column was found to contain 47 values
# This is evidence that it's a continuous variable

# Some classification algorithms do not work well with continuous values - too many values
# So, depending on the classification algorithm you are using, you might need to categorise these values
# How? This needs some domain knowledge
# For blood pressure we could use the following:
# less than 80 = normal, between 80-89 = high, greater than 89 = very high

# Define a function that turns one blood pressure reading into a category
# NOTE: Do not create too many categories as this will affect the learning algorithms
def bp_category(df):
    """Return the blood pressure category for a single row.

    The rule is:
        less than 80        -> "bp_normal"
        between 80 and 89   -> "bp_high"
        greater than 89     -> "bp_very_high"

    Parameters
    ----------
    df : mapping-like (e.g. one DataFrame row passed by ``apply(..., axis=1)``)
        Must support ``df["blood_pressure"]`` and yield a single number.

    Returns
    -------
    str or None
        The category label, or ``None`` when the value cannot be ranked
        (e.g. NaN, for which every comparison is False).
    """
    bp = df["blood_pressure"]

    if bp < 80:
        return "bp_normal"
    if bp <= 89:
        # Reached only when bp >= 80, so this covers the 80-89 band inclusive
        return "bp_high"
    if bp > 89:
        return "bp_very_high"

    # No comparison matched (NaN): leave the category missing rather than guess
    return None

In [None]:
# Add the blood pressure category to the training set
#
# DataFrame.apply(..., axis=1) works row-wise: it calls bp_category() once per
# row, passing that row as the argument, and collects the returned labels.
# The labels are stored in a new column named "blood_pressure_category".
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html
#
# Work on an explicit copy first. Without it pandas may warn:
#   "A value is trying to be set on a copy of a slice from a DataFrame.
#    Try using .loc[row_indexer,col_indexer] = value instead"
X_train = X_train.copy()
X_train["blood_pressure_category"] = X_train.apply(bp_category, axis=1)

# The raw "blood_pressure" column is now represented by its category - remove it
X_train = X_train.drop(columns=['blood_pressure'])

# Check
X_train.head()

In [None]:
# Add the blood pressure category to the testing set
# (explicit copy first, so that adding a column does not trigger pandas'
# "set on a copy of a slice" warning)
X_test = X_test.copy()

# axis=1 -> bp_category() is called once per row with that row as argument
X_test["blood_pressure_category"] = X_test.apply(bp_category, axis=1)

# The raw "blood_pressure" column is now represented by its category - remove it
X_test = X_test.drop(columns=['blood_pressure'])

# Check
X_test.head()

## 4.2 One-hot encoding of columns with multiple categories

#### Note: Since one-hot encoding generates lots of dummy variables (columns), this has an impact on certain machine learning algorithms - only use when justified (need to experiment to find out)

In [None]:
# 1. Select the columns to one-hot encode
# Make sure the columns are categorical columns with multiple labels
# It does not matter whether the labels are words or numbers
# Better list all the columns and comment out the ones you don't need
columns_to_onehot_encode = [
    #'num_pregnancies',
    #'glucose',
    #'blood_pressure', # this column no longer exists
    'blood_pressure_category',
    #'skin_thickness',
    #'insulin',
    #'bmi',
    #'pedigree',
    #'age'
]

# Prefix used for the dummy columns generated from each encoded column.
# A column that is not listed here falls back to its own name as prefix, so
# dummy columns coming from different source columns cannot collide.
onehot_prefixes = {
    'blood_pressure_category': 'bpc',
}

# 2. Instantiate an encoder
# Only needed for the sklearn alternative described at the end of this cell;
# the pandas approach used below does not use it.
enc = LabelBinarizer()

# 3. Encode the training and testing columns
# The set of dummy columns is decided by the TRAINING data only, and the test
# data is then forced onto exactly those columns. Calling get_dummies() on
# each set independently is not enough: if a category is absent from one of
# the sets, X_train and X_test end up with different columns and the model
# fitted on X_train cannot be applied to X_test.
for col in columns_to_onehot_encode:
    prefix = onehot_prefixes.get(col, col)

    # Dummy columns learnt from the training data
    train_dummies = pd.get_dummies(X_train[col], prefix=prefix)

    # Dummy columns for the test data, aligned to the training columns:
    #  - a category missing from the test set becomes a column filled with 0
    #  - a category never seen in training is dropped
    test_dummies = pd.get_dummies(X_test[col], prefix=prefix)
    test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)

    # Replace the original column by its dummy columns in both sets
    X_train = pd.concat([X_train.drop(columns=[col]), train_dummies], axis=1)
    X_test = pd.concat([X_test.drop(columns=[col]), test_dummies], axis=1)

# Alternative: sklearn encoders (fit on the training column, then transform
# the training and testing columns). They return numpy arrays without column
# names, so extra code is needed to name a variable number of columns before
# the arrays can be converted back to a dataframe. The pandas solution above
# is simpler.

# Check
X_train.head()

In [None]:
# Check: preview the first rows of the testing set
X_test.head()

## 4.3 Other data transformations e.g. label encoding, scaling, etc

# 5 Modelling

## 5.1 Initial/baseline model

### 5.1.1 Fit initial model

In [None]:
# The two main steps are:
# 1: Instantiate model and fit on training data
# 2: Predict using test data




### 5.1.2 Evaluate model

In [None]:
# Compute performance metrics of the baseline model



# Continue with the rest of the modelling steps - e.g.

## 5.2 Improved model

### 5.2.1 Use grid search to find best hyperparameters (details in grid search notebook)

### 5.2.2 Fit improved model

### 5.2.3 Evaluate  improved model

# 6 Save the model for further evaluation (in another notebook)
#### Should also save the train/test sets

In [None]:
"""
# Uncomment the code if you want to use this i.e. delete the triple quotes at the start and end of this cell

# Pickle the model for later evaluation
joblib.dump(lr_model, 'model-xxx-xxx.pkl')  # Use the right model name and a suitable file name

# Also need to pickle the training and testing sets
joblib.dump(X_train, 'X_train.pkl') 
joblib.dump(X_test, 'X_test.pkl') 
joblib.dump(y_train, 'y_train.pkl') 
joblib.dump(y_test, 'y_test.pkl')

# Note: make sure that the model and the training/testing sets match 
# i.e. the model was built using this particular training set
# and the testing set matches this particular training set
"""