# ISYS2407 Information Systems Solutions & Design

# Binary Classification Algorithms

###### © France and Christopher Cheong 2020

# 1 Import libraries

In [None]:
# Library for pickling
import joblib

# Library for splitting the data into train and test sets
from sklearn.model_selection import train_test_split 

# Other libraries will be imported later as and when they are needed

# 2 Load the cleaned data

#### Pickled file must exist in your folder

In [None]:
# Read the cleaned diabetes data back from its pickled file.
# The path is relative, so 'diabetes-cleaned.pkl' must be in the working folder.
# NOTE: only unpickle files that come from a source you trust.
diabetes_df = joblib.load('diabetes-cleaned.pkl')

# Sanity check: show the first 5 rows
diabetes_df.head(5)

# 3 Split the data into training and testing sets

In [None]:
# Features go in X (uppercase because there are multiple features).
# Features are the variables that affect the target/label, so normally this is
# every column except the target column. A subset previously identified as the
# best features can be used instead; it is worth experimenting with both the
# full set and the best subset.
feature_cols = [
    'num_pregnancies',
    'glucose',
    'blood_pressure',
    'skin_thickness',
    'insulin',
    'bmi',
    'pedigree',
    'age',
]
X = diabetes_df[feature_cols]
# print('X:\n', X)

# The label/target goes in y (lowercase because it's a single column)
y = diabetes_df['outcome']
# print('y:\n', y)

# Split into train/test sets: 20% of the rows are kept for testing, and an
# integer random_state is passed so the split is reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

In [None]:
# Check the split: for the training set and then the test set, print the shape
# of the features followed by the shape of the matching labels
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(split_X.shape, split_y.shape)

# 4 Algorithm 1:  Logistic Regression

In [None]:
# Bring in the logistic regression classifier
from sklearn.linear_model import LogisticRegression

# Create the model, then fit it on the training data.
# max_iter is raised to 200 (the default is 100) because fitting with the
# default produced the warning "TOTAL NO. of ITERATIONS REACHED LIMIT".
# The two steps below could also be chained into a single statement:
#   LogisticRegression(...).fit(X_train, y_train)
lr_model = LogisticRegression(solver='lbfgs', max_iter=200)
lr_model.fit(X_train, y_train)

# Predict the labels of the test data
y_pred = lr_model.predict(X_test)

# Quick check: print the first 5 predictions.
# The slice [:5] takes the elements from the beginning up to index 5 (not included).
# Slicing examples: https://www.w3schools.com/python/numpy_array_slicing.asp
print(y_pred[:5])

# 4 Algorithm 2:  KNN

In [None]:
# Bring in the k-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier

# Create a classifier that uses 5 neighbours, then fit it on the training data
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Predict the labels of the test data
y_pred = knn_model.predict(X_test)

# Quick check: print the first 5 predictions
print(y_pred[:5])

# 5 Algorithm 3: SVM

In [None]:
# Bring in the support vector classifier
from sklearn.svm import SVC

# Create the model, then fit it on the training data:
# - kernel='linear' selects a linear kernel
# - class_weight='balanced' weights each class inversely to its frequency,
#   so mistakes on the less frequent class are penalized more
# - probability=True enables probability estimates
svm_model = SVC(kernel='linear', class_weight='balanced', probability=True)
svm_model.fit(X_train, y_train)

# Predict the labels of the test data
y_pred = svm_model.predict(X_test)

# Quick check: print the first 5 predictions
print(y_pred[:5])

# 6 Algorithm 4: Decision Trees (Random Forests)

In [None]:
# Import the model library
from sklearn.ensemble import RandomForestClassifier

# Instantiate model (100 trees) and fit on training data.
# A random forest is built with randomness, so without a seed the predictions
# can differ from run to run. random_state is fixed to the same int used for
# the train/test split so that the results are reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2).fit(X_train, y_train)

# Predict using test data
y_pred = rf_model.predict(X_test)

# Quick check: print the first 5 predictions
print(y_pred[:5])

# 7 Algorithm 5:  Naive Bayes

In [None]:
# Import the model library
from sklearn.naive_bayes import GaussianNB

# Instantiate model and fit on training data
nb_model = GaussianNB().fit(X_train, y_train)

# Predict using test data.
# This must use nb_model: predicting with rf_model here would silently report
# the random forest's predictions as if they were the Naive Bayes results.
y_pred = nb_model.predict(X_test)

# Quick check: print the first 5 predictions
print(y_pred[:5])