# Classification Models Project

### Setup imports and encode values

In [None]:
import os
import numpy as np
import pandas as pd
import collections
from sklearn import preprocessing
import matplotlib.pyplot as plt
import shutil
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

def encode_text_index(data, name):
    """Replace the text values in column ``name`` of ``data`` with integer codes.

    The column is overwritten in place with the codes produced by
    ``preprocessing.LabelEncoder``. The encoder's ``classes_`` array is
    returned so callers can map a code back to its original text label.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(data[name])
    data[name] = encoder.transform(data[name])
    return encoder.classes_

## Classification Models on Titanic dataset

In [None]:
titanic_train = pd.read_csv("./titanic_train.csv")
# Drop passenger name, ticket number, cabin, embarked.
titanic_train.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'], inplace=True)
# Replace NaNs in Age with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place form is a chained update, which pandas 2.x
# warns about and which does not modify the frame under Copy-on-Write.
titanic_train['Age'] = titanic_train['Age'].fillna(titanic_train['Age'].mean())
# Encode 'Sex' column as a single indicator (drop_first keeps one dummy column
# for a two-valued column).
titanic_train['Sex'] = pd.get_dummies(titanic_train['Sex'], drop_first=True)
# Last expression of the cell: the notebook displays the first 10 rows.
titanic_train.head(10)

# Prepare training features and labels.

In [None]:
# Feature matrix and target vector as plain NumPy arrays.
titanic_feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare']
X_titanic = titanic_train[titanic_feature_cols].values
y_titanic = titanic_train['Survived'].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic,
    y_titanic,
    test_size=0.3,
    random_state=0,
)

# Report the size of each piece of the split.
for shape_label, split_array in (
    ("X_train shape: ", X_train),
    ("X_test shape: ", X_test),
    ("y_train shape: ", y_train),
    ("y_test shape: ", y_test),
):
    print(shape_label, split_array.shape)

# Last expression of the cell: show the first five training rows.
X_train[:5]

# Apply decision tree similar to Tutorial_6_Classification.ipynb

In [None]:
from sklearn import tree

# Entropy-criterion decision tree capped at depth 3, trained on the Titanic
# training split (fit() returns the fitted estimator itself).
titanic_clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
).fit(X_train, y_train)

# Predicted classes for the held-out rows; the bare name on the last line makes
# the notebook display them.
titanic_predictions = titanic_clf.predict(X_test)
titanic_predictions


# Plot confusion matrix for titanic classifier

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score,recall_score,classification_report, ConfusionMatrixDisplay, confusion_matrix

def classification_report_and_cm(y_test, y_pred, display_labels=("Not survived", "Survived")):
    """Print test-set metrics and plot the confusion matrix for a classifier.

    Prints accuracy, F1, precision and recall (each to two decimals), followed
    by scikit-learn's full classification report, then draws the confusion
    matrix with matplotlib.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.
    display_labels : sequence of str or None, optional
        Tick labels for the confusion-matrix axes, in class order. The default
        keeps the Titanic wording this helper was written for; pass other
        labels (or ``None`` for scikit-learn's default labels) when reporting
        on a different dataset.
    """
    print('Accuracy on test data is %.2f' % (accuracy_score(y_test, y_pred)))
    print('F1 score on test data is %.2f' % (f1_score(y_test, y_pred)))
    print('Precision Score on test data is %.2f' % (precision_score(y_test, y_pred)))
    print('Recall score on test data is %.2f' % (recall_score(y_test, y_pred)))
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    labels = None if display_labels is None else list(display_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()
    plt.show()

In [None]:
# Print the metrics and plot the confusion matrix for the Titanic decision
# tree's predictions on the held-out test rows.
classification_report_and_cm(y_test, titanic_predictions)

# Plot Correlation Matrix for Titanic Data attributes

In [None]:
# Pairwise correlation between the columns left in titanic_train.
corr = titanic_train.corr()
# Last expression of the cell: render the matrix as a table shaded with the
# 'coolwarm' colour map.
corr.style.background_gradient(cmap='coolwarm')

# Decision tree performance on Titanic data with varying tree depth.

In [None]:
from sklearn.metrics import RocCurveDisplay

# Tree depths to compare on a single ROC plot.
treeDepth = [3, 4, 5]
# Accuracy lists are reset here but not filled by this cell.
testAcc, trainAcc = [], []

# One shared axes so that all curves are overlaid.
fig, ax_roc = plt.subplots(nrows=1, ncols=1, figsize=(11, 5))
ax_roc.grid(linestyle="--")

for k in treeDepth:
    # Train a tree of depth k on the Titanic training split ...
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=k).fit(X_train, y_train)
    # ... and add its test-set ROC curve to the shared axes.
    RocCurveDisplay.from_estimator(
        clf,
        X_test,
        y_test,
        ax=ax_roc,
        name="Decision Tree Classifier depth=%d" % k,
    )

ax_roc.set_title("Receiver Operating Characteristic (ROC) curves")
plt.legend()
plt.show()


It can be seen from the ROC curves above that depth = 3 is the best choice among depths 3, 4, and 5.

# KNN Classifier on Titanic data

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes to evaluate on the Titanic split.
numNeighbors = [1, 5, 10, 15, 20, 25, 30, 40]
# Test / train accuracy for each k, in the same order as numNeighbors.
testAcc = []
trainAcc = []

for k in numNeighbors:
    # Minkowski distance with p=2, as in the original configuration.
    clf = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
    clf.fit(X_train, y_train)
    knn_pred = clf.predict(X_test)
    knn_pred_train = clf.predict(X_train)
    testAcc.append(accuracy_score(y_test, knn_pred))
    trainAcc.append(accuracy_score(y_train, knn_pred_train))

# Test accuracy as blue triangles, train accuracy as red circles.
plt.plot(numNeighbors, testAcc, 'bv--', numNeighbors, trainAcc, 'ro--')
# Legend entries follow the plotting order above (typo "Accuacy" corrected).
plt.legend(['Test Accuracy', 'Train Accuracy'])
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')

The optimal number of neighbors for the KNN classifier on the Titanic data is 20.

# Seaborn Plots to Illustrate Classifier Performance for Titanic Dataset

In [None]:
import seaborn as sns

# Pairwise scatter plots of the Titanic columns, coloured by passenger class,
# with kernel-density estimates on the diagonal.
titanic_pair_cols = ['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
sns.pairplot(titanic_train[titanic_pair_cols], hue='Pclass', diag_kind="kde")

# Using the Churn Data set
Here, we take the churn dataset, apply preprocessing, and convert the columns with string data types into integer types. Afterwards, we separate the dataset into training and testing sets and evaluate the accuracy of the model. Using a decision tree classifier, we will be able to determine which features are unnecessary and which are needed.

In [None]:
# Directory that holds the data file (the notebook's working directory).
path = "./"
# Open .csv
filename_read = os.path.join(path,"Churn_Modelling.csv")
# Treat the literal strings 'NA' and '?' as missing values while parsing.
churn_Data = pd.read_csv(filename_read, na_values=['NA','?'])
# Last expression of the cell: display the first 10 rows.
churn_Data.head(10)

In [None]:
# Boolean mask marking rows that repeat an earlier row.
dups = churn_Data.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))
# Counts the rows that contain at least one missing value (any() across columns).
print('Number of empty values in a row = %d' % (churn_Data.isnull().any(axis = 1).sum()))

Here, we identify that the classification target for this dataset is Exited, which signifies whether the customer has left or not. We then take all the features from each row, dropping the columns unnecessary for our classification (RowNumber, CustomerId, and Surname), as they provide no information about whether the customer has exited.

In [None]:
# Remove the identifier columns, which are not used as features.
churn_Data = churn_Data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
# Replace the text categories with integer codes (the helper edits the frame in place).
for text_column in ('Geography', 'Gender'):
    encode_text_index(churn_Data, text_column)
# Last expression of the cell: display the first 10 rows after preprocessing.
churn_Data.head(10)

In [None]:
# Display the dtype of every remaining column (checks the result of the encoding step).
churn_Data.dtypes

Here, we are using all the features given to us to create our decision tree.

In [None]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

# Every remaining column except the target is used as a predictor.
feature_cols = [
    'CreditScore',
    'Geography',
    'Gender',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]

# Predictors and the target column.
x = churn_Data[feature_cols]
y = churn_Data['Exited']

In [None]:
# Split dataset into training set and test set
# (30% of the rows are held out; random_state=0 fixes the shuffle).
# NOTE: y_train / y_test reuse the names of the Titanic split, so from this
# point on those names refer to the churn data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

## Building Decision Tree

Building decision trees involves making a predictive model to recursively divide feature spaces into smaller and smaller sections. 
Decision trees improve our ability to understand how the model arrived at specific classifications by mimicking human decision-making patterns. Since they can work on both numerical and categorical features without needing much data preprocessing, they are extremely convenient to use. Decision trees can also show a measure of feature importance, which helps us identify which features are more influential in making decisions, allowing us to tweak weights and aiding in feature selection. Furthermore, they can identify non-linear relationships in the data, since they do not make any assumptions about the linearity of the data. This makes them useful for datasets with intricate relationships between features and the target variable.

Decision trees have a tendency to overfit, especially when they "grow too deep" or if the dataset is noisy. They are also unstable, with massive changes in the tree if the dataset changes. They also have a tendency to favor features with a lot of unique values, because these can create more branches, which may result in more specific rules.

In [None]:
# Build and train an entropy-criterion decision tree (depth capped at 4) on the
# churn training split; fit() returns the fitted estimator.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=4).fit(x_train, y_train)
# Predicted class for every row of the held-out test set.
y_pred = clf.predict(x_test)

### Evaluating Model accuracy
As we can see here, using all of the features results in high precision and accuracy, which makes sense because we are using every provided feature to distinguish between users who Exited (1) and those who did not (0).

In [None]:
# Print the metrics and plot the confusion matrix for the all-features churn tree.
# NOTE: the helper labels the confusion-matrix axes "Not survived" / "Survived"
# by default; for this dataset read them as class 0 (not exited) / class 1 (exited).
classification_report_and_cm(y_test, y_pred)

Using graphviz, we are able to create the decision tree image of all the features from the churn dataset. As seen below, the tree has an imbalance and is having trouble classifying which account has exited or not (1 or 0).

In [None]:
import pydotplus 
from IPython.display import Image
from sklearn.tree import export_graphviz

# Export the fitted tree as Graphviz DOT text (out_file=None returns the text
# instead of writing a file); nodes are colour-filled and classes are named '0' / '1'.
dot_data = export_graphviz(clf, feature_names=x.columns, class_names=['0','1'], filled=True, 
                                out_file=None) 
# Parse the DOT text, render it to PNG bytes and display the image inline.
graph = pydotplus.graph_from_dot_data(dot_data) 
Image(graph.create_png())

The code below sets up the decision tree classifier and trains it using the feature columns 'CreditScore', 'Balance', and 'NumOfProducts' with 'Exited' as the target variable. We use these features because, of the combinations we tried, these three gave the most accurate score and were able to clearly classify whether an account has Exited or Not (1 or 0). The code then checks the model's performance on a test set and displays the accuracy achieved by the decision tree in predicting the 'Exited' variable.

In [None]:
# Reduced predictor set: three columns instead of all ten.
feature_cols = [
    'CreditScore',
    'Balance',
    'NumOfProducts',
]

# Predictors and the target column.
x = churn_Data[feature_cols]
y = churn_Data['Exited']

# Re-split with the same 70/30 proportion and seed as before; this overwrites
# the earlier x_train / x_test / y_train / y_test.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Build and train the depth-4 entropy tree on the reduced feature set.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=4).fit(x_train, y_train)
# Predicted class for every held-out row.
y_pred = clf.predict(x_test)

# Fraction of test rows whose predicted class matches the true class.
churn_tree_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", churn_tree_accuracy)

In [None]:
import pydotplus 
from IPython.display import Image
from sklearn.tree import export_graphviz

# Export the reduced-feature tree as Graphviz DOT text (out_file=None returns
# the text); x.columns now holds only the three selected feature names.
dot_data = export_graphviz(clf, feature_names=x.columns, class_names=['0','1'], filled=True, 
                                out_file=None) 
# Parse the DOT text, render it to PNG bytes and display the image inline.
graph = pydotplus.graph_from_dot_data(dot_data) 
Image(graph.create_png())

## Logistic Regression (Logit)

We apply logistic regression with varying regularization strength to the training and test sets. We use a range of values for C, starting from small values (strong regularization) and moving to larger values (weaker regularization). This lets us observe how the training and test accuracy change with the regularization strength.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 

# C is the inverse of the regularization strength: smaller values mean
# stronger regularization.
C = [0.01, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 20, 50]

# Test / train accuracy for each C, in the same order as the list above.
LRtestAcc, LRtrainAcc = [], []

# Fit one logistic-regression model per C value and record both accuracies.
for param in C:
    clf = LogisticRegression(C=param).fit(x_train, y_train)
    log_reg_pred = clf.predict(x_test)
    log_reg_pred_train = clf.predict(x_train)
    test_accuracy = accuracy_score(y_test, log_reg_pred)
    LRtestAcc.append(test_accuracy)
    LRtrainAcc.append(accuracy_score(y_train, log_reg_pred_train))
    # Report the test accuracy for this C.
    print("Accuracy for C=%.2f: " % param, test_accuracy)

# Accuracy against C on a logarithmic x axis: test as blue triangles, train as
# red circles.
plt.plot(C, LRtestAcc, 'bv--', C, LRtrainAcc, 'ro--')
plt.legend(['Test Accuracy', 'Train Accuracy'])
plt.ylim(0.75, 0.85)
plt.xlabel('C')
plt.xscale('log')
plt.ylabel('Accuracy')

# Seaborn Plots to Illustrate Classifier Performance for Churn Dataset

In [None]:
# Pairwise scatter plots of the churn columns, coloured by HasCrCard, with
# kernel-density estimates on the diagonal. Relies on `sns` imported in the
# Titanic pair-plot cell.
sns.pairplot(churn_Data[['Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']], 
            hue= 'HasCrCard', diag_kind="kde")

# Naive Bayes
In this section, we apply the Naive Bayes classification method, and then two support vector machine approaches: a linear-kernel classifier (referred to below as SVC) and a non-linear, RBF-kernel classifier (referred to below as SVM). <br />
SVC = Linear Approach <br />
SVM = Non Linear Approach

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on the current churn feature set.
clf_NB = GaussianNB()
# Fit on the training split only. Fitting on the full (x, y) would let the
# model see the test rows and inflate the test accuracy reported below.
clf_NB.fit(x_train, y_train)
# Predicted class for every held-out row.
NB_pred = clf_NB.predict(x_test)
print(NB_pred)

print('Accuracy on test data is %.2f' % (accuracy_score(y_test, NB_pred)))

## Support Vector Machine (SVM) Classifier
C = the parameter that determines how strongly misclassifications are penalized  <br />

In [None]:
from sklearn import svm
from sklearn.svm import LinearSVC

# Linear-kernel SVM with C=2, with the solver capped at 1000 iterations.
# NOTE(review): gamma is documented as the coefficient for the rbf/poly/sigmoid
# kernels, so it presumably has no effect with kernel='linear' -- confirm.
svmClassifier = svm.SVC(kernel='linear', gamma='auto', C=2, max_iter=1000)
# Alternative kept for reference (not executed):
# svmClassifier = svm.LinearSVC(C=2)
svmClassifier.fit(x_train, y_train)

# Predicted class for every held-out row.
y_svmPred = svmClassifier.predict(x_test)

In [None]:
from sklearn.metrics import classification_report

# Overall test accuracy of the linear SVM, followed by the per-class
# classification report.
print('Accuracy on test data is %.2f' % (accuracy_score(y_test, y_svmPred)))
print(classification_report(y_test, y_svmPred))

## Plot Confusion Matrix for SVM Classifier

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the linear SVM's test-set predictions.
cm = confusion_matrix(y_test, y_svmPred)

# Label the axes with the class values the fitted classifier learned.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=svmClassifier.classes_)

disp.plot()

plt.show()

This code block applies the non-linear SVM classifier which uses the C parameter to affect the accuracy and predicted results. After running, a graph is generated for the test and train accuracy depending on the C parameter.

In [None]:
from sklearn.svm import SVC
# Default-parameter instance; not referenced again in this cell.
svc = SVC()

# Values of the penalty parameter C to sweep over.
C = [0.01, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 20, 50]

# Test / train accuracy for each C, in the same order as the list above.
SVMLtestAcc, SVMLtrainAcc = [], []

for param in C:
    # RBF-kernel SVM trained on the churn training split.
    clf = SVC(C=param, kernel='rbf', gamma='auto').fit(x_train, y_train)
    svml_pred = clf.predict(x_test)
    svml_pred_train = clf.predict(x_train)
    print(svml_pred)
    SVMLtestAcc.append(accuracy_score(y_test, svml_pred))
    SVMLtrainAcc.append(accuracy_score(y_train, svml_pred_train))

In [None]:
# Accuracy against C for the RBF SVM: test as red circles, train as blue
# triangles (the legend entries follow the plotting order).
plt.plot(C, SVMLtestAcc,'ro--', C,SVMLtrainAcc,'bv--')
plt.legend(['Test Accuracy','Train Accuracy'])
plt.xlabel('C')
# Logarithmic x axis, since the C values span several orders of magnitude.
plt.xscale('log')
# plt.xlim(0, 1)
plt.ylabel('Accuracy') 

We observe that SVM with radial basis kernel overfits on the training set when regularization is not strong enough.
For sklearn.svm.SVC the strength of the regularization is inversely proportional to C.
In our tests the best value for C seems to be between 0.01 and 0.5.

# Plot Correlation Matrix for Churn Data attributes

In [None]:
# Pairwise correlation between the columns of the preprocessed churn frame
# (this overwrites the earlier Titanic `corr`).
corr = churn_Data.corr()
# Last expression of the cell: render the matrix as a table shaded with the
# 'coolwarm' colour map.
corr.style.background_gradient(cmap='coolwarm')

# Decision tree performance on Churn data with varying tree depth.

In [None]:
from sklearn.metrics import RocCurveDisplay

# Tree depths to compare on a single ROC plot.
treeDepth = [3, 7, 11]
# Accuracy lists are reset here but not filled by this cell.
testAcc, trainAcc = [], []

# One shared axes so that all curves are overlaid.
fig, ax_roc = plt.subplots(nrows=1, ncols=1, figsize=(11, 5))
ax_roc.grid(linestyle="--")

for k in treeDepth:
    # Train a tree of depth k on the churn training split ...
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=k).fit(x_train, y_train)
    # ... and add its test-set ROC curve to the shared axes.
    RocCurveDisplay.from_estimator(
        clf,
        x_test,
        y_test,
        ax=ax_roc,
        name="Decision Tree Classifier depth=%d" % k,
    )

ax_roc.set_title("Receiver Operating Characteristic (ROC) curves")
plt.legend()
plt.show()

## K-Nearest Neighbor (KNN) Classifiers

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes to evaluate on the churn split.
numNeighbors = [5, 10, 20, 30, 40]
# Test / train accuracy for each k, in the same order as numNeighbors.
testAcc = []
trainAcc = []

for k in numNeighbors:
    # Minkowski distance with p=2, as in the original configuration.
    clf = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
    clf.fit(x_train, y_train)
    knn_pred = clf.predict(x_test)
    knn_pred_train = clf.predict(x_train)
    # Compute each accuracy once and reuse it for both the list and the printout.
    test_accuracy = accuracy_score(y_test, knn_pred)
    train_accuracy = accuracy_score(y_train, knn_pred_train)
    testAcc.append(test_accuracy)
    trainAcc.append(train_accuracy)
    print('Accuracy on test data using k=%i is %.2f' % (k, test_accuracy))
    print('Accuracy on train data using k=%i is %.2f' % (k, train_accuracy))

# Test accuracy as blue triangles, train accuracy as red circles.
plt.plot(numNeighbors, testAcc, 'bv--', numNeighbors, trainAcc, 'ro--')
# Legend entries follow the plotting order above (typo "Accuacy" corrected).
plt.legend(['Test Accuracy', 'Train Accuracy'])
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')

The K-value that seems to give an accurate representation of the model is k >= 30, as we can see on the graph. When comparing the training and testing lines, we see that both level off at roughly constant values between 0.79 and 0.8. From this, we can tell that the accuracy of the model using the KNN technique is around 0.79 ~ 0.8.