In [1]:
# 01/25

In [2]:
from ipynb.fs.full.performance import performanceEval as perf
from sklearn import tree
import matplotlib.pyplot as plt
import numpy as np

class baseModel:
  """Common wrapper around an estimator object and a short name identifying it.

  The wrapped `model` must provide `fit(X, y)` and `predict(X)`. `modelName`
  is handed to the performance evaluator and also decides whether the fitted
  model is drawn as a tree right after training.
  """

  def __init__(self, model, modelName):
    self.modelName = modelName
    self.model = model

  def _drawTree(self):
    """Plot the fitted estimator with sklearn's tree plotter on a large canvas."""
    plt.figure(figsize=(20, 10), dpi=200)  # big, high-DPI figure so the nodes stay legible
    tree.plot_tree(self.model, filled=True)
    plt.show()

  def train(self, Xtrain, Ytrain):
    """Fit the wrapped estimator and return `["training complete", self]`.

    When `modelName` is "dtModel" the fitted tree is additionally plotted.
    """
    self.model.fit(Xtrain, Ytrain)
    if self.modelName == "dtModel":
      self._drawTree()
    status = "training complete"
    return [status, self]

  def test(self, Xtest):
    """Return the wrapped estimator's predictions for `Xtest`."""
    return self.model.predict(Xtest)

  def perfEval(self, Ypred, Ytest):
    """Forward predictions, reference labels and the model name to `perf` and return its result."""
    return perf(Ypred, Ytest, self.modelName)

In [3]:
# 10/24
######### LOGISTIC REGRESSION #########
from sklearn.linear_model import LogisticRegression

class logReg(baseModel):
  """Logistic-regression model with helpers to visualise its decision boundary in 3D.

  The plotting helpers work on the first three feature columns (X0, X1, X2);
  every remaining feature is held fixed at its column mean.
  """

  def __init__(self):
    super().__init__(LogisticRegression(random_state=0), "regModel")

  def computeClassBound3D(self, df):
    """Compute the decision-boundary surface over a grid of (X0, X1) values.

    The model must already be fitted (`coef_` and `intercept_` are read).

    Args:
      df: DataFrame of feature columns. A 'diagnosis' column, if present, is
        dropped first. The remaining columns are assumed to be in the same
        order as the features the model was fitted on — the caller must
        ensure this.

    Returns:
      [H3D, X0, X1, X2] where
        X0, X1: 2D meshgrids of sample values for the first two features,
        X2: value of the third feature that puts each (X0, X1) point exactly
          on the decision boundary (all other features at their means),
        H3D: decision function over the (X0, X1) grid with the third feature
          held at its mean as well; positive values mark the region that is
          classified as the positive class on that slice.

    Raises:
      ValueError: if fewer than three feature columns are available, if the
        number of feature columns differs from the number of learned
        coefficients, or if the coefficient of X2 is zero (the boundary
        cannot be solved for X2 in that case).
    """
    features = df.drop(columns='diagnosis', errors='ignore')
    params = self.model.coef_[0]  # learned weights, one per feature
    intercept = self.model.intercept_[0]
    sampleX0, sampleX1, means = self.findSampleX(features)

    if len(means) < 3:
      raise ValueError("computeClassBound3D needs at least three feature columns.")
    if len(means) != len(params):
      raise ValueError(
        "Feature count mismatch: df has %d feature columns but the model has %d coefficients."
        % (len(means), len(params)))
    if params[2] == 0:
      raise ValueError("The coefficient of X2 is zero; the boundary cannot be solved for X2.")

    means = np.asarray(means, dtype=float)

    # Because we'll be plotting onto a 3D surface, we convert sampleX0 and X1 to meshgrids
    X0, X1 = np.meshgrid(sampleX0, sampleX1)

    # Contribution of every feature beyond the first three, each held at its mean.
    fixed = float(np.dot(params[3:], means[3:]))

    # Everything in the decision function except the X2 term.
    partial = intercept + params[0]*X0 + params[1]*X1 + fixed

    # Solve for X2 using the decision boundary equation:
    # intercept + params[0]*X0 + params[1]*X1 + params[2]*X2 + fixed = 0
    X2 = -partial / params[2]

    # Decision function on the slice X2 = mean(X2). Evaluating it on the
    # boundary surface itself would be zero everywhere and carry no information.
    H3D = partial + params[2]*means[2]

    return [H3D, X0, X1, X2]

  def findSampleX(self, df, low=-1, high=3, num=400):
    """Return sample axes for the first two features and the mean of every column.

    Args:
      df: DataFrame of feature columns (nothing is dropped here).
      low, high: inclusive bounds of the sampled range. The defaults presumably
        suit standardised features — TODO confirm against the preprocessing.
      num: number of sample points per axis.

    Returns:
      [sampleX0, sampleX1, means] — two 1D arrays of `num` evenly spaced values
      and a list with one mean per column of `df`, in column order.
    """
    means = [df[column].mean() for column in df.columns]

    sampleX0 = np.linspace(low, high, num)
    sampleX1 = np.linspace(low, high, num)
    return [sampleX0, sampleX1, means]

  def plotClassBoundary3D(self, df):
    """Plot the decision-boundary surface and shade the positive region beneath it.

    Args:
      df: passed unchanged to `computeClassBound3D`.
    """
    # Kept for older Matplotlib releases, where this import registers the '3d' projection.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    # X0, X1 and X2 are 2D arrays; H3D is the decision function on the X2-mean slice.
    H3D, X0, X1, X2 = self.computeClassBound3D(df)

    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')

    # Plot the decision boundary surface (where the decision function is zero)
    ax.plot_surface(X0, X1, X2, cmap='coolwarm', alpha=0.7)

    # Shade the region where the decision function is positive (Y = 1),
    # projected as a filled contour onto a plane below the surface.
    upper = np.max(H3D)
    if upper > 0:  # contourf needs strictly increasing levels; skip if nothing is positive
      z_offset = np.min(X2) - 1  # adjust this as needed
      ax.contourf(X0, X1, H3D, zdir='z', offset=z_offset,
                  levels=[0, upper], cmap='Reds', alpha=0.5)

    ax.set_xlabel('Feature X0')
    ax.set_ylabel('Feature X1')
    ax.set_zlabel('Feature X2')
    ax.set_title('3D Decision Boundary and Region Y=1')
    plt.show()



          

In [4]:
######### SVM #########
# https://scikit-learn.org/1.5/modules/svm.html#classification
# https://scikit-learn.org/1.5/modules/svm.html#svc (mathematical theory)

from sklearn import svm

class supVecMac(baseModel):
  """Linear support-vector classifier registered under the name "svmModel"."""

  def __init__(self):
    # random_state is pinned (as for the logistic-regression model) so that the
    # solver's data shuffling, when it is used, gives reproducible results.
    super().__init__(svm.LinearSVC(random_state=0), "svmModel")


In [5]:
######### Naive Bayes #########
from sklearn import naive_bayes

class naiBayClass(baseModel):
  """Gaussian naive Bayes classifier registered under the name "nbcModel"."""

  def __init__(self):
    estimator = naive_bayes.GaussianNB()
    super().__init__(model=estimator, modelName="nbcModel")


In [6]:
######### K-NN #########
# https://scikit-learn.org/1.5/modules/neighbors.html#nearest-neighbors-classification
from sklearn import neighbors

class kNearNeigh(baseModel):
  """k-nearest-neighbours classifier (ball-tree search, Manhattan distance), named "knnModel"."""

  def __init__(self):
    # Why the ball-tree algorithm: https://scikit-learn.org/1.5/modules/neighbors.html#Ball_Tree
    # Why this distance metric: see the lit review, and the list of available metrics at
    # https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.pairwise.distance_metrics.html#sklearn.metrics.pairwise.distance_metrics
    estimator = neighbors.KNeighborsClassifier(metric='manhattan', algorithm='ball_tree')
    super().__init__(model=estimator, modelName="knnModel")


# Below was last updated 27/12/2024
# Manhattan distance: {'accuracy': 96.49122807017544, 'specificity': 100.0, 'sensitivity': 89.74358974358975, 'recall': 89.74358974358975, 'precision': 100.0}
# Euclidean distance: {'accuracy': 95.6140350877193, 'specificity': 97.33333333333334, 'sensitivity': 92.3076923076923, 'recall': 92.3076923076923, 'precision': 94.73684210526315}
# Cityblock distance: {'accuracy': 96.49122807017544, 'specificity': 100.0, 'sensitivity': 89.74358974358975, 'recall': 89.74358974358975, 'precision': 100.0}
# Haversine distance: Not valid for > 2 dimensions
# L1 distance: {'accuracy': 96.49122807017544, 'specificity': 100.0, 'sensitivity': 89.74358974358975, 'recall': 89.74358974358975, 'precision': 100.0}
# L2 distance: {'accuracy': 95.6140350877193, 'specificity': 97.33333333333334, 'sensitivity': 92.3076923076923, 'recall': 92.3076923076923, 'precision': 94.73684210526315}

# From the above, it can be seen that Manhattan/L1/Cityblock (three names for the same metric, hence the identical scores) yield the best results.

In [7]:
######### Decision Tree #########
# https://scikit-learn.org/1.5/modules/tree.html#classification
# https://scikit-learn.org/1.5/modules/tree.html#mathematical-formulation (mathematical theory)

# By default, scikit-learn uses an optimised implementation of the CART decision tree algorithm.
from sklearn import tree

class decTree(baseModel):
  """CART decision-tree classifier (Gini criterion, best split), named "dtModel".

  The name "dtModel" makes `baseModel.train` plot the fitted tree.
  """

  def __init__(self):
    # scikit-learn permutes the features at every split even with splitter='best',
    # so ties between equally good splits are broken randomly. Pinning random_state
    # (as for the logistic-regression model) makes the fitted tree reproducible.
    super().__init__(
      tree.DecisionTreeClassifier(criterion='gini', splitter='best', random_state=0),
      "dtModel")


In [8]:
# 06/02/2025
def runModels(Xtrain, Ytrain, Xtest, modes = ("train",), inModels = None):
  """Train and/or test the five classifiers defined in this notebook.

  Args:
    Xtrain, Ytrain: training features and labels, passed to each model's `train`.
    Xtest: test features, passed to each model's `test`.
    modes: collection of mode names; "train" and "test" are recognised.
    inModels: optional dict mapping a model name to an already trained model
      wrapper. When it is supplied and non-empty, training is skipped even if
      "train" is in `modes`. The dict itself is never modified.

  Returns:
    [Ypreds, models]. `models` maps model name -> model wrapper. `Ypreds` maps
    "Ypred_<name>" to the predictions for `Xtest` when "test" is in `modes`;
    after training without testing it holds the status string returned by
    `train` instead.

  Raises:
    ValueError: if testing is requested without training and without `inModels`.
  """
  if inModels is None and "test" in modes and "train" not in modes:
    raise ValueError("Error: Cannot test models without training them. Please either add 'train' to the modes list, or supply the trained models in the `inModels` param.")

  # Work on a copy so the caller's dict is not filled in as a side effect.
  models = dict(inModels) if inModels is not None else {}
  Ypreds = {}

  if "train" in modes and not models:
    # Estimators are only constructed when they are actually going to be trained.
    builders = {
      "regModel": logReg,
      "svmModel": supVecMac,
      "nbcModel": naiBayClass,
      "knnModel": kNearNeigh,
      "dtModel": decTree,
    }
    for name, build in builders.items():
      # train() returns [status string, trained wrapper]
      Ypreds["Ypred_" + name], models[name] = build().train(Xtrain, Ytrain)

  if "test" in modes:
    for name, model in models.items():
      Ypreds["Ypred_" + name] = model.test(Xtest)

  return [Ypreds, models]
