In [None]:
#dataset from kagglehub
import kagglehub

# Pull the most recent release of the dataset. The returned value is kept in
# `path` and is used by the loading cell to locate diabetes.csv.
path = kagglehub.dataset_download(
    "uciml/pima-indians-diabetes-database",
)

print("Path to dataset files:", path)

In [None]:
#imports
import pandas as pd
import numpy as np

#dataset loading
import os

# Build the CSV location from the kagglehub download directory. os.path.join
# picks the right separator instead of hard-coding '/'.
buffer = os.path.join(path, 'diabetes.csv') #Honestly might as well just attach a local copy of the dataset
try:
    initial_diabetes = pd.read_csv(buffer)
except Exception as load_error:
    # Best-effort fallback to a local copy of the dataset. Catching Exception
    # (rather than a bare except) lets KeyboardInterrupt/SystemExit propagate,
    # and the message makes it visible that the fallback was used.
    # NOTE(review): '/diabetes.csv' is an absolute path at the filesystem root;
    # confirm that is where the local copy really lives.
    print(f"Could not read {buffer!r} ({load_error}); falling back to the local copy")
    initial_diabetes = pd.read_csv('/diabetes.csv') #Which is what exactly this is for(local copy)

#What are the attributes
print("Dataset Shape:", initial_diabetes.shape)
initial_diabetes.head()

In [None]:
#Cleaning the data/Preprocessing
# dropna() removes rows with empty cells and drop_duplicates() removes repeated
# rows. Both return a new DataFrame, so chaining them leaves initial_diabetes
# untouched, makes the cell safe to re-run, and avoids the in-place mutation of
# an intermediate frame (which can trigger SettingWithCopyWarning).
diabetes = initial_diabetes.dropna().drop_duplicates()

In [None]:
#Visualize pairplots to see the distribution of attributes for disease outcome
import seaborn as sns
import matplotlib.pyplot as plt

# One scatter/density panel per pair of columns, coloured by the Outcome label.
sns.pairplot(data=diabetes, hue='Outcome')
plt.show()

In [None]:
#Box and whiskers
print("\nBoxplots of features grouped by target:")
# Plot every column except the target itself, wherever 'Outcome' sits in the
# frame (the previous columns[:-1] slice assumed it was the last column).
feature_cols = [col for col in diabetes.columns if col != 'Outcome']
# Size the grid from the number of features instead of hard-coding 4x2, so a
# frame with more than eight features does not overflow the subplot grid.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(feature_cols) // n_grid_cols))  # ceiling division
plt.figure(figsize=(15, 7.5 * n_grid_rows))  # 7.5 in per row -> (15, 30) for 8 features
for i, col in enumerate(feature_cols):
    plt.subplot(n_grid_rows, n_grid_cols, i+1)
    sns.boxplot(x='Outcome', y=col, data=diabetes)
    plt.title(f'Boxplot of {col} by Outcome')
plt.tight_layout()
plt.show()


In [None]:
#Preparing x and y
y = diabetes['Outcome'] #Target
x = diabetes.drop('Outcome', axis=1) #Attributes: every column except the target

#Train test split
from sklearn.model_selection import train_test_split
# test_size=0.2 holds out 20% of the rows for testing; the remaining 80% is the
# training set. random_state=0 fixes the shuffle applied before the split so
# the same rows land in each part on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

#diabetes with column train: attribute table + target table
diabetes_train = dict(attributes=x_train, target=y_train)

#diabetes with column test: attribute table + target table
diabetes_test = dict(attributes=x_test, target=y_test)


In [None]:
#Training portion
from sklearn.tree import DecisionTreeClassifier
# Gini-impurity tree with a fixed seed. Author's note: changing the seed does
# change the fitted tree, and seed 100 gave a better accuracy than 0.
dtc = DecisionTreeClassifier(random_state=0, criterion='gini')
dtc.fit(X=diabetes_train['attributes'], y=diabetes_train['target'])

In [None]:
#Testing portion
# Predicted class label for every row of the held-out attribute table.
predict = dtc.predict(X=diabetes_test['attributes'])
predict

In [None]:
# Side-by-side view of the true label and the model's prediction, one row per
# test sample. Built column-wise; the result gets a fresh 0..n-1 index.
pd.DataFrame({
    'Target': diabetes_test['target'].to_numpy(),
    'Predicted': predict,
})

In [None]:
# Score the fitted tree on the held-out attributes/targets.
accuracy = dtc.score(
    X=diabetes_test['attributes'],
    y=diabetes_test['target'],
)
print(f'Accuracy: {accuracy:.6f}') #printed with 6 digits after the decimal point

In [None]:
import matplotlib.pyplot as plt
# Sweep the depth limit and record accuracy on the training and test sets for
# each setting, to see where the tree starts to overfit.
max_depths = [1,5,8,13,21,34]
training_accuracy = []
testing_accuracy = []
for md in max_depths:
  # random_state is pinned so the curve is reproducible across runs, matching
  # the seeded baseline tree (random_state=0) used elsewhere in this notebook.
  dtc = DecisionTreeClassifier(max_depth=md, random_state=0)
  dtc.fit(diabetes_train['attributes'],diabetes_train['target'])
  train = dtc.score(diabetes_train['attributes'],diabetes_train['target'])
  test = dtc.score(diabetes_test['attributes'],diabetes_test['target'])
  training_accuracy.append(train)
  testing_accuracy.append(test)
plt.plot(max_depths, training_accuracy, label='training accuracy')
plt.plot(max_depths, testing_accuracy, label='testing accuracy')
plt.xlabel('max depth')
plt.ylabel('accuracy')
plt.title('Accuracy vs. max depth')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

max_depths = [1,5,8,13,21,34]
plt.figure(figsize=(10, 8))  # Set figure size

# Loop through each max_depth
for md in max_depths:
    # random_state is pinned so the ROC curves and AUC values are reproducible,
    # matching the seeded baseline tree (random_state=0) used elsewhere.
    dtc = DecisionTreeClassifier(max_depth=md, random_state=0)
    dtc.fit(diabetes_train['attributes'], diabetes_train['target'])

    # Predict probabilities for the test set; column 1 is the positive class.
    y_prob = dtc.predict_proba(diabetes_test['attributes'])[:, 1]

    # Compute the ROC curve and AUC. pos_label=None lets roc_curve treat 1 as
    # the positive class when the labels are {0, 1} or {-1, 1} (it raises
    # otherwise). The returned thresholds are not needed for the plot.
    fpr, tpr, _ = roc_curve(diabetes_test['target'], y_prob, pos_label=None)
    roc_auc = auc(fpr, tpr) #Area under curve

    # Plot the ROC curve: x = false positive rate, y = true positive rate
    plt.plot(fpr, tpr, label=f'max_depth={md}, AUC={roc_auc:.2f}')

# Plot diagonal line for reference
plt.plot([0, 1], [0, 1], 'k--', label='Reference')

# Add labels, legend, and title
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for Different max_depths')
plt.legend()
plt.grid()
plt.show()

In [None]:
#Train again for the plotting
# The depth sweeps above rebind `dtc`, so rebuild the unrestricted, seeded tree
# before drawing it.
dtc = DecisionTreeClassifier(random_state=0, criterion='gini')
dtc.fit(X=diabetes_train['attributes'], y=diabetes_train['target'])

In [None]:
#How does the decision tree look like
from sklearn.tree import plot_tree
# plot_tree fits the drawing to the axes, so the figure size (or dpi) is what
# controls how large the rendering is.
plt.figure(figsize=[100,100])
# filled=True colours each node by its majority class; class names are left
# out because the outcome is binary.
tree = plot_tree(
    dtc,
    feature_names=list(x.columns),
    filled=True,
    rounded=True,
)

In [None]:
#Define Colormap for Visualization
from matplotlib import cm
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap is available on both old and new releases.
colormap = plt.get_cmap('tab20')
# Split the colour list by alternating entries: even positions go to cm_dark
# (used for the data points), odd positions go to cm_light (used for the
# decision-region background).
cm_dark = ListedColormap(colormap.colors[::2])
cm_light = ListedColormap(colormap.colors[1::2])

In [None]:
#Initialize Accuracy Storage Variables
# all_acc collects one result row per feature pair; all_acc_cols holds the
# matching column labels. Two independent empty lists.
all_acc, all_acc_cols = [], []

In [None]:
#Generate Feature Combinations
from itertools import combinations

att_cols = diabetes_train['attributes'].columns
# Every unordered pair of distinct attribute columns, as [horizontal, vertical]
# lists in column order. combinations() replaces the nested loop, whose
# `horiz is vert` test compared strings by identity rather than by value and
# whose `in all_comb` scan re-checked the whole list for every candidate.
all_comb = [[horiz, vert] for horiz, vert in combinations(att_cols, 2)]

In [None]:
import numpy as np
# Depth limit actually passed to the classifier. It is also what gets recorded
# in the results table (previously the table stored None while the model used 34).
max_depth = 34 #can't afford unlimited max depth due to sheer volume of the dataset
dtc = DecisionTreeClassifier(random_state=100, max_depth=max_depth)
# Start from an empty list so re-running this cell does not append a second
# copy of every row.
all_acc = []
# Loop-invariant, so set once before any figure is created.
plt.rcParams.update({'font.size': 30})
#Iterate through all feature pairs
for i, [h,v] in enumerate(all_comb):
  fig, ax = plt.subplots(1, 2, figsize = [40,20])

  train_pair = diabetes_train['attributes'][[h,v]]
  test_pair = diabetes_test['attributes'][[h,v]]

  # Left panel: the tree fitted on just these two features.
  dtc.fit(train_pair, diabetes_train['target'])
  plot_tree(dtc, feature_names = train_pair.columns.to_list(),
            ax=ax[0], filled=True, rounded=True)

  # Plot window: the training range of each feature, padded by 10% per side.
  x_min = train_pair[h].min()
  x_max = train_pair[h].max()
  x_range = x_max - x_min
  x_min = x_min - 0.1 * x_range
  x_max = x_max + 0.1 * x_range
  y_min = train_pair[v].min()
  y_max = train_pair[v].max()
  y_range = y_max - y_min
  y_min = y_min - 0.1 * y_range
  y_max = y_max + 0.1 * y_range
  # Each axis is sampled at 1% of its OWN range. The x axis previously used
  # y_range, which made the grid far too coarse or far too dense whenever the
  # two features had different scales.
  xx, yy = np.meshgrid(np.arange(x_min, x_max, .01*x_range), np.arange(y_min,y_max, .01*y_range))
  # Predict on a frame with the same column names used for fitting, so
  # scikit-learn does not warn about missing feature names.
  grid = pd.DataFrame({h: xx.ravel(), v: yy.ravel()})
  z = dtc.predict(grid).reshape(xx.shape)

  # Right panel: decision regions with the training and testing points on top.
  ax[1].pcolormesh(xx,yy,z,cmap=cm_light)
  ax[1].scatter(train_pair[h], train_pair[v],
                c=diabetes_train['target'], cmap=cm_dark, s=200,
                label='Training data', edgecolor='black', linewidth=1)
  ax[1].scatter(test_pair[h], test_pair[v],
                c=diabetes_test['target'], cmap=cm_dark, s=200,
                label = 'Testing data', edgecolor='black', linewidth=1, marker='*')
  train_acc = dtc.score(train_pair, diabetes_train['target'])
  test_acc = dtc.score(test_pair, diabetes_test['target'])
  ax[1].set_title(f'training:{train_acc:.3f}, testing:{test_acc:.3f}')
  ax[1].set_xlabel(h)
  ax[1].set_ylabel(v)
  ax[1].legend()

  # Store the pair's index i (previously the literal 1 was stored for every row).
  all_acc.append([i, h, v, max_depth, train_acc, test_acc])

  # Render and release each large figure before building the next one.
  plt.show()
  plt.close(fig)

all_acc_cols = ['i', 'attribute 1', 'attribute 2', 'max depth 1', 'training accuracy 1', 'testing accuracy 1']





In [None]:
import numpy as np
max_depth = 3
dtc = DecisionTreeClassifier(random_state=100, max_depth = max_depth)
# Loop-invariant, so set once before any figure is created.
plt.rcParams.update({'font.size': 30})
# Same per-pair visualisation as the deeper tree, with the depth capped at 3.
for i, [h,v] in enumerate(all_comb):
  fig, ax = plt.subplots(1, 2, figsize = [40,20])

  train_pair = diabetes_train['attributes'][[h,v]]
  test_pair = diabetes_test['attributes'][[h,v]]

  # Left panel: the tree fitted on just these two features.
  dtc.fit(train_pair, diabetes_train['target'])
  plot_tree(dtc, feature_names = train_pair.columns.to_list(),
            ax=ax[0], filled=True, rounded=True)

  # Plot window: the training range of each feature, padded by 10% per side.
  x_min = train_pair[h].min()
  x_max = train_pair[h].max()
  x_range = x_max - x_min
  x_min = x_min - 0.1 * x_range
  x_max = x_max + 0.1 * x_range
  y_min = train_pair[v].min()
  y_max = train_pair[v].max()
  y_range = y_max - y_min
  y_min = y_min - 0.1 * y_range
  y_max = y_max + 0.1 * y_range
  # Each axis is sampled at 1% of its OWN range. The x axis previously used
  # y_range, which made the grid far too coarse or far too dense whenever the
  # two features had different scales.
  xx, yy = np.meshgrid(np.arange(x_min, x_max, .01*x_range), np.arange(y_min,y_max, .01*y_range))
  # Predict on a frame with the same column names used for fitting, so
  # scikit-learn does not warn about missing feature names.
  grid = pd.DataFrame({h: xx.ravel(), v: yy.ravel()})
  z = dtc.predict(grid).reshape(xx.shape)

  # Right panel: decision regions with the training and testing points on top.
  ax[1].pcolormesh(xx,yy,z,cmap=cm_light)
  ax[1].scatter(train_pair[h], train_pair[v],
                c=diabetes_train['target'], cmap=cm_dark, s=200,
                label='Training data', edgecolor='black', linewidth=1)
  ax[1].scatter(test_pair[h], test_pair[v],
                c=diabetes_test['target'], cmap=cm_dark, s=200,
                label = 'Testing data', edgecolor='black', linewidth=1, marker='*')
  train_acc = dtc.score(train_pair, diabetes_train['target'])
  test_acc = dtc.score(test_pair, diabetes_test['target'])
  ax[1].set_title(f'training:{train_acc:.3f}, testing:{test_acc:.3f}')
  ax[1].set_xlabel(h)
  ax[1].set_ylabel(v)
  ax[1].legend()

  # Keep the six columns written by the first pass and replace anything after
  # them, so re-running this cell does not keep widening the row.
  all_acc[i] = all_acc[i][:6] + [max_depth, train_acc, test_acc]

  # Render and release each large figure before building the next one.
  plt.show()
  plt.close(fig)

# Same idempotent treatment for the column labels.
all_acc_cols = all_acc_cols[:6] + ['max depth 2', 'training accuracy 2', 'testing accuracy 2']





In [None]:
# One row per feature pair: pair index, the two attributes, then (max depth,
# train accuracy, test accuracy) for each of the two tree configurations.
# The labels carry no leading spaces, so columns can be selected by their
# visible name (e.g. acc_table['attribute 2']).
acc_table = pd.DataFrame(all_acc, columns=['i', 'attribute 1', 'attribute 2', 'max depth 1', 'train accuracy 1', 'test accuracy 1',
                                           'max depth 2', 'train accuracy 2', 'test accuracy 2'])
acc_table