### Disclaimer: 
Please note that due to shuffling in an earlier stage, the accuracy rates may vary slightly from those presented in our initial presentation and our main complete codebase. We have taken steps to minimize such discrepancies in the future. We apologize for any inconsistencies.

# Part 4: Decision Tree Classification

We first apply a __Decision Tree Classifier__ to each of the four train/test datasets initialized above. For each dataset, we report the train and test accuracy, plot the confusion matrices, and visualize the fitted decision tree.

### Importing libraries and datasets

In [None]:
# For data processing
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
sb.set(style='whitegrid')

# Read the feature DataFrames for the four train/test splits.
# Split 1 keeps the un-numbered variable names (X_train / X_test) even though
# its CSV files carry a "1" in their names.
X_train = pd.read_csv('X1_train.csv')
X_test = pd.read_csv('X1_test.csv')
X2_train = pd.read_csv('X2_train.csv')
X2_test = pd.read_csv('X2_test.csv')
X3_train = pd.read_csv('X3_train.csv')
X3_test = pd.read_csv('X3_test.csv')
X4_train = pd.read_csv('X4_train.csv')
X4_test = pd.read_csv('X4_test.csv')

# Read the matching targets as Series (presumably one column per file).
# squeeze("columns") only drops the column axis, so a file that happens to
# hold a single row still yields a length-1 Series instead of a bare scalar,
# which is what a plain squeeze() would return.
y_train = pd.read_csv('y1_train.csv').squeeze("columns")
y_test = pd.read_csv('y1_test.csv').squeeze("columns")
y2_train = pd.read_csv('y2_train.csv').squeeze("columns")
y2_test = pd.read_csv('y2_test.csv').squeeze("columns")
y3_train = pd.read_csv('y3_train.csv').squeeze("columns")
y3_test = pd.read_csv('y3_test.csv').squeeze("columns")
y4_train = pd.read_csv('y4_train.csv').squeeze("columns")
y4_test = pd.read_csv('y4_test.csv').squeeze("columns")

In [None]:
# Decision Tree using Train Data
import os
# Show the OMP_NUM_THREADS environment variable (prints None when it is unset).
print("OMP_NUM_THREADS =", os.environ.get("OMP_NUM_THREADS"))
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

# Create the decision tree object. The depth is capped at 4, and random_state
# is pinned because scikit-learn permutes the candidate features at every
# split: without a fixed seed, ties between equally good splits can be broken
# differently from one run to the next, changing the tree and its accuracy.
dectree = DecisionTreeClassifier(max_depth=4, random_state=42)

In [None]:
# Fit the depth-limited tree on the first training split (X_train / y_train).
dectree.fit(X_train, y_train)

### 1.1. Decision Tree Classifier for __X_train__ dataset.

In [None]:
# Predict Response corresponding to Predictors
y_train_pred1 = dectree.predict(X_train)
y_test_pred1 = dectree.predict(X_test)

# Build each confusion matrix once and reuse it for both the heatmaps and the
# rate calculations. scikit-learn lays the matrix out with actual classes on
# the rows and predicted classes on the columns.
cm_train = confusion_matrix(y_train, y_train_pred1)
cm_test = confusion_matrix(y_test, y_test_pred1)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", dectree.score(X_train, y_train))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", dectree.score(X_test, y_test))
print()

# Plot the Confusion Matrix for Train (left) and Test (right). The panels are
# titled and labelled so they can be told apart in the rendered figure.
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sb.heatmap(cm_train, annot=True, fmt=".0f", annot_kws={"size": 18}, ax=axes[0])
sb.heatmap(cm_test, annot=True, fmt=".0f", annot_kws={"size": 18}, ax=axes[1])
axes[0].set(title="Train", xlabel="Predicted", ylabel="Actual")
axes[1].set(title="Test", xlabel="Predicted", ylabel="Actual")

def print_rates(cm, dataset_name=""):
    """Print FNR, FPR, TPR and TNR for a binary confusion matrix.

    Parameters
    ----------
    cm : array-like with exactly four entries
        Confusion matrix read in row-major order as TN, FP, FN, TP, i.e.
        ``[[TN, FP], [FN, TP]]``. This matches what
        ``sklearn.metrics.confusion_matrix`` returns for a two-class target
        whose negative class sorts first. NumPy arrays and nested lists are
        both accepted.
    dataset_name : str, optional
        Label inserted into the printed header line.

    Raises
    ------
    ValueError
        If ``cm`` does not contain exactly four entries (for example when
        only one class is present, or the target has more than two classes).

    Notes
    -----
    A rate whose denominator is zero is reported as 0 rather than raising.
    """
    cm = np.asarray(cm)
    if cm.size != 4:
        raise ValueError(
            f"print_rates expects a 2x2 confusion matrix, got shape {cm.shape}"
        )
    TN, FP, FN, TP = cm.ravel()

    actual_pos = FN + TP  # every sample whose true class is positive
    actual_neg = FP + TN  # every sample whose true class is negative

    FNR = FN / actual_pos if actual_pos != 0 else 0
    FPR = FP / actual_neg if actual_neg != 0 else 0
    TPR = TP / actual_pos if actual_pos != 0 else 0
    TNR = TN / actual_neg if actual_neg != 0 else 0

    print(f"Rates for {dataset_name} Dataset:")
    print(f"False Negative Rate (FNR): {FNR:.4f}")
    print(f"False Positive Rate (FPR): {FPR:.4f}")
    print(f"True Positive Rate (TPR / Recall): {TPR:.4f}")
    print(f"True Negative Rate (TNR / Specificity): {TNR:.4f}")
    print()

# Report FNR / FPR / TPR / TNR for the train split, then for the test split
print_rates(cm=cm_train, dataset_name="Train")
print_rates(cm=cm_test, dataset_name="Test")


In [None]:
from sklearn.tree import plot_tree

# Draw the fitted tree. class_names labels the *target* classes (in the order
# stored in dectree.classes_), not the predictor columns, so it is derived
# from the fitted model rather than hard-coded. feature_names is passed as a
# plain list because some scikit-learn releases reject a pandas Index here.
f = plt.figure(figsize=(30,15))
plot_tree(dectree, filled=True, rounded=True,
          feature_names=list(X_train.columns),
          class_names=[str(label) for label in dectree.classes_])

### 1.2. Decision Tree Classifier for __X2_train__ dataset.

In [None]:
# Depth-4 tree for the second split. random_state is pinned so that ties
# between equally good candidate splits are broken the same way on every run,
# which keeps the fitted tree (and its accuracy) reproducible.
dectree2 = DecisionTreeClassifier(max_depth=4, random_state=42)
dectree2.fit(X2_train, y2_train)

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted tree. class_names labels the *target* classes (in the order
# stored in dectree2.classes_), not the predictor columns, so it is derived
# from the fitted model rather than hard-coded. feature_names is passed as a
# plain list because some scikit-learn releases reject a pandas Index here.
f = plt.figure(figsize=(40,12))
plot_tree(dectree2, filled=True, rounded=True,
          feature_names=list(X2_train.columns),
          class_names=[str(label) for label in dectree2.classes_])


In [None]:
# Predict Response corresponding to Predictors
y_train_pred2 = dectree2.predict(X2_train)
y_test_pred2 = dectree2.predict(X2_test)

# Build each confusion matrix once and reuse it for both the heatmaps and the
# rate calculations. scikit-learn lays the matrix out with actual classes on
# the rows and predicted classes on the columns.
cm_train2 = confusion_matrix(y2_train, y_train_pred2)
cm_test2 = confusion_matrix(y2_test, y_test_pred2)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", dectree2.score(X2_train, y2_train))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", dectree2.score(X2_test, y2_test))
print()

# Plot the Confusion Matrix for Train (left) and Test (right). The panels are
# titled and labelled so they can be told apart in the rendered figure.
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sb.heatmap(cm_train2, annot=True, fmt=".0f", annot_kws={"size": 18}, ax=axes[0])
sb.heatmap(cm_test2, annot=True, fmt=".0f", annot_kws={"size": 18}, ax=axes[1])
axes[0].set(title="Train", xlabel="Predicted", ylabel="Actual")
axes[1].set(title="Test", xlabel="Predicted", ylabel="Actual")

# Print rates for both train and test
print_rates(cm_train2, "Train")
print_rates(cm_test2, "Test")


### 1.3. Decision Tree Classifier for __X3_train__ dataset.

In [None]:
# Depth-4 tree for the third split. random_state is pinned so that ties
# between equally good candidate splits are broken the same way on every run,
# which keeps the fitted tree (and its accuracy) reproducible.
dectree3 = DecisionTreeClassifier(max_depth=4, random_state=42)
dectree3.fit(X3_train, y3_train)

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted tree. class_names labels the *target* classes (in the order
# stored in dectree3.classes_), not the predictor columns, so it is derived
# from the fitted model rather than hard-coded. feature_names is passed as a
# plain list because some scikit-learn releases reject a pandas Index here.
f = plt.figure(figsize=(40,12))
plot_tree(dectree3, filled=True, rounded=True,
          feature_names=list(X3_train.columns),
          class_names=[str(label) for label in dectree3.classes_])

In [None]:
# Predict Response corresponding to Predictors
y_train_pred3 = dectree3.predict(X3_train)
y_test_pred3 = dectree3.predict(X3_test)

# Build each confusion matrix once and reuse it for both the heatmaps and the
# rate calculations. scikit-learn lays the matrix out with actual classes on
# the rows and predicted classes on the columns.
cm_train3 = confusion_matrix(y3_train, y_train_pred3)
cm_test3 = confusion_matrix(y3_test, y_test_pred3)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", dectree3.score(X3_train, y3_train))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", dectree3.score(X3_test, y3_test))
print()

# Plot the Confusion Matrix for Train (left) and Test (right). The panels are
# titled and labelled so they can be told apart in the rendered figure.
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sb.heatmap(cm_train3, annot=True, fmt=".0f", annot_kws={"size": 18}, ax=axes[0])
sb.heatmap(cm_test3, annot=True, fmt=".0f", annot_kws={"size": 18}, ax=axes[1])
axes[0].set(title="Train", xlabel="Predicted", ylabel="Actual")
axes[1].set(title="Test", xlabel="Predicted", ylabel="Actual")

# Print rates for both train and test
print_rates(cm_train3, "Train")
print_rates(cm_test3, "Test")

### 1.4. Decision Tree Classifier for __X4_train__ dataset.

In [None]:
# Depth-4 tree for the fourth split. random_state is pinned so that ties
# between equally good candidate splits are broken the same way on every run,
# which keeps the fitted tree (and its accuracy) reproducible.
dectree4 = DecisionTreeClassifier(max_depth=4, random_state=42)
dectree4.fit(X4_train, y4_train)

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted tree. class_names labels the *target* classes (in the order
# stored in dectree4.classes_), not the predictor columns, so it is derived
# from the fitted model rather than hard-coded. feature_names is passed as a
# plain list because some scikit-learn releases reject a pandas Index here.
f = plt.figure(figsize=(40,12))
plot_tree(dectree4, filled=True, rounded=True,
          feature_names=list(X4_train.columns),
          class_names=[str(label) for label in dectree4.classes_])

In [None]:
# Predict Response corresponding to Predictors
y_train_pred4 = dectree4.predict(X4_train)
y_test_pred4 = dectree4.predict(X4_test)

# Build each confusion matrix once and reuse it for both the heatmaps and the
# rate calculations. scikit-learn lays the matrix out with actual classes on
# the rows and predicted classes on the columns.
cm_train4 = confusion_matrix(y4_train, y_train_pred4)
cm_test4 = confusion_matrix(y4_test, y_test_pred4)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", dectree4.score(X4_train, y4_train))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", dectree4.score(X4_test, y4_test))
print()

# Plot the Confusion Matrix for Train (left) and Test (right). The panels are
# titled and labelled so they can be told apart in the rendered figure.
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sb.heatmap(cm_train4, annot=True, fmt=".0f", annot_kws={"size": 18}, ax=axes[0])
sb.heatmap(cm_test4, annot=True, fmt=".0f", annot_kws={"size": 18}, ax=axes[1])
axes[0].set(title="Train", xlabel="Predicted", ylabel="Actual")
axes[1].set(title="Test", xlabel="Predicted", ylabel="Actual")

# Print rates for both train and test
print_rates(cm_train4, "Train")
print_rates(cm_test4, "Test")

### Test Set Performance Summary

| Metric                              | X1 Test | X2 Test | X3 Test | X4 Test |
|-------------------------------------|---------|---------|---------|---------|
| **Classification Accuracy**         | 0.7604  | 0.8229  | 0.9271  | 0.9167  |
| **False Negative Rate (FNR)**       | 0.8000  | 0.4231  | 0.1154  | 0.2333  |
| **False Positive Rate (FPR)**       | 0.0423  | 0.0857  | 0.0571  | 0.0152  |
| **True Positive Rate (TPR/Recall)** | 0.2000  | 0.5769  | 0.8846  | 0.7667  |
| **True Negative Rate (TNR)**        | 0.9577  | 0.9143  | 0.9429  | 0.9848  |


Test accuracy ranges from __76.04% to 92.71%__. Models trained on __X3 (92.71%)__ and __X4 (91.67%)__ clearly outperform the others, showing __significantly lower FNRs__ (11.54% and 23.33%), while X4 also has the __lowest FPR__ at just 1.52%. Both models also achieve high recall (88.46% and 76.67%) and excellent specificity, with X4 achieving the highest TNR of 98.48%. These results make X3 and X4 the __most accurate and reliable models__ in the group.
