# Leveraging CatBoost to identify features in recurrence

This notebook uses CatBoost to analyze clinicopathologic features from a differentiated thyroid cancer dataset. The primary goal is to identify the features that most strongly influence thyroid cancer recurrence. Training runs on a GPU when one is available and falls back to the CPU otherwise. A PDF summary of the findings is generated at the end of the notebook.

## UCI Dataset

Dataset: [UCI Machine Learning Repository - Differentiated Thyroid Cancer Recurrence Dataset](https://archive.ics.uci.edu/dataset/915/differentiated+thyroid+cancer+recurrence)

Citation: Borzooei, Shiva and Tarokhian, Aidin. (2023). Differentiated Thyroid Cancer Recurrence. UCI Machine Learning Repository. https://doi.org/10.24432/C5632J.

## Google Colab

You may run this notebook on Google Colab by clicking the "Open in Colab" badge below:

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/OpenBioResearch/disease-focused-uci-ml-repos/blob/main/thyroid_cancer_recurrence_catboost.ipynb)


In [None]:
# Import necessary libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Install ucimlrepo package

!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo

# Fetch dataset from UCI ML Repository
differentiated_thyroid_cancer_recurrence = fetch_ucirepo(id=915)

# Data (as pandas dataframes)
X = differentiated_thyroid_cancer_recurrence.data.features
y = differentiated_thyroid_cancer_recurrence.data.targets

print(differentiated_thyroid_cancer_recurrence.metadata)
print(differentiated_thyroid_cancer_recurrence.variables)

In [None]:
# Preview the first rows of the features and target dataframes

for caption, frame in (
    ("Features dataframe (X):", X),
    ("Target dataframe (y):", y),
):
    # Plain-text caption, then the rich (HTML) rendering of the head rows
    print(caption)
    display(frame.head())

In [None]:
# Report missing values, impute them, one-hot encode the features, and split
# the data into training and testing sets.

missing_values_X = X.isnull().sum()
missing_values_y = y.isnull().sum()
print("Missing values in features (X):\n", missing_values_X)
print("Missing values in target (y):\n", missing_values_y)

# Impute on a copy and assign each column back explicitly.
# `X[column].fillna(..., inplace=True)` operates on a temporary column object:
# it emits a chained-assignment warning on pandas 2.x and silently does
# nothing under Copy-on-Write, leaving the NaNs in place.
X = X.copy()
for column in X.columns:
    if not X[column].isnull().any():
        continue  # nothing to impute in this column
    if pd.api.types.is_numeric_dtype(X[column]):
        # Numeric columns: fill with the median
        X[column] = X[column].fillna(X[column].median())
    else:
        # Object / string / categorical columns: fill with the most frequent value
        X[column] = X[column].fillna(X[column].mode()[0])

# Encode categorical features using one-hot encoding
# (drop_first=True drops one level per feature to avoid redundant columns)
X_encoded = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets (80/20, fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

print("-------------------")
print("Data Split Summary:")
print(f"Training Set: {len(X_train)} samples")
print(f"Testing Set: {len(X_test)} samples")
print("-------------------")
print("Data preprocessing complete. Ready for modeling.")


In [None]:
# Model Training with CatBoost
!pip install catboost

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report
import torch

# Check for GPU availability and set device
if torch.cuda.is_available():
    device = torch.device('cuda')
    task_type = "GPU"
    print("Training on GPU")
else:
    device = torch.device('cpu')
    task_type = "CPU"
    print("Training on CPU")

# Initialize the CatBoostClassifier
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    task_type=task_type
)

# Prepare the Pool data structure for CatBoost
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

# Train the model with a progress indicator
model.fit(train_pool, eval_set=test_pool, verbose=100, plot=True)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")



In [None]:
from sklearn.metrics import confusion_matrix

def generate_feature_importance_report(model, X_train, y_train=None):
    """Display and plot the feature importances of a fitted CatBoost model.

    Args:
        model: Fitted CatBoost model exposing ``get_feature_importance``.
        X_train: DataFrame of training features; its column names are used
            as the feature names in the report.
        y_train: Optional training labels. When given, they are attached to
            the Pool passed to ``get_feature_importance``. When omitted, an
            unlabelled Pool is used, so the function no longer depends on a
            notebook-level ``y_train`` variable.
            NOTE(review): CatBoost's default importance type for
            classifiers is expected not to need labels -- confirm if a
            different importance type is ever requested.

    Returns:
        DataFrame with columns ``Feature`` and ``Importance``, sorted by
        importance in descending order.
    """
    # Build the Pool from the arguments only (no hidden global state)
    if y_train is None:
        importance_pool = Pool(X_train)
    else:
        importance_pool = Pool(X_train, y_train)

    # Get feature importances
    feature_importances = model.get_feature_importance(importance_pool)
    feature_names = X_train.columns

    # Create a DataFrame for feature importances
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)

    display(importance_df)

    plt.figure(figsize=(12, 10))
    plt.barh(importance_df['Feature'], importance_df['Importance'])
    plt.xlabel("Feature Importance")
    plt.title("Detailed Feature Importance Analysis")
    plt.gca().invert_yaxis()  # Invert y-axis to show the most important feature at the top
    plt.show()

    return importance_df

# Generate the feature importance report
importance_df = generate_feature_importance_report(model, X_train)

# Calculate the confusion matrix with an explicit class order, so the heatmap
# axes can show class names instead of anonymous 0/1 indices.
class_labels = list(model.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Probability of the second class in `model.classes_`.
# NOTE(review): presumably this is the recurrence class -- confirm against
# `model.classes_` before interpreting the histogram.
y_pred_proba = model.predict_proba(X_test)[:, 1]

plt.figure(figsize=(10, 7))
sns.histplot(y_pred_proba, bins=30, kde=True)
plt.xlabel('Predicted Probability of Recurrence')
plt.title('Distribution of Predicted Probabilities')
plt.show()


In [None]:
!pip install fpdf
from fpdf import FPDF
from datetime import date

def _embed_current_figure(pdf, image_path, width, caption):
    """Save the active matplotlib figure, embed it in the PDF, then add a caption."""
    try:
        plt.savefig(image_path)
    finally:
        # Release the figure even if saving fails, so figures do not pile up
        # in the kernel's memory.
        plt.close()
    pdf.image(image_path, w=width)
    pdf.multi_cell(0, 10, txt=caption, align="L")


def generate_pdf_report(accuracy, report, importance_df, cm, y_pred_proba,
                        output_path="thyroid_cancer_report.pdf"):
    """Write a PDF summary of the model evaluation.

    The report contains an executive summary (accuracy and classification
    report with a short explanation of each metric), the feature importance
    bar chart, the confusion matrix heatmap, and the histogram of predicted
    probabilities. Each figure is also saved as a PNG in the working
    directory (``feature_importance.png``, ``confusion_matrix.png``,
    ``predicted_probabilities.png``).

    Args:
        accuracy: Accuracy as a fraction; rendered as a percentage.
        report: Text of the classification report.
        importance_df: DataFrame with ``Feature`` and ``Importance`` columns.
        cm: Confusion matrix (2-D array of integer counts).
        y_pred_proba: Predicted probabilities to plot as a histogram.
        output_path: Destination of the PDF file. Defaults to the original
            hard-coded name, so existing callers are unaffected.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Executive Summary (width 0 = extend the cell to the right margin, which
    # centres the heading within the printable area)
    pdf.cell(0, 10, txt="Executive Summary", ln=True, align="C")
    pdf.cell(0, 10, txt="This report summarizes the analysis of thyroid cancer recurrence using CatBoost.", ln=True, align="L")
    pdf.multi_cell(0, 10, txt=f"The model achieved an accuracy of {accuracy*100:.2f}%. This indicates how well the model can predict whether thyroid cancer will recur. ", align="L")
    pdf.multi_cell(0, 10, txt="The classification report provides a detailed breakdown of the model's performance for each class (recurrence or no recurrence):", align="L")

    # The classification report is a space-aligned table: render it in a
    # monospaced font so its columns line up, then restore the body font.
    pdf.set_font("Courier", size=10)
    pdf.multi_cell(0, 5, txt=report, align="L")
    pdf.set_font("Arial", size=12)

    pdf.multi_cell(0, 10, txt="Precision measures how many of the positive predictions made by the model were actually correct. For instance, a precision of 0.8 for recurrence means 80% of the cases predicted as recurrence were actually recurrences.", align="L")
    pdf.multi_cell(0, 10, txt="Recall measures how many of the actual positive cases the model was able to correctly identify. A high recall indicates the model is good at detecting actual recurrence cases.", align="L")
    pdf.multi_cell(0, 10, txt="The F1-score, a harmonic mean of precision and recall, provides a balanced assessment of the model's performance. It considers both false positives and false negatives.", align="L")
    pdf.multi_cell(0, 10, txt="Support is the number of samples in each class (recurrence and no recurrence). It gives an idea of the data distribution.", align="L")

    # Feature Importance Plot
    plt.figure(figsize=(12, 10))
    plt.barh(importance_df['Feature'], importance_df['Importance'])
    plt.xlabel("Feature Importance")
    plt.title("Detailed Feature Importance Analysis")
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.tight_layout()
    _embed_current_figure(
        pdf, "feature_importance.png", 180,
        "The feature importance plot above shows the relative importance of each feature in predicting recurrence. By identifying the most important features, we can gain insights into the key factors driving recurrence and potentially focus on these for further investigation or treatment strategies.",
    )

    # Confusion Matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    _embed_current_figure(
        pdf, "confusion_matrix.png", 150,
        "The confusion matrix illustrates the model's performance by showing the number of true positives, true negatives, false positives, and false negatives. This helps us understand where the model is making correct predictions and where it's making errors, allowing for targeted improvement.",
    )

    # Distribution of Predicted Probabilities
    plt.figure(figsize=(10, 7))
    sns.histplot(y_pred_proba, bins=30, kde=True)
    plt.xlabel('Predicted Probability of Recurrence')
    plt.title('Distribution of Predicted Probabilities')
    _embed_current_figure(
        pdf, "predicted_probabilities.png", 150,
        "The distribution of predicted probabilities shows how confident the model is in its predictions. A higher peak towards the right indicates greater confidence in predicting recurrence, which can be useful for clinical decision-making.",
    )

    pdf.output(output_path)

# Build the PDF report from the metrics and artefacts computed in the cells above
generate_pdf_report(
    accuracy=accuracy,
    report=report,
    importance_df=importance_df,
    cm=cm,
    y_pred_proba=y_pred_proba,
)