In [None]:

!pip install catboost

# Import necessary libraries
import pandas as pd
from catboost import CatBoostClassifier, Pool
import matplotlib.pyplot as plt

# Location of the thyroid workbook on the Colab filesystem
file_path = '/content/thyroid_dataset/thyroid.xlsx'

# Read the workbook's default sheet into a DataFrame
data = pd.read_excel(io=file_path)

# Preview the leading rows to get a feel for the columns and their values
data.head(n=5)


In [None]:
# Data Preprocessing (first attempt, on the Excel workbook)
# This cell reports missing values, imputes them, and records the assumed target column.
# Encoding and the train/test split are NOT done here (see the TODO at the end of the cell).

# Check for missing values in the dataset
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Handle missing values (if any)
# Numeric columns are filled with their median; every other column (text, categorical, ...)
# is filled with its most frequent value.
# Each column is assigned back explicitly: `data[column].fillna(..., inplace=True)` acts on a
# temporary object under pandas Copy-on-Write and would leave `data` unchanged.
for column in data.columns:
    column_values = data[column]
    is_numeric = (
        pd.api.types.is_numeric_dtype(column_values)
        and not pd.api.types.is_bool_dtype(column_values)
    )
    if is_numeric:
        fill_value = column_values.median()
    else:
        modes = column_values.mode()
        if modes.empty:
            # The column is entirely missing: there is no value to impute from.
            continue
        fill_value = modes.iloc[0]
    data[column] = column_values.fillna(fill_value)

# Since the initial data preview didn't show a 'target' column, let's identify the correct target variable.
# We need to adjust based on the actual column names in the dataset.
# For demonstration, let's assume 'Recurrence' is the target variable. Adjust as necessary.

# Example placeholder for actual target column (adjust as needed)
target_column = 'Recurrence'  # Replace with the correct target column name

# TODO: encode categorical features and split into train/test sets. This cell stops here;
# the later preprocessing cell that works on the UCI features (X) and target (y) performs
# both steps.


# Section: Model Training with CatBoost

In this section, we will train a CatBoost model using the preprocessed data. We will leverage the GPU for training to enhance performance. The goal is to identify the most important features influencing thyroid cancer recurrence.


In [None]:
# Inspecting Columns
# Re-reads the workbook and lists its column labels so the real target variable can be picked out.

# Read the workbook's default sheet again
file_path = '/content/thyroid_dataset/thyroid.xlsx'
data = pd.read_excel(io=file_path)

# Show every column label of the loaded frame
print(
    "Column names in the dataset:\n",
    data.columns,
)


In [None]:
# Inspecting Sheets
# This cell lists all the sheets in the Excel file and previews each one, to identify where
# the actual clinical data is located.

file_path = '/content/thyroid_dataset/thyroid.xlsx'

# Open the workbook once and parse every sheet from that single handle. The `with` block
# closes the underlying file when the cell finishes, instead of leaving it open for the
# lifetime of the kernel.
with pd.ExcelFile(file_path) as xls:
    # List all sheet names
    sheet_names = xls.sheet_names
    print("Sheet names in the Excel file:\n", sheet_names)

    # Display the first few rows of each sheet to identify the relevant one
    for sheet in sheet_names:
        print(f"\nContent of sheet '{sheet}':")
        display(xls.parse(sheet_name=sheet).head())


In [None]:
# Install ucimlrepo package
!pip install ucimlrepo

# Import the necessary module
from ucimlrepo import fetch_ucirepo

# Download dataset id 915 from the UCI ML Repository
differentiated_thyroid_cancer_recurrence = fetch_ucirepo(id=915)

# Feature and target tables exposed by the fetched dataset object
X, y = (
    differentiated_thyroid_cancer_recurrence.data.features,
    differentiated_thyroid_cancer_recurrence.data.targets,
)

# Dataset-level metadata first ...
print(differentiated_thyroid_cancer_recurrence.metadata)

# ... then the per-variable information
print(differentiated_thyroid_cancer_recurrence.variables)


In [None]:
# Confirming Dataset Retrieval
# Previews the head of the features and target frames so we can check they fit the thyroid cancer project.

# Print a caption, then the first rows, for the features frame followed by the target frame
for caption, frame in (
    ("Features dataframe (X):", X),
    ("Target dataframe (y):", y),
):
    print(caption)
    display(frame.head())


In [None]:
# Data Preprocessing
# This cell handles missing values, encodes categorical features, and splits the data into training and testing sets.

# Check for missing values in the dataset
missing_values_X = X.isnull().sum()
missing_values_y = y.isnull().sum()
print("Missing values in features (X):\n", missing_values_X)
print("Missing values in target (y):\n", missing_values_y)

# Handle missing values (if any)
# Numeric columns are filled with their median; every other column (text, categorical, ...)
# with its most frequent value. The fill values are collected first and applied in a single
# `fillna` that rebinds `X` to a new frame: a per-column `fillna(..., inplace=True)` acts on a
# temporary under pandas Copy-on-Write (so nothing would be filled) and, without it, mutates
# the frame owned by the fetched dataset object.
fill_values = {}
for column in X.columns:
    column_values = X[column]
    is_numeric = (
        pd.api.types.is_numeric_dtype(column_values)
        and not pd.api.types.is_bool_dtype(column_values)
    )
    if is_numeric:
        fill_values[column] = column_values.median()
    else:
        modes = column_values.mode()
        if not modes.empty:  # an all-missing column has no mode to impute from
            fill_values[column] = modes.iloc[0]
X = X.fillna(value=fill_values)

# Encode categorical features using one-hot encoding
# drop_first=True drops one level per categorical column to avoid redundant indicator columns.
X_encoded = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
# NOTE(review): consider passing stratify=y if the target classes are imbalanced — confirm
# against the class counts before changing, since it alters the split.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

print("Data preprocessing complete. Ready for modeling.")


In [None]:
# Model Training with CatBoost
# This cell trains a CatBoost model using the preprocessed data and evaluates its performance.

# Install the CatBoost library if not already installed
!pip install catboost

# Import necessary libraries
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Choose the training device: use the GPU when CatBoost can see one, otherwise fall back to
# the CPU so this cell still runs on a CPU-only runtime instead of raising at fit time.
from catboost.utils import get_gpu_device_count

task_type = "GPU" if get_gpu_device_count() > 0 else "CPU"

# Initialize the CatBoostClassifier
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    task_type=task_type,
)

# Prepare the Pool data structure for CatBoost
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

# Train the model with a progress indicator.
# use_best_model=False: the test pool is only *monitored* during training. With CatBoost's
# default (use_best_model is enabled whenever an eval_set is supplied) the number of trees
# would be selected on the test set, which optimistically biases the metrics reported below.
model.fit(train_pool, eval_set=test_pool, use_best_model=False, verbose=100, plot=True)

# Make predictions on the held-out test features
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

# Plot feature importance (reuses the training pool built above rather than a second copy)
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns

plt.figure(figsize=(10, 8))
plt.barh(feature_names, feature_importances)
plt.xlabel("Feature Importance")
plt.title("Feature Importance Analysis")
plt.show()


# Section: Model Evaluation and Analysis

In this section, we will evaluate the performance of the trained CatBoost model and analyze the feature importance to gain insights into the factors influencing thyroid cancer recurrence.


In [None]:
# Model Evaluation and Analysis
# Scores the trained CatBoost model on the test split and charts its feature importances.

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Accuracy and per-class report against the true test labels
accuracy, report = (
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
)

print(f"Accuracy: {accuracy}", f"Classification Report:\n{report}", sep="\n")

# Importances computed against the training data, paired with the encoded column names
feature_names = X_train.columns
feature_importances = model.get_feature_importance(Pool(X_train, y_train))

# Horizontal bar chart: one bar per encoded feature
plt.figure(figsize=(10, 8))
plt.gca().barh(feature_names, feature_importances)
plt.gca().set(xlabel="Feature Importance", title="Feature Importance Analysis")
plt.show()


# Section: Visualizing Prediction Results

In this section, we will visualize the prediction results of the trained CatBoost model. This includes plotting a confusion matrix and visualizing the distribution of predicted probabilities.


In [None]:
# Visualizing Prediction Results
# This cell visualizes the prediction results by plotting a confusion matrix and the distribution of predicted probabilities.

import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted test labels
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the confusion-matrix counts
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.gca().set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

# Second column of predict_proba, labelled below as the probability of recurrence
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Histogram (with KDE overlay) of those predicted probabilities
plt.figure(figsize=(10, 7))
sns.histplot(y_pred_proba, bins=30, kde=True)
plt.gca().set(
    xlabel='Predicted Probability of Recurrence',
    title='Distribution of Predicted Probabilities',
)
plt.show()


In [None]:
# Summarizing Findings
# This cell condenses the key findings from the analysis into a concise executive summary for non-technical stakeholders.

def summarize_findings(accuracy, report, feature_importances, feature_names, top_n=3):
    """Build the markdown-style executive summary of the recurrence analysis.

    Args:
        accuracy: Test-set accuracy; rendered with two decimals.
        report: Pre-formatted classification report text, inserted verbatim.
        feature_importances: Iterable of importance scores, aligned with ``feature_names``.
        feature_names: Iterable of feature labels, aligned with ``feature_importances``.
        top_n: How many of the highest-ranked features to name in the recommendations
            (non-negative; defaults to 3).

    Returns:
        The summary as a single string. Features are listed in descending order of
        importance, and the recommendation names the ``top_n`` highest-ranked ones.
        When no features are supplied the "such as ..." clause is omitted.
    """
    # Rank once and reuse the ranking for both the listing and the recommendation text.
    ranked = sorted(zip(feature_names, feature_importances), key=lambda pair: pair[1], reverse=True)
    top_features = ", ".join(str(name) for name, _ in ranked[:top_n])
    # Only mention examples when there is at least one feature to name.
    focus_clause = f", such as {top_features}" if top_features else ""

    summary = f"""
    ## Executive Summary: Thyroid Cancer Recurrence Analysis

    ### Overview
    This analysis utilized the CatBoost machine learning model to identify key factors influencing thyroid cancer recurrence. The model was trained and evaluated using clinicopathologic features from a comprehensive dataset.

    ### Key Findings
    - **Model Accuracy**: The CatBoost model achieved an accuracy of {accuracy:.2f} on the test set.
    - **Model Performance**: The detailed classification report is as follows:
    {report}
    - **Feature Importance**: The most influential features identified by the model are listed below (in order of importance):
    """
    for name, importance in ranked:
        summary += f"\n      - {name}: {importance:.2f}"

    summary += f"""

    ### Visualizations
    The following visualizations were generated to provide further insights:
    - **Confusion Matrix**: Displayed the performance of the model in predicting recurrence.
    - **Distribution of Predicted Probabilities**: Showed the confidence level of the model's predictions.

    ### Recommendations
    - **Focus on Key Features**: Medical professionals should pay particular attention to the most influential features identified by the model{focus_clause}.
    - **Further Research**: Continued research and model refinement can help improve predictive accuracy and identify additional factors influencing thyroid cancer recurrence.

    ### Conclusion
    The CatBoost model provides valuable insights into the factors influencing thyroid cancer recurrence. These findings can guide clinical decisions and help prioritize areas for further research and intervention.
    """
    return summary

# Recompute the headline metrics and the importances, then render and show the summary
accuracy, report = (
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
)
feature_names = X_train.columns
feature_importances = model.get_feature_importance(Pool(X_train, y_train))

summary = summarize_findings(
    accuracy=accuracy,
    report=report,
    feature_importances=feature_importances,
    feature_names=feature_names,
)
print(summary)


In [None]:
# Generating Detailed Feature Importance Report
# This cell generates a comprehensive report on feature importance with visualizations.

def generate_feature_importance_report(model, X_train, y_train=None):
    """Tabulate and plot the model's feature importances, most important first.

    Args:
        model: Fitted model exposing ``get_feature_importance``.
        X_train: Training features; its ``columns`` supply the feature labels.
        y_train: Optional training labels. When given, importances are computed against
            ``Pool(X_train, y_train)``. When omitted, ``get_feature_importance()`` is called
            without data, so the function no longer depends on a notebook-global ``y_train``.
            NOTE(review): calling without data relies on the model carrying its own
            leaf-weight statistics (CatBoost's default importance type for classifiers);
            pass ``y_train`` if the installed CatBoost version asks for a dataset.

    Returns:
        DataFrame with columns ``Feature`` and ``Importance``, sorted by descending importance.
        The same table is displayed and drawn as a horizontal bar chart as a side effect.
    """
    # Get feature importances
    if y_train is None:
        feature_importances = model.get_feature_importance()
    else:
        feature_importances = model.get_feature_importance(Pool(X_train, y_train))
    feature_names = X_train.columns

    # Create a DataFrame for feature importances
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)

    # Display the DataFrame
    display(importance_df)

    # Plot feature importance
    plt.figure(figsize=(12, 10))
    plt.barh(importance_df['Feature'], importance_df['Importance'])
    plt.xlabel("Feature Importance")
    plt.title("Detailed Feature Importance Analysis")
    plt.gca().invert_yaxis()  # Invert y-axis to show the most important feature at the top
    plt.show()

    return importance_df

# Build (and display) the ranked importance table for the trained model
importance_df = generate_feature_importance_report(model, X_train)

# Refresh the metrics, then print the summary using the ranked table's columns
accuracy, report = (
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
)

summary = summarize_findings(
    accuracy,
    report,
    importance_df['Importance'],
    importance_df['Feature'],
)
print(summary)


In [None]:
# Install necessary library
!pip install fpdf

# Import necessary libraries
from fpdf import FPDF
import matplotlib.pyplot as plt

# Function to create a PDF report
class PDF(FPDF):
    """FPDF subclass with a fixed report header and helpers for titles, body text and charts."""

    def header(self):
        """Write the report title in bold 12pt Arial, centre-aligned, then a 10-unit break.

        NOTE(review): FPDF is expected to call this hook itself for each page started
        with add_page() — confirm against the installed fpdf version.
        """
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Thyroid Cancer Recurrence Analysis Report', 0, 1, 'C')
        self.ln(10)

    def chapter_title(self, title):
        """Write ``title`` in bold 12pt Arial, left-aligned, followed by a 5-unit break."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, 0, 1, 'L')
        self.ln(5)

    def chapter_body(self, body):
        """Write ``body`` in regular 12pt Arial via multi_cell (line height 10), then a line break."""
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, body)
        self.ln()

    def add_chart(self, image_path):
        """Place the image at ``image_path`` with width 190 and no explicit x/y, then a line break.

        NOTE(review): presumably FPDF positions the image at the current cursor and scales
        the height to keep the aspect ratio when only ``w`` is given — confirm.
        """
        self.image(image_path, x=None, y=None, w=190)
        self.ln()

# Create PDF document
pdf = PDF()

# Add Title Page
pdf.add_page()
pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, 'Thyroid Cancer Recurrence Analysis', 0, 1, 'C')
pdf.ln(10)
pdf.set_font('Arial', 'I', 12)
pdf.cell(0, 10, 'Executive Summary', 0, 1, 'C')
pdf.ln(20)

# Add Executive Summary
pdf.add_page()
pdf.chapter_title('Executive Summary')
summary_body = (
    "Overview:\n"
    "This analysis utilized the CatBoost machine learning model to identify key factors influencing thyroid cancer recurrence.\n\n"
    "Key Findings:\n"
    f"- Model Accuracy: {accuracy:.2f}\n"
    "- Feature Importance:\n"
)
# One bullet per feature, in the order of importance_df (already sorted, most important first)
summary_body += "".join(
    f"  - {feature}: {importance:.2f}\n"
    for feature, importance in zip(importance_df['Feature'], importance_df['Importance'])
)

summary_body += (
    "\n"
    "Recommendations:\n"
    "- Focus on Key Factors: Efforts should be directed towards understanding and monitoring the most influential factors identified by the analysis.\n"
    "- Resource Allocation: Allocate resources to areas with the highest impact on recurrence prediction and patient outcomes.\n"
    "- Further Research: Encourage further research to refine predictive models and uncover additional factors influencing recurrence.\n\n"
    "Conclusion:\n"
    "The insights gained from this analysis provide a foundation for data-driven decision-making, aimed at improving patient outcomes and optimizing resource utilization.\n"
)
# The built-in Arial font only covers Latin-1; replace anything outside it (e.g. a symbol in
# a feature label) with '?' rather than letting the PDF writer raise UnicodeEncodeError.
pdf.chapter_body(summary_body.encode('latin-1', 'replace').decode('latin-1'))

# Save feature importance chart
fig, ax = plt.subplots(figsize=(12, 10))
ax.barh(importance_df['Feature'], importance_df['Importance'])
ax.set_xlabel("Feature Importance")
ax.set_title("Detailed Feature Importance Analysis")
ax.invert_yaxis()  # most important feature at the top
chart_path = '/content/feature_importance.png'
# bbox_inches='tight' keeps long one-hot feature labels from being cut off at the left edge.
fig.savefig(chart_path, bbox_inches='tight')
plt.close(fig)

# Add Feature Importance Chart
pdf.add_page()
pdf.chapter_title('Feature Importance Chart')
pdf.add_chart(chart_path)

# Save PDF
pdf_output_path = '/content/Thyroid_Cancer_Recurrence_Analysis_Report.pdf'
pdf.output(pdf_output_path)

print(f"PDF report has been generated and saved to {pdf_output_path}")


In [None]:
# Install necessary library
!pip install fpdf

# Import necessary libraries
from fpdf import FPDF
import matplotlib.pyplot as plt
import pandas as pd

# Function to create a PDF report
class PDF(FPDF):
    """FPDF subclass with a fixed report header and helpers for titles, body text and charts.

    NOTE(review): this repeats the PDF class defined in the previous report cell line for
    line; running this cell rebinds the name to this definition.
    """

    def header(self):
        """Write the report title in bold 12pt Arial, centre-aligned, then a 10-unit break.

        NOTE(review): FPDF is expected to call this hook itself for each page started
        with add_page() — confirm against the installed fpdf version.
        """
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Thyroid Cancer Recurrence Analysis Report', 0, 1, 'C')
        self.ln(10)

    def chapter_title(self, title):
        """Write ``title`` in bold 12pt Arial, left-aligned, followed by a 5-unit break."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, 0, 1, 'L')
        self.ln(5)

    def chapter_body(self, body):
        """Write ``body`` in regular 12pt Arial via multi_cell (line height 10), then a line break."""
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, body)
        self.ln()

    def add_chart(self, image_path):
        """Place the image at ``image_path`` with width 190 and no explicit x/y, then a line break.

        NOTE(review): presumably FPDF positions the image at the current cursor and scales
        the height to keep the aspect ratio when only ``w`` is given — confirm.
        """
        self.image(image_path, x=None, y=None, w=190)
        self.ln()

# Create PDF document
pdf = PDF()

# Add Title Page
pdf.add_page()
pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, 'Thyroid Cancer Recurrence Analysis', 0, 1, 'C')
pdf.ln(10)
pdf.set_font('Arial', 'I', 12)
pdf.cell(0, 10, 'Executive Summary', 0, 1, 'C')
pdf.ln(20)

# Add Executive Summary
pdf.add_page()
pdf.chapter_title('Executive Summary')
# The accuracy is interpolated with an f-string on its own line; calling .format() on the
# implicitly concatenated literal would break as soon as any other line contained a brace.
summary_body = (
    "Overview:\n"
    "This analysis utilized the CatBoost machine learning model to identify key factors influencing thyroid cancer recurrence.\n\n"
    "Key Findings:\n"
    f"- Model Accuracy: {accuracy:.2f}\n"
    "- Feature Importance:\n"
)

# Each entry maps a feature label to (description, clinical relevance, recommendation).
feature_explanations = {
    "Response_Structural Incomplete": (
        "Refers to the presence of structural abnormalities post-treatment.",
        "Patients with this response may have a higher recurrence risk.",
        "Intensify follow-up protocols for these patients."
    ),
    "Age": (
        "Indicates the patient's age.",
        "Different age groups may have varying risks.",
        "Tailor treatment plans based on age."
    ),
    "Thyroid Function": (
        "Status of the patient's thyroid function.",
        "Abnormal function may increase recurrence risk.",
        "Provide specialized care for patients with abnormal thyroid function."
    ),
    # Add more feature explanations as needed
}


def _lookup_explanation(feature_name, explanations, default):
    """Return the explanation for a feature, falling back to its un-encoded column name.

    One-hot encoded labels have the form "<column>_<level>", so a key such as
    "Thyroid Function" never equals an encoded label. The exact label is tried first;
    then the trailing "_<suffix>" is stripped repeatedly until a key matches or no
    underscore is left, in which case ``default`` is returned.
    """
    candidate = str(feature_name)
    while candidate not in explanations:
        if "_" not in candidate:
            return default
        candidate = candidate.rsplit("_", 1)[0]
    return explanations[candidate]


no_explanation = (
    "No description available.",
    "No clinical relevance available.",
    "No recommendation available.",
)

for _, row in importance_df.iterrows():
    feature = row['Feature']
    importance = row['Importance']
    description, relevance, recommendation = _lookup_explanation(
        feature, feature_explanations, no_explanation
    )
    summary_body += (
        f"  - {feature} ({importance:.2f}):\n"
        f"    - Description: {description}\n"
        f"    - Clinical Relevance: {relevance}\n"
        f"    - Recommendation: {recommendation}\n"
    )

summary_body += (
    "\nRecommendations:\n"
    "- Focus on Key Factors: Efforts should be directed towards understanding and monitoring the most influential factors identified by the analysis.\n"
    "- Resource Allocation: Allocate resources to areas with the highest impact on recurrence prediction and patient outcomes.\n"
    "- Further Research: Encourage further research to refine predictive models and uncover additional factors influencing recurrence.\n\n"
    "Conclusion:\n"
    "The insights gained from this analysis provide a foundation for data-driven decision-making, aimed at improving patient outcomes and optimizing resource utilization.\n"
)
# The built-in Arial font only covers Latin-1; replace anything outside it (e.g. a symbol in
# a feature label) with '?' rather than letting the PDF writer raise UnicodeEncodeError.
pdf.chapter_body(summary_body.encode('latin-1', 'replace').decode('latin-1'))

# Save feature importance chart
fig, ax = plt.subplots(figsize=(12, 10))
ax.barh(importance_df['Feature'], importance_df['Importance'])
ax.set_xlabel("Feature Importance")
ax.set_title("Detailed Feature Importance Analysis")
ax.invert_yaxis()  # most important feature at the top
chart_path = '/content/feature_importance.png'
# bbox_inches='tight' keeps long one-hot feature labels from being cut off at the left edge.
fig.savefig(chart_path, bbox_inches='tight')
plt.close(fig)

# Add Feature Importance Chart
pdf.add_page()
pdf.chapter_title('Feature Importance Chart')
pdf.add_chart(chart_path)

# Save PDF
pdf_output_path = '/content/Thyroid_Cancer_Recurrence_Dataset_Review.pdf'
pdf.output(pdf_output_path)

print(f"PDF report has been generated and saved to {pdf_output_path}")
