In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer


In [None]:
# Read the raw CSV and remove the Patient_ID column so it is not carried
# into the feature matrix built later.
data = pd.read_csv('thyroid_cancer_risk_data.csv').drop(columns=['Patient_ID'])

In [None]:

# --- Binary columns -> 0/1 --------------------------------------------------
binary_cols = ['Gender', 'Family_History', 'Radiation_Exposure',
               'Iodine_Deficiency', 'Smoking', 'Obesity', 'Diabetes']
binary_mapping = {'Male': 1, 'Female': 0, 'Yes': 1, 'No': 0}
# `replace` leaves values that are not mapping keys untouched. `pd.to_numeric`
# then gives every column an explicit numeric dtype (instead of relying on the
# implicit object->int downcasting of `replace`, which pandas has deprecated)
# and raises ValueError if an unexpected string label was left unencoded.
data[binary_cols] = data[binary_cols].replace(binary_mapping).apply(pd.to_numeric)

# --- Ordinal risk level -> 0/1/2 --------------------------------------------
risk_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
# `replace` is used rather than `map` so the cell can be re-run safely: `map`
# would turn the already-encoded 0/1/2 values (and any unexpected label) into
# NaN without any error. Missing values stay NaN; unknown string labels raise.
data['Thyroid_Cancer_Risk'] = pd.to_numeric(
    data['Thyroid_Cancer_Risk'].replace(risk_mapping)
)

# --- Feature matrix ---------------------------------------------------------
# One-hot encode the nominal columns; every other column is passed through
# unchanged. Categories not seen during fit are encoded as all zeros.
categorical_cols = ['Country', 'Ethnicity']
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough'
)
# NOTE(review): the transformer is fitted on the full dataset, i.e. before the
# train/test split performed in a later cell.
X = preprocessor.fit_transform(data.drop('Diagnosis', axis=1))

# --- Target -----------------------------------------------------------------
# LabelEncoder assigns integer codes to the sorted class labels; the intended
# encoding is Benign=0, Malignant=1. The fitted encoder is kept so that the
# actual mapping can be checked via `label_encoder.classes_`.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Diagnosis'])

In [None]:
from imblearn.over_sampling import SMOTE

# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Resample the training partition only; the test partition is left as-is.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Seed shared by all estimators so that repeated runs of this cell select the
# same hyperparameters (the split and SMOTE steps are seeded with 42 as well).
SEED = 42


def _tune_for_recall(estimator, param_grid):
    """Grid-search ``estimator`` on the resampled training data.

    Parameters
    ----------
    estimator : unfitted classifier
        Model whose hyperparameters are tuned.
    param_grid : dict
        Maps parameter names to the list of values to try.

    Returns
    -------
    GridSearchCV
        The fitted search (5-fold CV, scored by recall); the refitted winner
        is available as ``best_estimator_``.
    """
    # NOTE(review): X_train_smote was oversampled before cross-validation, so
    # validation folds include synthetic samples and the CV recall used for
    # model selection is likely optimistic. Consider moving SMOTE into an
    # imblearn Pipeline that is passed to GridSearchCV.
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='recall')
    search.fit(X_train_smote, y_train_smote)
    return search


# Decision Tree
dt_params = {'max_depth': [3, 5, 7], 'min_samples_split': [2, 5]}
dt = _tune_for_recall(DecisionTreeClassifier(random_state=SEED), dt_params)

# Random Forest
rf_params = {'n_estimators': [100, 200], 'max_depth': [5, 7]}
rf = _tune_for_recall(
    RandomForestClassifier(class_weight='balanced', random_state=SEED),
    rf_params,
)

# Gradient Boosting
gb_params = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]}
gb = _tune_for_recall(GradientBoostingClassifier(random_state=SEED), gb_params)

# XGBoost
xgb_params = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]}
xgb = _tune_for_recall(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=SEED),
    xgb_params,
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    X_test : array-like
        Features to predict on.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    dict
        Keys 'Accuracy', 'Precision', 'Recall', 'F1-Score' (computed from the
        predicted labels) and 'AUC-ROC' (computed from predicted probabilities).
    """
    predicted_labels = model.predict(X_test)

    # Metrics that compare hard label predictions with the truth.
    label_scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
    scores = {name: scorer(y_test, predicted_labels) for name, scorer in label_scorers}

    # ROC AUC needs a continuous score: use the second column of predict_proba.
    scores['AUC-ROC'] = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    return scores

# Evaluate the best estimator of each grid search on the test partition,
# in the order: decision tree, random forest, gradient boosting, XGBoost.
dt_metrics, rf_metrics, gb_metrics, xgb_metrics = (
    evaluate_model(search.best_estimator_, X_test, y_test)
    for search in (dt, rf, gb, xgb)
)

In [None]:
# Pair each display name with its metrics dict, then build one table with a
# row per model and a column per metric.
labelled_metrics = (
    ('Decision Tree', dt_metrics),
    ('Random Forest', rf_metrics),
    ('Gradient Boosting', gb_metrics),
    ('XGBoost', xgb_metrics),
)
row_labels, metric_rows = zip(*labelled_metrics)
results = pd.DataFrame(list(metric_rows), index=list(row_labels))
print(results)