In [19]:
import numpy as np
import pandas as pd
import json
import pickle


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE

In [21]:
# Load the raw dataset and discard the patient identifier, which carries no
# predictive signal and must not reach the models.
data = pd.read_csv('thyroid_cancer_risk_data.csv').drop(columns='Patient_ID')

In [22]:
# Preview the first five rows of the raw (still string-encoded) frame.
data.head(n=5)

Unnamed: 0,Age,Gender,Country,Ethnicity,Family_History,Radiation_Exposure,Iodine_Deficiency,Smoking,Obesity,Diabetes,TSH_Level,T3_Level,T4_Level,Nodule_Size,Thyroid_Cancer_Risk,Diagnosis
0,66,Male,Russia,Caucasian,No,Yes,No,No,No,No,9.37,1.67,6.16,1.08,Low,Benign
1,29,Male,Germany,Hispanic,No,Yes,No,No,No,No,1.83,1.73,10.54,4.05,Low,Benign
2,86,Male,Nigeria,Caucasian,No,No,No,No,No,No,6.26,2.59,10.57,4.61,Low,Benign
3,75,Female,India,Asian,No,No,No,No,No,No,4.1,2.62,11.04,2.46,Medium,Benign
4,35,Female,Germany,African,Yes,Yes,No,No,No,No,9.1,2.11,10.71,2.11,High,Benign


In [23]:
binary_cols = ['Gender', 'Family_History', 'Radiation_Exposure',
               'Iodine_Deficiency', 'Smoking', 'Obesity', 'Diabetes']
binary_mapping = {'Male': 1, 'Female': 0, 'Yes': 1, 'No': 0}
risk_mapping = {'Low': 0, 'Medium': 1, 'High': 2}


def _encode_labels(column, mapping):
    """Return ``column`` with its string labels replaced by ``mapping`` codes.

    Args:
        column: pandas Series holding the raw labels.
        mapping: dict from raw label to integer code.

    Returns:
        The encoded Series. A column that is already numeric is returned
        unchanged, so re-running this cell does not wipe previously encoded
        values (mapping integers through a string-keyed dict yields all NaN).

    Raises:
        ValueError: if a non-null label has no entry in ``mapping``.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column  # already encoded by an earlier run of this cell
    encoded = column.map(mapping)
    # Only labels that were present but not covered by the mapping are errors;
    # values that were missing in the raw data stay missing.
    unmapped = column[encoded.isna() & column.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unexpected label(s) in column {column.name!r}: {list(unmapped)}"
        )
    return encoded


for col in binary_cols:
    data[col] = _encode_labels(data[col], binary_mapping)

data['Thyroid_Cancer_Risk'] = _encode_labels(data['Thyroid_Cancer_Risk'], risk_mapping)

X = data.drop('Diagnosis', axis=1)
y = data['Diagnosis'].map({'Benign': 0, 'Malignant': 1})  # Explicit target mapping


In [24]:
# Inspect the last five rows to confirm the binary and risk columns are now numeric.
data.tail(n=5)

Unnamed: 0,Age,Gender,Country,Ethnicity,Family_History,Radiation_Exposure,Iodine_Deficiency,Smoking,Obesity,Diabetes,TSH_Level,T3_Level,T4_Level,Nodule_Size,Thyroid_Cancer_Risk,Diagnosis
212686,58,0,India,Asian,0,0,0,0,1,0,2.0,0.64,11.92,1.48,0,Benign
212687,89,1,Japan,Middle Eastern,0,0,0,0,1,0,9.77,3.25,7.3,4.46,1,Benign
212688,72,0,Nigeria,Hispanic,0,0,0,0,0,1,7.72,2.44,8.71,2.36,1,Benign
212689,85,0,Brazil,Middle Eastern,0,0,0,0,0,1,5.62,2.53,9.62,1.54,1,Benign
212690,46,0,Japan,Middle Eastern,0,0,0,1,0,0,5.6,2.73,10.59,2.53,0,Malignant


In [25]:
# Any Diagnosis label outside {'Benign', 'Malignant'} becomes NaN in y.
# Print the count, and also fail loudly so a bad mapping cannot slip through.
n_unmapped_targets = int(y.isnull().sum())
print(n_unmapped_targets)  # Check how many NaNs are in y
assert n_unmapped_targets == 0, (
    f"{n_unmapped_targets} Diagnosis value(s) were not mapped to 0/1"
)


0


In [26]:
# Hold out 30% of the rows for testing; stratifying on the target keeps the
# Benign/Malignant ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

In [27]:
# One-hot encode the two nominal columns; every other column is passed through
# untouched. Categories unseen during fit are encoded as all zeros.
categorical_cols = ['Country', 'Ethnicity']
nominal_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[('cat', nominal_encoder, categorical_cols)],
    remainder='passthrough',
)

In [28]:
# Learn the encoding from the training split only, then apply the fitted
# transformer to the test split (the tuple is evaluated left to right, so the
# fit happens before the test transform).
X_train_processed, X_test_processed = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)


In [29]:
# Oversample the minority class on the training split only; the test split is
# left at its natural class balance.
smote = SMOTE(random_state=42)
resampled_train = smote.fit_resample(X_train_processed, y_train)
X_train_smote, y_train_smote = resampled_train


In [30]:
# Persist the fitted preprocessor so inference code can reproduce the encoding.
with open('preprocessor.pkl', mode='wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

In [31]:
def save_model_and_params(model, model_name):
    """Save a fitted search's best estimator (pickle) and tuning summary (JSON).

    Two files are written to the current working directory:
    ``{model_name}_model.pkl`` containing ``model.best_estimator_`` and
    ``{model_name}_params.json`` containing ``best_params`` and ``best_score``.

    Args:
        model: Fitted search object exposing ``best_estimator_``,
            ``best_params_`` and ``best_score_`` (e.g. a fitted GridSearchCV).
        model_name: Prefix used for both output file names.

    Raises:
        TypeError: if a parameter value cannot be converted to a JSON type.
    """
    def _to_builtin(value):
        # numpy scalars and arrays are not JSON serializable, but they expose
        # ``tolist()``, which returns the equivalent built-in Python object.
        if hasattr(value, 'tolist'):
            return value.tolist()
        raise TypeError(
            f'Object of type {type(value).__name__} is not JSON serializable'
        )

    # Save model
    with open(f'{model_name}_model.pkl', 'wb') as f:
        pickle.dump(model.best_estimator_, f)

    # Save parameters
    params = {
        'best_params': model.best_params_,
        'best_score': float(model.best_score_)  # Convert numpy float to Python float
    }
    with open(f'{model_name}_params.json', 'w') as f:
        # ``default`` is only consulted for values json cannot encode natively,
        # so grids made of plain Python values produce the same file as before.
        json.dump(params, f, indent=2, default=_to_builtin)


In [35]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Grid-search tree depth and split size, optimising recall on the positive
# (Malignant) class. The estimator is seeded so repeated runs pick the same
# best model, matching the seeded split and SMOTE steps.
# NOTE(review): X_train_smote was oversampled before cross-validation, so the
# validation folds contain synthetic samples and the CV recall is likely
# optimistic compared with the held-out test set. Confirm before relying on
# best_score_.
dt_params = {'max_depth': [3, 5, 7], 'min_samples_split': [2, 5]}
dt = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_params,
                cv=5, scoring='recall', verbose=3, n_jobs=-1)
dt.fit(X_train_smote, y_train_smote)
save_model_and_params(dt, 'decision_tree')


Fitting 2 folds for each of 6 candidates, totalling 12 fits


In [36]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Grid-search forest size and depth, optimising recall on the positive
# (Malignant) class. Bootstrapping and feature sampling are random, so the
# estimator is seeded to make the selected model reproducible.
# NOTE(review): X_train_smote was oversampled before cross-validation, so the
# validation folds contain synthetic samples and the CV recall is likely
# optimistic compared with the held-out test set.
rf_params = {'n_estimators': [100, 200], 'max_depth': [5, 7]}
rf = GridSearchCV(RandomForestClassifier(random_state=42), rf_params,
                cv=5, scoring='recall', verbose=3, n_jobs=-1)
rf.fit(X_train_smote, y_train_smote)
save_model_and_params(rf, 'random_forest')


Fitting 2 folds for each of 4 candidates, totalling 8 fits


In [37]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Grid-search the number of stages, learning rate and tree depth, optimising
# recall on the positive (Malignant) class. The estimator is seeded so the
# selected model is reproducible across runs.
# NOTE(review): X_train_smote was oversampled before cross-validation, so the
# validation folds contain synthetic samples and the CV recall is likely
# optimistic compared with the held-out test set.
gb_params = {'n_estimators': [100, 200],
             'learning_rate': [0.01, 0.1],
             'max_depth': [3, 5]}
gb = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params,
                cv=5, scoring='recall', verbose=3, n_jobs=-1)
gb.fit(X_train_smote, y_train_smote)
save_model_and_params(gb, 'gradient_boosting')


Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 3/5] END ..max_depth=3, min_samples_split=2;, score=0.652 total time=   2.1s
[CV 4/5] END ..max_depth=5, min_samples_split=2;, score=0.748 total time=   2.4s
[CV 5/5] END ..max_depth=5, min_samples_split=5;, score=0.743 total time=   2.2s
[CV 2/5] END ..max_depth=7, min_samples_split=5;, score=0.623 total time=   3.0s
[CV 2/2] END ..max_depth=5, min_samples_split=2;, score=0.723 total time=   1.5s
[CV 2/2] END .....max_depth=5, n_estimators=200;, score=0.779 total time=  43.0s
[CV 2/2] END learning_rate=0.01, max_depth=5, n_estimators=200;, score=0.945 total time= 5.2min
[CV 1/5] END ..max_depth=3, min_samples_split=5;, score=0.449 total time=   2.4s
[CV 1/5] END ..max_depth=5, min_samples_split=5;, score=0.438 total time=   2.6s
[CV 4/5] END ..max_depth=7, min_samples_split=2;, score=0.820 total time=   2.9s
[CV 1/2] END ..max_depth=3, min_samples_split=2;, score=0.529 total time=   1.3s
[CV 1/2] END ..max_depth=7, min_sa

In [37]:
# Gradient Boosting
# NOTE(review): this cell repeats the Gradient Boosting cell directly above it
# (same execution count). Running both refits the search and overwrites the
# same output files; keep only one copy.
from sklearn.ensemble import GradientBoostingClassifier

# The estimator is seeded so the selected model is reproducible across runs.
gb_params = {'n_estimators': [100, 200],
             'learning_rate': [0.01, 0.1],
             'max_depth': [3, 5]}
gb = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params,
                cv=5, scoring='recall', verbose=3, n_jobs=-1)
gb.fit(X_train_smote, y_train_smote)
save_model_and_params(gb, 'gradient_boosting')


Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 3/5] END ..max_depth=3, min_samples_split=2;, score=0.652 total time=   2.1s
[CV 4/5] END ..max_depth=5, min_samples_split=2;, score=0.748 total time=   2.4s
[CV 5/5] END ..max_depth=5, min_samples_split=5;, score=0.743 total time=   2.2s
[CV 2/5] END ..max_depth=7, min_samples_split=5;, score=0.623 total time=   3.0s
[CV 2/2] END ..max_depth=5, min_samples_split=2;, score=0.723 total time=   1.5s
[CV 2/2] END .....max_depth=5, n_estimators=200;, score=0.779 total time=  43.0s
[CV 2/2] END learning_rate=0.01, max_depth=5, n_estimators=200;, score=0.945 total time= 5.2min
[CV 1/5] END ..max_depth=3, min_samples_split=5;, score=0.449 total time=   2.4s
[CV 1/5] END ..max_depth=5, min_samples_split=5;, score=0.438 total time=   2.6s
[CV 4/5] END ..max_depth=7, min_samples_split=2;, score=0.820 total time=   2.9s
[CV 1/2] END ..max_depth=3, min_samples_split=2;, score=0.529 total time=   1.3s
[CV 1/2] END ..max_depth=7, min_sa

In [46]:
# XGBoost
from xgboost import XGBClassifier

# Same search space as the Gradient Boosting cell, scored on recall for the
# positive (Malignant) class.
xgb_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
}
xgb = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    param_grid=xgb_params,
    cv=5,
    scoring='recall',
    verbose=3,
    n_jobs=-1,
)
xgb.fit(X_train_smote, y_train_smote)
save_model_and_params(xgb, 'xgboost')


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <FBD6AEF9-AFAB-39D7-B881-755157DA0497> /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file)"]


In [47]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, X_test, y_test):
    """Compute the standard binary-classification metrics for one model.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix to score.
        y_test: True labels aligned with ``X_test``.

    Returns:
        dict with keys 'Accuracy', 'Precision', 'Recall', 'F1-Score' (computed
        from the hard predictions) and 'AUC-ROC' (computed from the second
        column of ``predict_proba``).
    """
    hard_predictions = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Metrics that consume hard labels, in the order they appear in the report.
    label_metrics = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
    report = {label: metric(y_test, hard_predictions) for label, metric in label_metrics}
    report['AUC-ROC'] = roc_auc_score(y_test, positive_scores)
    return report

In [49]:
# Display name -> pickle file written by save_model_and_params.
model_files = {
    'Decision Tree': 'decision_tree_model.pkl',
    'Random Forest': 'random_forest_model.pkl',
    'Gradient Boosting': 'gradient_boosting_model.pkl',
    # 'XGBoost': 'xgboost_model.pkl',
}

# Load each model inside a context manager so the file handle is closed
# immediately instead of being left open until garbage collection.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook or another trusted source.
models = {}
for name, model_path in model_files.items():
    with open(model_path, 'rb') as f:
        models[name] = pickle.load(f)

# Score every model on the untouched (non-oversampled) test split.
results = {}
for name, model in models.items():
    results[name] = evaluate_model(model, X_test_processed, y_test)

results_df = pd.DataFrame(results).T
print("\nModel Performance Comparison:")
print(results_df)


Model Performance Comparison:
                   Accuracy  Precision    Recall  F1-Score   AUC-ROC
Decision Tree      0.828454   0.704025  0.453499  0.551651  0.696474
Random Forest      0.828595   0.704731  0.453431  0.551817  0.697837
Gradient Boosting  0.828595   0.704731  0.453431  0.551817  0.700439


In [50]:
def predict_example(sample_data):
    """Score one sample with every model in the module-level ``models`` dict.

    Args:
        sample_data: Raw feature rows accepted by ``preprocessor.transform``.
            Only the first row is scored; any further rows are ignored.

    Returns:
        dict mapping each model name to
        ``{'prediction': 'Malignant' | 'Benign', 'confidence': float}``.
        ``confidence`` is the probability taken from the second column of
        ``predict_proba`` (the Malignant class under the 0/1 target mapping),
        rounded to 3 decimals. It is not the probability of the predicted
        label, so a 'Benign' prediction carries a value below 0.5.
    """
    processed_data = preprocessor.transform(sample_data)
    predictions = {}
    for name, model in models.items():
        proba = model.predict_proba(processed_data)[0][1]
        predictions[name] = {
            'prediction': 'Malignant' if proba >= 0.5 else 'Benign',
            # Convert the numpy scalar to a built-in float so the result
            # prints and serializes as a plain number on every NumPy version.
            'confidence': float(round(proba, 3))
        }
    return predictions

# Smoke-test the prediction helper on one reproducibly drawn test row.
sample = X_test.sample(n=1, random_state=42)
print("\nSample Prediction:")
sample_predictions = predict_example(sample)
print(sample_predictions)



Sample Prediction:
{'Decision Tree': {'prediction': 'Benign', 'confidence': 0.058}, 'Random Forest': {'prediction': 'Benign', 'confidence': 0.263}, 'Gradient Boosting': {'prediction': 'Benign', 'confidence': 0.241}}
