In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight

In [8]:
data_path = "/content/thyroid_dataset.csv"
df = pd.read_csv(data_path)

print("Loaded:", data_path)
print("Shape:", df.shape)
df.head()

Loaded: /content/thyroid_dataset.csv
Shape: (383, 17)


Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [9]:
print("Columns:", list(df.columns))
print("Missing per column:", df.isna().sum())
print("Target value counts (Recurred):", df['Recurred'].value_counts(dropna=False))
print("Unique:", df['Recurred'].unique())

Columns: ['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred']
Missing per column: Age                     0
Gender                  0
Smoking                 0
Hx Smoking              0
Hx Radiothreapy         0
Thyroid Function        0
Physical Examination    0
Adenopathy              0
Pathology               0
Focality                0
Risk                    0
T                       0
N                       0
M                       0
Stage                   0
Response                0
Recurred                0
dtype: int64
Target value counts (Recurred): Recurred
No     275
Yes    108
Name: count, dtype: int64
Unique: ['No' 'Yes']


In [10]:
if df['Recurred'].dtype == object:
    df['Recurred'] = df['Recurred'].map(
        lambda x: 1 if str(x).lower() in ['yes', 'y', '1', 'true'] else 0
    )

df = df.dropna(subset=['Recurred'])



In [11]:
numeric_cols = []
categorical_cols = []

for col in df.columns:
    if col == 'Recurred':
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        numeric_cols.append(col)
    else:
        categorical_cols.append(col)

print("Numeric:", numeric_cols)
print("Categorical:", categorical_cols)

Numeric: ['Age']
Categorical: ['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response']


In [12]:
X = df.drop('Recurred', axis=1)
y = df['Recurred']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train/Test:", X_train.shape, X_test.shape)

Train/Test: (306, 16) (77, 16)


In [15]:
numeric_t = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_t = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_t, numeric_cols),
        ('cat', categorical_t, categorical_cols)
    ]
)

preprocessor.fit(X_train)
X_train_p = preprocessor.transform(X_train)
X_test_p = preprocessor.transform(X_test)

In [16]:
classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights = {int(classes[i]): weights[i] for i in range(len(classes))}

print("Class Weights:", class_weights)

Class Weights: {0: np.float64(0.6954545454545454), 1: np.float64(1.7790697674418605)}


In [18]:
lr = LogisticRegression(class_weight=class_weights, max_iter=1000)
lr.fit(X_train_p, y_train)

y_pred_lr = lr.predict(X_test_p)
y_proba_lr = lr.predict_proba(X_test_p)[:, 1]
print("Logistic Regression Results")
print(classification_report(y_test, y_pred_lr))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_lr))

Logistic Regression Results
              precision    recall  f1-score   support

           0       0.96      0.98      0.97        55
           1       0.95      0.91      0.93        22

    accuracy                           0.96        77
   macro avg       0.96      0.95      0.95        77
weighted avg       0.96      0.96      0.96        77

ROC-AUC: 0.9917355371900826


In [19]:
xgb = XGBClassifier(
    max_depth=6,
    n_estimators=300,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
    scale_pos_weight=class_weights[1]
)

xgb.fit(X_train_p, y_train)

y_pred_xgb = xgb.predict(X_test_p)
y_proba_xgb = xgb.predict_proba(X_test_p)[:, 1]

print("XGBoost Results")
print(classification_report(y_test, y_pred_xgb))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_xgb))
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_xgb))

XGBoost Results
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        55
           1       0.95      0.95      0.95        22

    accuracy                           0.97        77
   macro avg       0.97      0.97      0.97        77
weighted avg       0.97      0.97      0.97        77

ROC-AUC: 0.9950413223140495
Confusion Matrix: [[54  1]
 [ 1 21]]


In [20]:
joblib.dump(preprocessor, "preprocessor.joblib")
joblib.dump(xgb, "thyroid_recurrence_xgb_model.joblib")

print("Saved preprocessor and model successfully!")

Saved preprocessor and model successfully!
