### Import required libraries

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib

### Load Dataset

In [2]:
df = pd.read_csv('dataset.csv')

In [7]:
df.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


### Display dataset info

In [3]:
print(f"Dataset shape: {df.shape}")
print("\nColumns:", df.columns.tolist())
print("\nMissing values:\n", df.isnull().sum())

Dataset shape: (383, 17)

Columns: ['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred']

Missing values:
 Age                     0
Gender                  0
Smoking                 0
Hx Smoking              0
Hx Radiothreapy         0
Thyroid Function        0
Physical Examination    0
Adenopathy              0
Pathology               0
Focality                0
Risk                    0
T                       0
N                       0
M                       0
Stage                   0
Response                0
Recurred                0
dtype: int64


### Handle missing values

In [4]:
for col in df.columns:
    if df[col].isnull().any():
        if df[col].dtype == 'object':
            df[col] = df[col].fillna(df[col].mode()[0])
        else:
            df[col] = df[col].fillna(df[col].median())

### Print exact column names for verification

In [15]:
print("Actual columns in dataset:")
print([col for col in df.columns])

Actual columns in dataset:
['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred']


### Check for required columns

In [16]:
required_columns = ['Hx Radiotherapy', 'Recurred'] + categorical_features + numerical_features
missing_columns = [col for col in required_columns if col not in df.columns]

In [17]:

if missing_columns:
    print(f"\n⚠️ MISSING COLUMNS: {missing_columns}")
    print("Possible fixes:")
    print("1. Check column names in your CSV file")
    print("2. Try alternative names (case-insensitive search):")
    
    # Try to find similar columns
    for col in missing_columns:
        matches = [c for c in df.columns if col.lower() in c.lower()]
        print(f"   - For '{col}': found similar {matches}")
    
    # Suggest renaming
    print("\n3. Rename columns in code if necessary:")
    print("   categorical_features = [actual_column_name, ...]")
else:
    print("\n✅ All required columns present")


⚠️ MISSING COLUMNS: ['Hx Radiotherapy', 'Hx Radiotherapy', 'I']
Possible fixes:
1. Check column names in your CSV file
2. Try alternative names (case-insensitive search):
   - For 'Hx Radiotherapy': found similar []
   - For 'Hx Radiotherapy': found similar []
   - For 'I': found similar ['Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 'Physical Examination', 'Focality', 'Risk']

3. Rename columns in code if necessary:
   categorical_features = [actual_column_name, ...]


### Fix column name typos

In [24]:
df = df.rename(columns={
    'Hx Radiothreapy': 'Hx Radiotherapy',  # Fix spelling
})

In [26]:
print("Corrected columns:", df.columns.tolist())

Corrected columns: ['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiotherapy', 'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred']


### Preprocessing setup
#### Separate features and target

In [27]:
X = df.drop('Recurred', axis=1)
y = df['Recurred']

### Identify feature types

In [28]:
categorical_features = ['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiotherapy', 
                        'Thyroid Function', 'Physical Examination', 'Adenopathy',
                        'Pathology', 'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response']

In [29]:
numerical_features = ['Age']

### Create preprocessing pipeline with missing value handling

In [30]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

### Create model pipeline with Gradient Boosting 

In [31]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=5,
        random_state=42,
        subsample=0.8
    ))
])


### Split data

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Train model

In [33]:
model.fit(X_train, y_train)

### Evaluate model

In [34]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

In [35]:
print("Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"AUC Score: {roc_auc_score(y_test, y_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Model Evaluation:
Accuracy: 0.9610
AUC Score: 0.9793

Classification Report:
              precision    recall  f1-score   support

          No       0.95      1.00      0.97        55
         Yes       1.00      0.86      0.93        22

    accuracy                           0.96        77
   macro avg       0.97      0.93      0.95        77
weighted avg       0.96      0.96      0.96        77



### Export model for desktop application

In [36]:
joblib.dump(model, 'thyroid_recurrence_model.pkl')
print("Model exported as 'thyroid_recurrence_model.pkl'")

Model exported as 'thyroid_recurrence_model.pkl'


### Create sample input for testing

In [38]:
sample_data = {
    'Age': [45],
    'Gender': ['female'],
    'Smoking': ['no'],
    'Hx Smoking': ['former'],
    'Hx Radiotherapy': ['no'],
    'Thyroid Function': ['normal'],
    'Physical Examination': ['normal'],
    'Adenopathy': ['no'],
    'Pathology': ['papillary'],
    'Focality': ['unifocal'],
    'Risk': ['low'],
    'T': ['T1'],
    'N': ['N0'],
    'M': ['M0'],
    'Stage': ['I'],
    'Response': ['excellent']
}

In [39]:
sample_df = pd.DataFrame(sample_data)
prediction = model.predict(sample_df)[0]
probability = model.predict_proba(sample_df)[0][1]
print(f"\nSample prediction: {'Recurrence' if prediction == 1 else 'No recurrence'}")
print(f"Recurrence probability: {probability:.2%}")


Sample prediction: No recurrence
Recurrence probability: 0.74%
