In [None]:
# Thyroid Cancer Recurrence Prediction System
# Clean, Production-Ready Code - No Data Leakage

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

# Run banner: the notebook title framed above and below by a 70-character rule.
banner_rule = "=" * 70
for banner_line in (banner_rule, "THYROID CANCER RECURRENCE PREDICTION SYSTEM", banner_rule):
    print(banner_line)

In [None]:
# ========================
# 1. LOAD DATA
# ========================
# Interactive upload through the Colab helper module. The returned mapping is
# keyed by file name, and that name is what gets handed to pandas below.
from google.colab import files
print("\n Upload your thyroid cancer dataset (CSV):")
uploaded = files.upload()

# An empty mapping (for example, the upload dialog was dismissed) would otherwise
# surface as a bare IndexError; fail with an actionable message instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Only the first uploaded file is read; any additional files are ignored.
uploaded_name = next(iter(uploaded))
df = pd.read_csv(uploaded_name)
print(f"\n Loaded: {df.shape[0]} samples, {df.shape[1]} features")


In [None]:
# ========================
# 2. REMOVE DATA LEAKAGE
# ========================
# Columns the author treats as being recorded during/after recurrence detection.
# NOTE(review): the clinical timing of these fields cannot be verified from this
# notebook - confirm against the dataset's data dictionary.
LEAKY_FEATURES = ['Response', 'Stage', 'T', 'N', 'M']

# Work out once which of the listed columns exist in the loaded frame, then use
# that same list for both the log line and the drop.
leaky_present = [col for col in LEAKY_FEATURES if col in df.columns]

print(f"\n  Removing leaky features: {leaky_present}")
df_clean = df.drop(columns=leaky_present)


In [None]:
# ========================
# 3. PREPROCESS
# ========================
# Everything that is *learned* from the data (imputation values, scaling
# statistics) is fitted on the training split only, so held-out rows never
# influence preprocessing.
TARGET_COL = 'Recurred'
if TARGET_COL not in df_clean.columns:
    raise KeyError(
        f"Target column '{TARGET_COL}' not found; "
        f"available columns: {list(df_clean.columns)}"
    )

# Rows without an outcome cannot supervise a model; filling the label with the
# most frequent class would fabricate outcomes, so such rows are dropped.
df_clean = df_clean.dropna(subset=[TARGET_COL]).copy()

# Encode categorical variables
# One fitted encoder per column is kept in `label_encoders` so each mapping can
# be inspected or inverted later. Missing cells stay NaN (rather than becoming a
# literal 'nan' category) and are imputed after the split.
label_encoders = {}
le = LabelEncoder()  # stays unfitted only when there are no text columns
for col in df_clean.select_dtypes(include=['object']).columns:
    observed = df_clean[col].notna().to_numpy()
    codes = np.full(len(df_clean), np.nan)
    if observed.any():
        le = LabelEncoder()
        codes[observed] = le.fit_transform(df_clean.loc[observed, col].astype(str))
        label_encoders[col] = le
    df_clean[col] = codes

# Split features and target
X = df_clean.drop(TARGET_COL, axis=1)
y = df_clean[TARGET_COL]
if TARGET_COL in label_encoders:
    # Codes were stored as floats to leave room for NaN in feature columns;
    # the target has no missing values left, so restore integer class labels.
    y = y.astype(int)

print(f" Clean dataset: {X.shape[1]} features")
print(f"   Recurrence rate: {y.mean()*100:.1f}%")

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle missing values
# Per-column modes come from the training rows only and are then applied to
# every split, so test rows contribute nothing to the fill values.
train_modes = X_train.mode().iloc[0]
X_train = X_train.fillna(train_modes)
X_test = X_test.fillna(train_modes)
# Keep the full-frame views NaN-free as well, still using train-only statistics.
X = X.fillna(train_modes)
df_clean = df_clean.fillna(train_modes)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# ========================
# 4. TRAIN MODELS
# ========================
print("\n" + "="*70)
print("MODEL TRAINING")
print("="*70)

# Candidate classifiers, keyed by the display name used in logs and plots.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, C=0.1),
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=8,
                                           min_samples_split=10, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=200, max_depth=4,
                                                    learning_rate=0.05, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.05,
                            random_state=42, n_jobs=-1, eval_metric='logloss')
}

# results[name] holds the fitted model, its test-set predictions/probabilities,
# and its test and cross-validated scores.
results = {}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"\n Training {name}...")

    # Fit on the scaled training split and score once on the held-out test split.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    auc = roc_auc_score(y_test, y_pred_proba)
    acc = accuracy_score(y_test, y_pred)

    # Cross-validate on the *unscaled* training frame with the scaler inside the
    # pipeline, so scaling statistics are re-fitted within each fold instead of
    # being computed on rows that later serve as that fold's validation set.
    # cross_val_score works on clones, so the fitted `model` above is untouched.
    cv_scores = cross_val_score(make_pipeline(StandardScaler(), model), X_train, y_train,
                               cv=skf, scoring='roc_auc', n_jobs=-1)

    results[name] = {
        'model': model,
        'y_pred': y_pred,
        'y_proba': y_pred_proba,
        'auc': auc,
        'accuracy': acc,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std()
    }

    print(f"   Test AUC: {auc:.4f} | Accuracy: {acc:.4f}")
    print(f"   CV AUC: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")


In [None]:
# ========================
# 5. ENSEMBLE MODEL
# ========================
print("\n Creating Ensemble (Voting Classifier)...")

# Soft-voting ensemble over the three tree-based candidates.
ensemble = VotingClassifier(
    estimators=[
        ('rf', models['Random Forest']),
        ('gb', models['Gradient Boosting']),
        ('xgb', models['XGBoost'])
    ],
    voting='soft',
    n_jobs=-1
)

ensemble.fit(X_train_scaled, y_train)
y_pred_ensemble = ensemble.predict(X_test_scaled)
y_proba_ensemble = ensemble.predict_proba(X_test_scaled)[:, 1]

auc_ensemble = roc_auc_score(y_test, y_proba_ensemble)
acc_ensemble = accuracy_score(y_test, y_pred_ensemble)

# Cross-validate on the unscaled training frame with the scaler inside the
# pipeline, so each fold's validation rows never feed the scaling statistics.
cv_ensemble = cross_val_score(make_pipeline(StandardScaler(), ensemble), X_train, y_train,
                              cv=skf, scoring='roc_auc', n_jobs=-1)

results['Ensemble'] = {
    'model': ensemble,
    'y_pred': y_pred_ensemble,
    'y_proba': y_proba_ensemble,
    'auc': auc_ensemble,
    'accuracy': acc_ensemble,
    'cv_mean': cv_ensemble.mean(),
    'cv_std': cv_ensemble.std()
}

print(f"   Test AUC: {auc_ensemble:.4f} | Accuracy: {acc_ensemble:.4f}")
print(f"   CV AUC: {cv_ensemble.mean():.4f} (±{cv_ensemble.std():.4f})")

# Select best model
# Selection uses the cross-validated AUC, which is computed from training data
# only. Picking the winner by test AUC and then reporting that same test AUC
# would make the reported figure optimistically biased.
best_name = max(results, key=lambda x: results[x]['cv_mean'])
best_model = results[best_name]

print(f"\n BEST MODEL: {best_name}")
print(f"   Selected by CV AUC: {best_model['cv_mean']:.4f} (±{best_model['cv_std']:.4f})")
print(f"   AUC: {best_model['auc']:.4f}")


In [None]:
# ========================
# 6. EVALUATION
# ========================
# Per-class precision/recall/F1 for the selected model's test-set predictions.
section_rule = "=" * 70
print("\n" + section_rule)
print("CLASSIFICATION REPORT")
print(section_rule)

class_labels = ['No Recurrence', 'Recurrence']
report_text = classification_report(y_test, best_model['y_pred'],
                                    target_names=class_labels)
print(report_text)


In [None]:
# ========================
# 7. VISUALIZATIONS
# ========================
# 2x2 dashboard: AUC comparison, confusion matrix, ROC curves, and the most
# influential features of the selected model.
print("\n" + "="*70)
print("GENERATING VISUALIZATIONS")
print("="*70)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Model Comparison
ax1 = axes[0, 0]
model_names = list(results.keys())
auc_scores = [results[m]['auc'] for m in model_names]
# The selected model is highlighted in green; the others share one blue.
colors = ['#27ae60' if m == best_name else '#3498db' for m in model_names]
bars = ax1.barh(model_names, auc_scores, color=colors, alpha=0.8)
ax1.set_xlabel('AUC Score', fontsize=12, fontweight='bold')
ax1.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax1.set_xlim([0.5, 1.0])
ax1.grid(axis='x', alpha=0.3)
# Annotate each bar with its score, just to the right of the bar end.
for i, score in enumerate(auc_scores):
    ax1.text(score + 0.01, i, f'{score:.4f}', va='center', fontweight='bold')

# 2. Confusion Matrix
ax2 = axes[0, 1]
cm = confusion_matrix(y_test, best_model['y_pred'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2,
           cbar_kws={'label': 'Count'}, annot_kws={'size': 16, 'weight': 'bold'})
ax2.set_title(f'Confusion Matrix - {best_name}', fontsize=14, fontweight='bold')
ax2.set_ylabel('Actual', fontsize=12, fontweight='bold')
ax2.set_xlabel('Predicted', fontsize=12, fontweight='bold')
ax2.set_xticklabels(['No Recurrence', 'Recurrence'])
ax2.set_yticklabels(['No Recurrence', 'Recurrence'])

# 3. ROC Curves
ax3 = axes[1, 0]
for name in model_names:
    fpr, tpr, _ = roc_curve(y_test, results[name]['y_proba'])
    auc = results[name]['auc']
    # The selected model is drawn thicker and solid so it stands out.
    lw = 3 if name == best_name else 1.5
    ls = '-' if name == best_name else '--'
    ax3.plot(fpr, tpr, label=f'{name} (AUC={auc:.3f})',
            linewidth=lw, linestyle=ls)
ax3.plot([0, 1], [0, 1], 'k--', linewidth=1, alpha=0.5)  # chance diagonal
ax3.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
ax3.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
ax3.set_title('ROC Curves', fontsize=14, fontweight='bold')
ax3.legend(loc='lower right', fontsize=9)
ax3.grid(alpha=0.3)

# 4. Feature Importance
ax4 = axes[1, 1]
if best_name != 'Ensemble':
    base_model = best_model['model']
    importance_source = best_name
else:
    # The voting ensemble exposes no importances of its own, so its Gradient
    # Boosting member is shown instead. It is looked up by name rather than by
    # position, and the title says which model the numbers come from.
    base_model = best_model['model'].named_estimators_['gb']
    importance_source = 'Ensemble (Gradient Boosting member)'

if hasattr(base_model, 'feature_importances_'):
    importances = np.asarray(base_model.feature_importances_)
    importance_label = 'Importance'
elif hasattr(base_model, 'coef_'):
    # Linear models: use coefficient magnitude as the importance proxy. The
    # models were fitted on standardized inputs, so magnitudes are comparable.
    importances = np.abs(np.atleast_2d(base_model.coef_)).mean(axis=0)
    importance_label = '|Coefficient| (standardized features)'
else:
    importances = None

if importances is not None and len(importances) > 0:
    indices = np.argsort(importances)[-10:]
    top_values = importances[indices]
    peak = top_values.max()
    # Normalise to [0, 1] for the colormap; skip the division when all are zero.
    colors_fi = plt.cm.viridis((top_values / peak) if peak > 0 else top_values)
    ax4.barh(range(len(indices)), top_values, color=colors_fi)
    ax4.set_yticks(range(len(indices)))
    ax4.set_yticklabels([X.columns[i] for i in indices])
    ax4.set_xlabel(importance_label, fontsize=12, fontweight='bold')
    ax4.set_title(f'Top {len(indices)} Features - {importance_source}',
                  fontsize=14, fontweight='bold')
    ax4.grid(axis='x', alpha=0.3)
else:
    # Say so explicitly instead of leaving an unexplained blank panel.
    ax4.text(0.5, 0.5, 'Feature importances not available\nfor this model',
             ha='center', va='center', transform=ax4.transAxes, fontsize=12)
    ax4.set_axis_off()

plt.tight_layout()
plt.show()


In [None]:
# ========================
# 8. SUMMARY
# ========================
# Text recap of the dataset, the selected model's scores, and its top features.
print("\n" + "="*70)
print("FINAL SUMMARY")
print("="*70)

print(f"\n Dataset:")
print(f"   Samples: {len(df_clean)} | Features: {X.shape[1]}")
print(f"   Recurrence Rate: {y.mean()*100:.1f}%")
print(f"   Train/Test: {len(y_train)}/{len(y_test)}")

print(f"\n Best Model: {best_name}")
print(f"   Test AUC: {best_model['auc']:.4f}")
print(f"   Test Accuracy: {best_model['accuracy']:.4f}")
print(f"   CV AUC: {best_model['cv_mean']:.4f} (±{best_model['cv_std']:.4f})")

print(f"\n💡 Top Predictive Features:")
# Resolve the model to inspect here, so this cell does not rely on a variable
# that only exists as a side effect of the plotting cell.
summary_model = best_model['model']
if best_name == 'Ensemble':
    # The voting ensemble has no importances of its own; report those of its
    # Gradient Boosting member and say so.
    summary_model = summary_model.named_estimators_['gb']
    print("   (from the ensemble's Gradient Boosting member)")

if hasattr(summary_model, 'feature_importances_'):
    feature_scores = np.asarray(summary_model.feature_importances_)
elif hasattr(summary_model, 'coef_'):
    # Linear models: coefficient magnitude serves as the importance proxy.
    feature_scores = np.abs(np.atleast_2d(summary_model.coef_)).mean(axis=0)
else:
    feature_scores = None

if feature_scores is None:
    # Avoid printing a header followed by nothing.
    print("   (not available for this model type)")
else:
    top_idx = np.argsort(feature_scores)[-5:][::-1]
    for i, idx in enumerate(top_idx, 1):
        feat_name = X.columns[idx]
        importance = feature_scores[idx]
        print(f"   {i}. {feat_name}: {importance:.4f}")

print("\n" + "="*70)
print("="*70)