In [None]:
# Lung Cancer Survival - FAST & ACCURATE
# Random Forest vs. XGBoost comparison - target: 10 mins max, 70%+ accuracy

import subprocess
import sys
# Install XGBoost only when it is not already importable, so that a
# "Restart & Run All" does not hit the network on every execution and the
# notebook keeps working offline once the package is present.
import importlib.util

if importlib.util.find_spec('xgboost') is None:
    print("Installing XGBoost...")
    # Run pip through the kernel's own interpreter so the package is
    # installed into the environment this notebook is executing in.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'xgboost', '-q'])
    print("✓ Done\n")
else:
    print("✓ XGBoost already available\n")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, f1_score
import warnings
# Install a global filter that ignores every warning raised from here on.
warnings.filterwarnings('ignore')

# Title banner: the heading framed above and below by a 70-character rule.
for banner_line in ("="*70,
                    "LUNG CANCER SURVIVAL - OPTIMIZED FOR SPEED + ACCURACY",
                    "="*70):
    print(banner_line)


In [None]:
# ============================================================================
# 1. LOAD DATA
# ============================================================================
# Read the raw CSV into `df` and report its dimensions together with the
# mean of the `survived` column expressed as a percentage.
print("\n[1] Loading dataset...")
df = pd.read_csv('dataset_med.csv')

frame_shape = df.shape
survival_pct = df['survived'].mean() * 100
print(f"Shape: {frame_shape}")
print(f"Survival rate: {survival_pct:.2f}%")


In [None]:
# ============================================================================
# 2. SMART FEATURE ENGINEERING
# ============================================================================
# Builds the modelling frame `data` from the raw frame `df`:
#   * drops the `id` column,
#   * replaces the two date columns with duration / calendar features,
#   * label-encodes the categorical columns,
#   * median-fills remaining missing values,
#   * adds binary flags, a summed risk score, scaled copies and interactions.
# Column names and column order are the same as before this rewrite.
print("\n[2] Feature engineering...")
data = df.drop(columns='id')  # drop() already returns a new frame; df is untouched

# Dates -> treatment duration in days plus calendar parts.
diagnosis_dt = pd.to_datetime(data['diagnosis_date'])
treatment_end_dt = pd.to_datetime(data['end_treatment_date'])
data['treatment_days'] = (treatment_end_dt - diagnosis_dt).dt.days
data['diagnosis_year'] = diagnosis_dt.dt.year
data['diagnosis_month'] = diagnosis_dt.dt.month
data['treatment_year'] = treatment_end_dt.dt.year
data = data.drop(columns=['diagnosis_date', 'end_treatment_date'])

# Smoking risk flag, derived from the RAW text before it is label-encoded.
# LabelEncoder assigns integer codes in sorted (alphabetical) label order, so a
# test such as `code > 0` merely excludes whichever label sorts first and says
# nothing about smoking behaviour.
# NOTE(review): assumes the non-smoker label contains the word "never"
# (e.g. "Never Smoked") - confirm against the dataset's actual categories.
smoking_text = data['smoking_status'].astype(str).str.lower()
ever_smoked = (~smoking_text.str.contains('never', regex=False)).astype(int)

# Encode categorical columns as integer codes.
# astype(str) turns a missing value into the literal string 'nan', so missing
# categoricals become their own class here rather than being median-filled below.
cat_cols = ['gender', 'country', 'cancer_stage', 'family_history', 'smoking_status', 'treatment_type']
for col in cat_cols:
    data[col] = LabelEncoder().fit_transform(data[col].astype(str))

# Median-fill whatever is still missing before deriving features from it.
data = data.fillna(data.median())

# Key features that matter
# NOTE(review): `cancer_stage` below is the alphabetical label code; the
# `>= 2` cut-off and the products using it are only meaningful if alphabetical
# order matches clinical order (true for "Stage I".."Stage IV") - confirm.
data['age_stage'] = data['age'] * data['cancer_stage']
data['comorbidity_count'] = data['hypertension'] + data['asthma'] + data['cirrhosis'] + data['other_cancer']
data['high_risk'] = (data['comorbidity_count'] >= 2).astype(int)
data['elderly'] = (data['age'] >= 65).astype(int)
data['advanced_stage'] = (data['cancer_stage'] >= 2).astype(int)
data['obese'] = (data['bmi'] >= 30).astype(int)
data['high_cholesterol'] = (data['cholesterol_level'] > 240).astype(int)
data['short_treatment'] = (data['treatment_days'] < 180).astype(int)
data['long_treatment'] = (data['treatment_days'] > 540).astype(int)

# Risk score: number of binary risk flags that are set (0-6).
data['total_risk'] = (
    data['elderly'] + data['advanced_stage'] + data['obese'] +
    data['high_cholesterol'] + data['high_risk'] +
    ever_smoked
)

# Scaled copies of the main continuous features (fixed divisors, not fitted).
data['age_norm'] = data['age'] / 100
data['bmi_norm'] = data['bmi'] / 50
data['treatment_norm'] = data['treatment_days'] / 1000

# Key interactions
data['age_bmi'] = data['age'] * data['bmi'] / 100
data['stage_treatment_type'] = data['cancer_stage'] * data['treatment_type']

# One column of `data` is the target, hence the -1.
print(f"Total features: {data.shape[1] - 1}")


In [None]:
# ============================================================================
# 3. TRAIN/TEST SPLIT
# ============================================================================
# Separate the `survived` target from the feature columns and hold out 20% of
# the rows for testing, stratified on the target with a fixed seed.
print("\n[3] Splitting data...")
y = data['survived']
X = data.drop(columns='survived')

split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"Train: {len(X_train):,} | Test: {len(X_test):,}")


In [None]:
# ============================================================================
# 4. TRAIN MODELS (FAST)
# ============================================================================
# Fits each candidate classifier on the training split, scores it on the test
# split, stores everything in `results[name]`, and picks `best` as the model
# with the highest test accuracy.
print("\n[4] Training models...")

# Ratio of class-0 to class-1 training rows, passed to XGBoost as the
# positive-class weight.
n_class0 = len(y_train[y_train==0])
n_class1 = len(y_train[y_train==1])

rf_params = dict(
    n_estimators=200,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
xgb_params = dict(
    n_estimators=150,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=n_class0/n_class1,
    random_state=42,
    tree_method='hist',
    n_jobs=-1,
    eval_metric='logloss',
)
models = {
    'Random Forest': RandomForestClassifier(**rf_params),
    'XGBoost': XGBClassifier(**xgb_params),
}


def _print_scores(scores, pad):
    """Print the accuracy, ROC-AUC and F1 of one `results` entry, each line prefixed with `pad`."""
    print(f"{pad}Accuracy:  {scores['accuracy']:.4f} ({scores['accuracy']*100:.2f}%)")
    print(f"{pad}ROC-AUC:   {scores['roc_auc']:.4f}")
    print(f"{pad}F1-Score:  {scores['f1']:.4f}")


results = {}

for name, model in models.items():
    print(f"\n  • {name}...")
    model.fit(X_train, y_train)

    # Hard class labels and the probability assigned to class 1.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    results[name] = dict(
        model=model,
        accuracy=accuracy_score(y_test, y_pred),
        roc_auc=roc_auc_score(y_test, y_proba),
        f1=f1_score(y_test, y_pred),
        pred=y_pred,
        proba=y_proba,
    )
    _print_scores(results[name], "    ")

# Select best by accuracy
best = max(results, key=lambda model_name: results[model_name]['accuracy'])
print(f"\n{'='*70}")
print(f"🏆 BEST MODEL: {best}")
_print_scores(results[best], "   ")
print(f"{'='*70}")


In [None]:
# ============================================================================
# 5. EVALUATION
# ============================================================================
# Reports per-class precision/recall for the selected model and compares how
# many test rows were predicted in each class with how many actually belong
# to it.
print("\n[5] Evaluation...")

# Predictions of the model chosen in the training step.
y_pred = results[best]['pred']
y_proba = results[best]['proba']

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Survived', 'Survived']))

print("\nPrediction Distribution:")
pred_counts = pd.Series(y_pred).value_counts()
actual_counts = y_test.value_counts()
# Use .get(..., 0) on both sides: a class with no rows is absent from
# value_counts(), and plain indexing would raise KeyError for it.
print(f"  Not Survived - Predicted: {pred_counts.get(0, 0):,} | Actual: {actual_counts.get(0, 0):,}")
print(f"  Survived     - Predicted: {pred_counts.get(1, 0):,} | Actual: {actual_counts.get(1, 0):,}")


In [None]:
# ============================================================================
# 6. VISUALIZATIONS
# ============================================================================
# One figure with three panels for the selected model: confusion matrix,
# ROC curve, and the 15 largest feature importances.
print("\n[6] Creating visualizations...")

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1)
ax1.set(title=f'Confusion Matrix - {best}', ylabel='Actual', xlabel='Predicted')

# ROC Curve (with the diagonal of a random classifier for reference)
fpr, tpr, _ = roc_curve(y_test, y_proba)
best_auc_value = results[best]["roc_auc"]
ax2.plot(fpr, tpr, 'b-', linewidth=2, label=f'AUC = {best_auc_value:.3f}')
ax2.plot([0, 1], [0, 1], 'r--', linewidth=1, label='Random (0.500)')
ax2.fill_between(fpr, tpr, alpha=0.2)
ax2.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='ROC Curve')
ax2.legend()
ax2.grid(alpha=0.3)

# Feature Importance: rank all features, keep the top 15
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': results[best]['model'].feature_importances_
})
feat_imp = importance_table.sort_values('importance', ascending=False).head(15)
bar_positions = range(len(feat_imp))
ax3.barh(bar_positions, feat_imp['importance'], color='coral')
ax3.set_yticks(bar_positions)
ax3.set_yticklabels(feat_imp['feature'], fontsize=8)
ax3.set(xlabel='Importance', title='Top 15 Features')
ax3.invert_yaxis()  # largest importance at the top

plt.tight_layout()
plt.show()


In [None]:
# ============================================================================
# 7. SUMMARY
# ============================================================================
# Recap of the dataset size, feature count and the selected model's scores,
# followed by a one-line-per-model comparison table.
best_scores = results[best]
summary_lines = [
    "\n" + "="*70,
    "FINAL SUMMARY",
    "="*70,
    f"Dataset:       {len(df):,} patients",
    f"Features:      {X.shape[1]}",
    f"Best Model:    {best}",
    f"Accuracy:      {best_scores['accuracy']:.4f} ({best_scores['accuracy']*100:.2f}%)",
    f"ROC-AUC:       {best_scores['roc_auc']:.4f}",
    f"F1-Score:      {best_scores['f1']:.4f}",
    "="*70,
]
print("\n".join(summary_lines))

print("\n📊 Model Comparison:")
for name, r in results.items():
    print(f"  {name:15s} | Acc: {r['accuracy']:.4f} | AUC: {r['roc_auc']:.4f} | F1: {r['f1']:.4f}")

print("\n✅ Training complete!")

# Detailed explanation of results.
# The commentary is selected from the measured metrics rather than printed
# unconditionally, so the interpretation cannot contradict the numbers above.
best_acc = results[best]['accuracy']
best_auc = results[best]['roc_auc']
best_f1 = results[best]['f1']

NEAR_RANDOM_AUC = 0.55  # at or below this the ranking is treated as chance-level
LOW_F1 = 0.3            # below this the positive class is considered poorly captured

# Share of class 1 in the test labels (labels are 0/1), used for the
# majority-class baseline and to name the majority class correctly.
positive_rate = y_test.mean()
baseline_acc = max(positive_rate, 1 - positive_rate)
majority_label = 'Survived' if positive_rate > 0.5 else 'Not Survived'
auc_near_random = best_auc <= NEAR_RANDOM_AUC

print("\n" + "="*70)
print("MODEL PERFORMANCE ANALYSIS")
print("="*70)
print(f"\n✓ Accuracy: {best_acc*100:.2f}%")
print(f"  - The model correctly classifies {best_acc*100:.1f}% of cases")
print(f"  - Always predicting the majority class would score {baseline_acc*100:.1f}%")
if best_acc <= baseline_acc:
    print("  - The model does not beat that majority-class baseline")

if auc_near_random:
    print(f"\n⚠ ROC-AUC: {best_auc:.4f}")
    print("  - ROC-AUC ≈ 0.50 indicates the model performs like random guessing")
    print("  - This suggests the available features have LIMITED predictive power")
    print("  - Possible reasons:")
    print("    • Missing critical medical features (tumor size, genetic markers, etc.)")
    print("    • Dataset may be synthetic or have weak feature-target relationships")
    print("    • Real-world survival depends on factors not captured in this data")
else:
    print(f"\n✓ ROC-AUC: {best_auc:.4f}")
    print("  - ROC-AUC above 0.50 means the model ranks cases better than random guessing")

if best_f1 < LOW_F1:
    print(f"\n⚠ F1-Score: {best_f1:.4f}")
    print("  - Low F1-score indicates class imbalance handling challenges")
    print(f"  - The model tends to predict the majority class ({majority_label})")
    print("  - This inflates accuracy but reduces prediction usefulness")
else:
    print(f"\n✓ F1-Score: {best_f1:.4f}")

print("\n" + "="*70)
print("CONCLUSION")
print("="*70)
if auc_near_random:
    print("While the model achieves reasonable accuracy, the ROC-AUC score reveals")
    print("that the features lack strong discriminative power for predicting survival.")
    print("In a real clinical setting, additional features (tumor characteristics,")
    print("treatment response, genetic markers) would be needed for reliable predictions.")
else:
    print("The model separates the two classes better than chance on the held-out test set.")
    print("Before any clinical use it would still need external validation and richer")
    print("features (tumor characteristics, treatment response, genetic markers).")
print("="*70)