# In [None]:
# Lung Cancer Risk Analysis: Comprehensive Python Notebook
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, 
                             classification_report)
from xgboost import XGBClassifier
from pycaret.classification import *
import warnings
warnings.filterwarnings('ignore')

# Load the Dataset
# Read the CSV into `df`, then print a quick structural overview: shape,
# a preview of the first rows, per-column dtypes and per-column null counts.
df = pd.read_csv('/kaggle/input/lung-cancer-risk-dataset/lung_cancer_risk.csv')

print(f"Dataset Shape: {df.shape}")

print("\nFirst 5 rows:")
# `display` is not imported in this file; presumably the IPython/Jupyter built-in.
display(df.head())

for overview_title, overview_table in (
    ("\nColumn Names and Data Types:", df.dtypes),
    ("\nMissing Values:", df.isnull().sum()),
):
    print(overview_title)
    print(overview_table)

# Handle Missing Values
# Work on a copy so the raw `df` stays untouched for reference.
df_clean = df.copy()

# Numeric columns: impute with the column median.
# The result is assigned back to the frame instead of calling
# fillna(..., inplace=True) on df_clean[col]: that chained in-place form acts
# on an intermediate object, emits a FutureWarning in pandas 2.x and no longer
# updates the frame once Copy-on-Write is enabled.
num_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
for col in num_cols:
    if df_clean[col].isnull().any():
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Categorical (object-dtype) columns: impute with the most frequent value.
cat_cols = df_clean.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
    if df_clean[col].isnull().any():
        col_modes = df_clean[col].mode()
        # mode() is empty when every value in the column is missing; leave
        # such a column as-is instead of failing on the first-element lookup.
        if not col_modes.empty:
            df_clean[col] = df_clean[col].fillna(col_modes.iloc[0])

print("Missing values after treatment:")
print(df_clean.isnull().sum())

# Data Visualization
# Target distribution: pie chart showing the share of each `lung_cancer` class.
target_counts = df_clean['lung_cancer'].value_counts()

pie_fig, pie_ax = plt.subplots(figsize=(10, 6))
pie_ax.pie(
    target_counts,
    labels=target_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('Distribution of Lung Cancer Cases')
plt.show()

# Interactive histograms with Altair
# Binned age histogram, coloured by lung-cancer status.
# The chart is bound to a name and displayed explicitly: a bare chart
# expression is only rendered when it is the last expression of a notebook
# cell, so in the middle of a cell (or in a script) it is silently discarded.
# NOTE(review): Altair may refuse very large frames by default (row limit) —
# confirm the dataset size if this raises MaxRowsError.
age_hist_chart = alt.Chart(df_clean).mark_bar().encode(
    alt.X('age:Q', bin=True),
    alt.Y('count()'),
    color='lung_cancer:N'
).properties(width=300, height=200, title='Age Distribution by Lung Cancer Status').interactive()
display(age_hist_chart)

# Boxplots grouped by cancer status
# One box of `age` per `lung_cancer` class.
box_fig, box_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_clean, x='lung_cancer', y='age', ax=box_ax)
box_ax.set_title('Age Distribution by Lung Cancer Status')
plt.show()

# Correlation matrix
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap with the colour scale centred on zero.
numeric_df = df_clean.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

heat_fig, heat_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=heat_ax)
heat_ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

# Exploratory Data Analysis
cancer_status = df_clean['lung_cancer']

# Crosstab: Gender vs Lung Cancer (absolute counts, with "All" margins)
gender_cancer = pd.crosstab(df_clean['gender'], cancer_status, margins=True)
print("Gender vs Lung Cancer Crosstab:")
print(gender_cancer)

# Smoking status vs Lung Cancer, as row percentages (each row sums to 100)
smoking_cancer = pd.crosstab(
    df_clean['smoking'], cancer_status, normalize='index'
).mul(100)
print("\nSmoking Status vs Lung Cancer (%):")
print(smoking_cancer)

# Statistical patterns: mean of selected columns per lung-cancer class
for stat_heading, stat_column in (
    ("\nAverage age by lung cancer status:", 'age'),
    ("\nAverage pack years by lung cancer status:", 'pack_years'),
):
    print(stat_heading)
    print(df_clean.groupby('lung_cancer')[stat_column].mean())

# Feature Engineering
# Create age groups.
# Bins are left-closed -- [0, 40), [40, 60), [60, 80), [80, inf) -- so the
# intervals agree with their labels. With pd.cut's default right-closed bins
# an age of exactly 40 was labelled '<40', an age of 0 fell outside the first
# bin, and ages above 100 became NaN despite the open-ended '80+' label.
df_clean['age_group'] = pd.cut(df_clean['age'], bins=[0, 40, 60, 80, np.inf],
                               right=False,
                               labels=['<40', '40-60', '60-80', '80+'])

# Create heavy smoker flag: 1 when pack_years exceeds 20, otherwise 0
# (a missing pack_years compares False and therefore yields 0).
# The dtype is pinned to int64 so the flag does not depend on the platform's
# default integer width.
df_clean['heavy_smoker'] = (df_clean['pack_years'] > 20).astype('int64')

# Create family risk flag: 1 when family_history equals 'Yes', otherwise 0.
# NOTE(review): assumes family_history is encoded as the string 'Yes' --
# confirm against the dataset; any other encoding makes this flag all zeros.
df_clean['family_risk'] = (df_clean['family_history'] == 'Yes').astype('int64')

# Calculate risk score (example formula): one point per decade of age, one
# point per 10 pack-years, plus 5 points for a positive family history.
df_clean['risk_score'] = (df_clean['age'] / 10) + (df_clean['pack_years'] / 10) + \
                         (df_clean['family_risk'] * 5)

print("New features sample:")
display(df_clean[['age_group', 'heavy_smoker', 'family_risk', 'risk_score']].head())

# Prepare data for modeling
# Features are every column except the target; the target is `lung_cancer`.
X = df_clean.drop('lung_cancer', axis=1)
y = df_clean['lung_cancer']

# Identify numeric and categorical features.
# ColumnTransformer drops every column that is not listed in one of its
# transformers, so the selectors must cover all usable dtypes: any numeric
# width (not just int64/float64) and pandas 'category' columns (such as the
# `age_group` column produced by pd.cut) in addition to 'object'.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Preprocessing pipelines
# Numeric: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Split the data: 80/20 hold-out, stratified on the target so both splits
# keep the same class proportions; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42,
                                                    stratify=y)

# Model Training with PyCaret
# Run the experiment on the training rows only. Passing the whole `df_clean`
# would let PyCaret (and finalize_model, which refits on all data given to
# setup) see the rows of X_test, inflating any metric later computed on that
# hold-out split.
pycaret_train_data = df_clean.loc[X_train.index]

# `low_variance_threshold=0` (drop features that are constant in the training
# set) replaces the `ignore_low_variance` flag, which is not accepted by the
# PyCaret 3 `setup` signature; the rest of this file relies on PyCaret 3
# output columns ('prediction_label' / 'prediction_score').
pycaret_setup = setup(data=pycaret_train_data, target='lung_cancer', session_id=42,
                      normalize=True, transformation=True, low_variance_threshold=0,
                      remove_multicollinearity=True, multicollinearity_threshold=0.9)

# Compare models: keep the three best candidates ranked by AUC.
best_model = compare_models(sort='AUC', n_select=3)
print("Top 3 models based on AUC:")
print(best_model)

# Create and tune the best model.
# compare_models normally returns a list when n_select > 1; normalise to a
# list so the top-ranked model can be picked even if a single estimator
# comes back.
top_models = best_model if isinstance(best_model, list) else [best_model]
tuned_best_model = tune_model(top_models[0], optimize='AUC')
final_model = finalize_model(tuned_best_model)

# Manual XGBoost Training
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Train XGBoost model
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train_processed, y_train)

# Make predictions: hard labels plus the probability in column 1 of
# predict_proba.
y_pred_xgb = xgb_model.predict(X_test_processed)
y_pred_prob_xgb = xgb_model.predict_proba(X_test_processed)[:, 1]

# Evaluate XGBoost
# Each entry is (label, metric function, predictions to score, extra kwargs);
# metrics are computed one at a time, in print order.
weighted_avg = {'average': 'weighted'}
xgb_report = (
    ('Accuracy', accuracy_score, y_pred_xgb, {}),
    ('AUC', roc_auc_score, y_pred_prob_xgb, {}),
    ('Precision', precision_score, y_pred_xgb, weighted_avg),
    ('Recall', recall_score, y_pred_xgb, weighted_avg),
    ('F1-Score', f1_score, y_pred_xgb, weighted_avg),
)
print("XGBoost Performance:")
for metric_label, metric_fn, metric_input, metric_kwargs in xgb_report:
    print(f"{metric_label}: {metric_fn(y_test, metric_input, **metric_kwargs):.4f}")

# Cross-validation for multiple models
# Candidate classifiers, each evaluated behind the same preprocessing step.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# 5-fold cross-validated ROC AUC on the full X / y; `cv_results` keeps the mean
# AUC per model and the printed spread is two standard deviations of the folds.
cv_results = {}
for model_name, estimator in models.items():
    cv_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', estimator),
    ])
    fold_aucs = cross_val_score(cv_pipeline, X, y, cv=5, scoring='roc_auc')
    mean_auc = fold_aucs.mean()
    cv_results[model_name] = mean_auc
    print(f"{model_name} AUC: {mean_auc:.4f} (+/- {fold_aucs.std() * 2:.4f})")

# Predictive Analysis and Risk Stratification
# Generate probability scores (column 1 of predict_proba) for the test split.
proba_scores = xgb_model.predict_proba(X_test_processed)[:, 1]

# Define risk thresholds
# Low: [0, 0.3], Medium: (0.3, 0.7], High: (0.7, 1].
# include_lowest=True closes the first bin on the left; without it a predicted
# probability of exactly 0 falls outside every bin and gets a NaN category.
df_test = X_test.copy()
df_test['true_label'] = y_test
df_test['predicted_prob'] = proba_scores
df_test['risk_category'] = pd.cut(proba_scores,
                                  bins=[0, 0.3, 0.7, 1],
                                  labels=['Low Risk', 'Medium Risk', 'High Risk'],
                                  include_lowest=True)

print("Risk Category Distribution:")
print(df_test['risk_category'].value_counts())

# Visualize risk distribution: density histogram of the predicted
# probabilities, one step outline per risk category.
plt.figure(figsize=(10, 6))
sns.histplot(data=df_test, x='predicted_prob', hue='risk_category',
             element='step', stat='density')
plt.title('Distribution of Predicted Risk Scores')
plt.xlabel('Predicted Probability')
plt.show()

# Show high-risk cases
high_risk_cases = df_test[df_test['risk_category'] == 'High Risk']
print(f"Number of high-risk cases: {len(high_risk_cases)}")
print("High-risk cases characteristics:")
print(high_risk_cases[['age', 'pack_years', 'smoking', 'family_history',
                       'predicted_prob']].head())

# Model Comparison Summary
# Compare PyCaret best model with manual XGBoost
# finalize_model may return a pipeline that wraps the estimator; report the
# class of the last step so the printed name is the actual model rather than
# the wrapper.
pycaret_estimator = final_model.steps[-1][1] if hasattr(final_model, 'steps') else final_model
print("Model Comparison Summary:")
print("PyCaret Best Model:", type(pycaret_estimator).__name__)
print("Manual XGBoost Model: XGBClassifier")

# Evaluate PyCaret model
# NOTE(review): these hold-out metrics are only meaningful if the rows of
# X_test were excluded from the data handed to PyCaret's setup().
pycaret_predictions = predict_model(final_model, data=X_test)
pycaret_labels = pycaret_predictions['prediction_label']

# 'prediction_score' is the probability of the *predicted* class, not of the
# positive class, so it cannot be passed to roc_auc_score directly. Convert it:
# keep the score when the positive class was predicted, otherwise take the
# complement (binary target assumed, as elsewhere in this file).
# roc_auc_score treats the greater label as the positive class.
positive_class = np.unique(y_test)[-1]
pycaret_pos_prob = np.where(pycaret_labels == positive_class,
                            pycaret_predictions['prediction_score'],
                            1 - pycaret_predictions['prediction_score'])

pycaret_accuracy = accuracy_score(y_test, pycaret_labels)
pycaret_auc = roc_auc_score(y_test, pycaret_pos_prob)

print("\nPerformance Metrics:")
# Row 0 is the PyCaret model, row 1 the manually trained XGBoost model.
weighted_metric_fns = {
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}
metrics_df = pd.DataFrame({
    'Model': ['PyCaret Best', 'Manual XGBoost'],
    'Accuracy': [pycaret_accuracy, accuracy_score(y_test, y_pred_xgb)],
    'AUC': [pycaret_auc, roc_auc_score(y_test, y_pred_prob_xgb)],
    **{
        metric_label: [metric_fn(y_test, pycaret_labels, average='weighted'),
                       metric_fn(y_test, y_pred_xgb, average='weighted')]
        for metric_label, metric_fn in weighted_metric_fns.items()
    },
})

display(metrics_df)

# Key Influencing Features
# Feature importance from XGBoost
# Rebuild the column names of the preprocessed matrix: the numeric columns
# first, then the one-hot encoded names of the categorical columns, matching
# the order in which the transformers are listed in the ColumnTransformer.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
encoded_cat_names = onehot_encoder.get_feature_names_out(categorical_features)
feature_names = numeric_features + list(encoded_cat_names)

# Pair each name with its importance and keep the ten largest.
importance_table = pd.DataFrame({
    'feature': feature_names,
    'importance': xgb_model.feature_importances_
})
feature_importances = importance_table.sort_values('importance', ascending=False).head(10)

imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importances, x='importance', y='feature', ax=imp_ax)
imp_ax.set_title('Top 10 Feature Importances from XGBoost Model')
plt.tight_layout()
plt.show()

# Summary and Conclusion
# finalize_model may return a pipeline that wraps the estimator; report the
# class of the last step so the summary names the actual model rather than
# the wrapper.
best_estimator = final_model.steps[-1][1] if hasattr(final_model, 'steps') else final_model

# NOTE(review): items 2 and 3 below are static text, not derived from the
# results computed in this notebook -- confirm they match the actual feature
# importances and dataset before relying on them.
print("SUMMARY AND KEY FINDINGS")
print("="*50)
print("1. Best Performing Model:", type(best_estimator).__name__)
for summary_line in (
    "2. Key Influencing Features:",
    "   - Age (positive correlation with cancer risk)",
    "   - Pack Years (smoking intensity)",
    "   - Family History of Lung Cancer",
    "   - Asbestos Exposure",
    "3. Dataset Limitations:",
    "   - Class imbalance (may need stratification or sampling)",
    "   - Potential missing data in key variables",
    "   - Self-reported data may have accuracy issues",
):
    print(summary_line)

print("\nNEXT STEPS")
print("="*50)
for next_step in (
    "1. External validation on new dataset",
    "2. Model explainability using SHAP values",
    "3. Deployment as web application or API",
    "4. Continuous monitoring and retraining protocol",
):
    print(next_step)