In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve, auc
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import xgboost as xgb
import lightgbm as lgb

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import shap
from scipy import stats
from datetime import datetime
import joblib
import os

# Notebook-wide plotting and display defaults.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
pd.set_option('display.max_columns', None)

# Seed the global NumPy and TensorFlow random generators.
tf.random.set_seed(42)
np.random.seed(42)

# Confirm the environment: one line per message, same text as before.
print(
    "Libraries imported",
    f"TensorFlow: {tf.__version__}",
    f"Pandas: {pd.__version__}",
    sep="\n",
)

In [None]:
# Create every output folder the later cells write into (no error if present).
for directory in ['data', 'models'] + [f'results/{sub}' for sub in ('figures', 'metrics', 'reports')]:
    os.makedirs(directory, exist_ok=True)
print("Directories created")

In [None]:
# Generic positional labels; the load below uses the descriptive
# `feature_names` instead, so this list is not referenced in this cell.
column_names = ['id', 'diagnosis'] + [f'feature_{i}' for i in range(1, 31)]

# Descriptive names for the 30 numeric columns, in file order:
# ten "_mean", ten "_se" and ten "_worst" measurements.
feature_names = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
    'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
    'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
    'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
]

# Location of the headerless data file. Set the WDBC_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original absolute path is used, so existing setups keep working.
data_path = os.environ.get(
    'WDBC_DATA_PATH',
    '/home/abdur/By Ashir/gitgub/breast cancer/breast+cancer+wisconsin+diagnostic/wdbc.data',
)
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Data file not found: {data_path!r}. "
        "Set the WDBC_DATA_PATH environment variable to the location of wdbc.data."
    )

df = pd.read_csv(data_path, header=None, names=['id', 'diagnosis'] + feature_names)

# Quick sanity overview of what was loaded.
print(f"Shape: {df.shape}")
print(f"\nFirst 3 rows:")
display(df.head(3))
print(f"\nData types:")
print(df.dtypes)
print(f"\nMissing values: {df.isnull().sum().sum()}")

In [None]:
# Report how many samples carry each diagnosis label ('M' / 'B').
print("Target Distribution:")
print(df['diagnosis'].value_counts())

malignant_mask = df['diagnosis'].eq('M')
benign_mask = df['diagnosis'].eq('B')
print(f"\nMalignant: {malignant_mask.sum()} ({malignant_mask.mean()*100:.1f}%)")
print(f"Benign: {benign_mask.sum()} ({benign_mask.mean()*100:.1f}%)")

# Shared inputs for the two panels, always ordered benign -> malignant.
counts = df['diagnosis'].value_counts()
class_labels = ['Benign', 'Malignant']
class_sizes = [counts['B'], counts['M']]
class_colors = ['lightgreen', 'lightcoral']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

# Left panel: absolute counts.
bar_ax.bar(class_labels, class_sizes, color=class_colors, edgecolor='black', alpha=0.7)
bar_ax.set_title('Diagnosis Distribution', fontweight='bold')
bar_ax.set_ylabel('Count')
bar_ax.grid(axis='y', alpha=0.3)

# Right panel: relative share of each class.
pie_ax.pie(class_sizes, labels=class_labels, colors=class_colors,
           autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Diagnosis Proportion', fontweight='bold')

plt.tight_layout()
plt.savefig('results/figures/01_diagnosis_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Numeric target column: 1 where the diagnosis is 'M', 0 otherwise.
df['diagnosis_binary'] = df['diagnosis'].eq('M').astype(int)

# Column groups selected by a substring of the column name.
mean_features = df.columns[df.columns.str.contains('_mean', regex=False)].tolist()
worst_features = df.columns[df.columns.str.contains('_worst', regex=False)].tolist()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (diagnosis code, legend label, histogram colour), drawn in this order.
class_styles = (('B', 'Benign', 'green'), ('M', 'Malignant', 'red'))
panel_features = ['radius_mean', 'texture_mean', 'area_mean', 'concavity_mean']

# One panel per feature, with the two classes overlaid.
for ax, feature in zip(axes.flat, panel_features):
    for code, label, color in class_styles:
        df.loc[df['diagnosis'] == code, feature].hist(
            ax=ax, bins=30, alpha=0.6, label=label, color=color
        )
    ax.set_title(f'{feature} Distribution', fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('results/figures/02_feature_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Heatmap over the first ten "_mean" columns plus the binary target.
corr_features = mean_features[:10]
corr_matrix = df[corr_features + ['diagnosis_binary']].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1)
plt.title('Feature Correlation Matrix (Mean Features)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('results/figures/03_correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTop correlations with diagnosis:")
# Drop the target's own entry before ranking: its self-correlation is always
# 1.0 and would otherwise take the first slot of the top-10 listing.
diagnosis_corr = (
    df[feature_names + ['diagnosis_binary']]
    .corr()['diagnosis_binary']
    .drop('diagnosis_binary')
    .sort_values(ascending=False)
)
print(diagnosis_corr.head(10))

In [None]:
# Feature matrix (the 30 named measurements) and the binary target column.
X, y = df[feature_names], df['diagnosis_binary']

print(
    f"Features: {X.shape}",
    f"Target: {y.shape}",
    f"\nClass distribution:\n{y.value_counts()}",
    sep="\n",
)

In [None]:
# Two stratified 80/20 splits: first carve off the test set, then split the
# remainder again into train and validation.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, **split_options)

# Size of each split as a row count and as a share of the full data set.
n_total = len(X)
for split_label, split_frame in (('Train', X_train), ('Validation', X_val), ('Test', X_test)):
    n_rows = split_frame.shape[0]
    print(f"{split_label}: {n_rows} ({n_rows/n_total*100:.1f}%)")

In [None]:
# Fit the scaler on the training split only, then apply the same
# transformation to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Re-wrap the arrays as DataFrames. Each split's original row index is carried
# over so the scaled features stay label-aligned with y_train / y_val / y_test;
# a fresh 0..n-1 index would break any index-based pandas operation that
# combines the two (boolean masks, concat, joins).
X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_names, index=X_train.index)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=feature_names, index=X_val.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_names, index=X_test.index)

# Persist the fitted scaler alongside the models.
joblib.dump(scaler, 'models/scaler.pkl')
print("Features scaled")

In [None]:
def evaluate_model(model, name, X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit ``model`` on the training split and score it on all three splits.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; ``predict_proba`` or
        ``decision_function`` is used for the AUC score when available.
    name : str
        Label printed in the log and stored under the ``'Model'`` key.
    X_train, y_train : training features and labels passed to ``fit``.
    X_val, y_val : validation features and labels (accuracy only).
    X_test, y_test : test features and labels (all remaining metrics).

    Returns
    -------
    tuple
        ``(model, results, y_test_pred, y_test_proba)`` where ``results`` is a
        dict with keys ``Model``, ``Train_Acc``, ``Val_Acc``, ``Test_Acc``,
        ``Precision``, ``Recall``, ``F1``, ``AUC`` and ``Time(s)``.
        ``y_test_proba`` holds the positive-class probability when the model
        has ``predict_proba``; otherwise it holds ``decision_function`` scores,
        or the hard predictions as a last resort.
    """
    print(f"\nTraining: {name}")

    # Wall-clock duration of the fit call only.
    start = datetime.now()
    model.fit(X_train, y_train)
    train_time = (datetime.now() - start).total_seconds()

    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)

    # AUC needs a ranking score. Prefer probabilities, then continuous
    # decision scores; hard 0/1 labels give only a single-threshold ROC and
    # are kept purely as the final fallback.
    if hasattr(model, 'predict_proba'):
        y_test_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_test_proba = model.decision_function(X_test)
    else:
        y_test_proba = y_test_pred

    results = {
        'Model': name,
        'Train_Acc': accuracy_score(y_train, y_train_pred),
        'Val_Acc': accuracy_score(y_val, y_val_pred),
        'Test_Acc': accuracy_score(y_test, y_test_pred),
        'Precision': precision_score(y_test, y_test_pred),
        'Recall': recall_score(y_test, y_test_pred),
        'F1': f1_score(y_test, y_test_pred),
        'AUC': roc_auc_score(y_test, y_test_proba),
        'Time(s)': train_time
    }

    print(f"Results:")
    for k, v in results.items():
        if k != 'Model':
            print(f"  {k:12s}: {v:.4f}")

    return model, results, y_test_pred, y_test_proba

In [None]:
# Candidate classifiers; every one is trained on the same scaled splits.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss', n_jobs=-1),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1, n_jobs=-1)
}

all_results = []      # one metrics dict per model
trained_models = {}   # model name -> fitted estimator
predictions = {}      # model name -> test-set labels and scores

for name, model in models.items():
    trained, results, y_pred, y_proba = evaluate_model(
        model, name, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test
    )

    all_results.append(results)
    trained_models[name] = trained
    predictions[name] = {'pred': y_pred, 'proba': y_proba}
    # e.g. 'Random Forest' -> models/random_forest.pkl
    joblib.dump(trained, f"models/{name.replace(' ', '_').lower()}.pkl")

# Rank by validation accuracy. The first row of results_df is what the rest of
# the notebook treats as the "best" model, so ranking on Test_Acc would choose
# the winner with the hold-out set and bias the test metrics reported for it.
# The stable sort keeps the insertion order above when scores tie.
results_df = pd.DataFrame(all_results).sort_values('Val_Acc', ascending=False, kind='stable')
print("\nModel Comparison:")
display(results_df)

results_df.to_csv('results/metrics/ml_models_comparison.csv', index=False)

In [None]:
# The first row of the comparison table is taken as the best model.
best_row = results_df.iloc[0]
best_model_name = best_row['Model']
best_outputs = predictions[best_model_name]
best_pred = best_outputs['pred']
best_proba = best_outputs['proba']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
cm_ax, roc_ax = axes
class_names = ['Benign', 'Malignant']

# Left panel: confusion matrix on the test set.
cm = confusion_matrix(y_test, best_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax,
            xticklabels=class_names, yticklabels=class_names)
cm_ax.set_title(f'{best_model_name}\nConfusion Matrix', fontweight='bold')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')

# Right panel: ROC curve with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_test, best_proba)
roc_auc = auc(fpr, tpr)
roc_ax.plot(fpr, tpr, linewidth=2, label=f'AUC = {roc_auc:.3f}')
roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.3)
roc_ax.set_title(f'{best_model_name}\nROC Curve', fontweight='bold')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
roc_ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('results/figures/04_best_model_performance.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nBest Model: {best_model_name}")
print(f"Test Accuracy: {best_row['Test_Acc']:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, best_pred, target_names=class_names))

In [None]:
def build_nn_model(input_dim):
    """Return a compiled dense binary classifier for ``input_dim`` features.

    The network stacks three ReLU blocks (64, 32 and 16 units), each followed
    by dropout, with batch normalisation after the first two, and ends in a
    single sigmoid unit. It is compiled with Adam (learning rate 0.001),
    binary cross-entropy loss, and accuracy plus AUC as metrics.
    """
    # (units, dropout rate, add batch normalisation?) for each hidden block.
    hidden_blocks = ((64, 0.3, True), (32, 0.3, True), (16, 0.2, False))

    stack = [layers.Input(shape=(input_dim,))]
    for units, dropout_rate, use_batchnorm in hidden_blocks:
        stack.append(layers.Dense(units, activation='relu'))
        if use_batchnorm:
            stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(dropout_rate))
    stack.append(layers.Dense(1, activation='sigmoid'))

    model = keras.Sequential(stack)

    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    auc_metric = keras.metrics.AUC(name='auc')
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy', auc_metric]
    )

    return model

# Size the input layer from the number of scaled feature columns.
nn_model = build_nn_model(input_dim=X_train_scaled.shape[1])
print("Neural Network Architecture:")
nn_model.summary()

In [None]:
print("Training Neural Network...")

# Stop once the monitored metric has not improved for 10 epochs and roll back
# to the best weights; halve the learning rate after 5 epochs without progress.
early_stopping = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
lr_reducer = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5, verbose=1)

history = nn_model.fit(
    X_train_scaled,
    y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=50,
    batch_size=32,
    callbacks=[early_stopping, lr_reducer],
    verbose=1,
)

nn_model.save('models/neural_network.keras')
print("Neural network trained")

In [None]:
# Test-set predictions: sigmoid outputs, labelled positive above 0.5.
y_test_nn_proba = nn_model.predict(X_test_scaled).flatten()
y_test_nn_pred = (y_test_nn_proba > 0.5).astype(int)

# Train / validation labels, so this row carries the same accuracy columns as
# the rows produced for the other models (otherwise they end up as NaN in the
# combined comparison table).
nn_train_pred = (nn_model.predict(X_train_scaled, verbose=0).flatten() > 0.5).astype(int)
nn_val_pred = (nn_model.predict(X_val_scaled, verbose=0).flatten() > 0.5).astype(int)

nn_results = {
    'Model': 'Neural Network',
    'Train_Acc': accuracy_score(y_train, nn_train_pred),
    'Val_Acc': accuracy_score(y_val, nn_val_pred),
    'Test_Acc': accuracy_score(y_test, y_test_nn_pred),
    'Precision': precision_score(y_test, y_test_nn_pred),
    'Recall': recall_score(y_test, y_test_nn_pred),
    'F1': f1_score(y_test, y_test_nn_pred),
    'AUC': roc_auc_score(y_test, y_test_nn_proba)
}

print("\nNeural Network Results:")
for k, v in nn_results.items():
    if k != 'Model':
        print(f"  {k:12s}: {v:.4f}")

# Replace any row left by an earlier run of this cell instead of appending a
# duplicate; the list is updated in place so existing references stay valid.
all_results[:] = [row for row in all_results if row.get('Model') != 'Neural Network']
all_results.append(nn_results)
predictions['Neural Network'] = {'pred': y_test_nn_pred, 'proba': y_test_nn_proba}

In [None]:
# Training vs. validation curves, one panel per tracked quantity.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, metric_key, metric_label in zip(axes, ('accuracy', 'loss'), ('Accuracy', 'Loss')):
    ax.plot(history.history[metric_key], label='Train')
    ax.plot(history.history[f'val_{metric_key}'], label='Validation')
    ax.set_title(f'Model {metric_label}', fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_label)
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('results/figures/05_nn_training_history.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Risk score: the best model's positive-class probability on a 0-100 scale.
best_model = trained_models[best_model_name]
positive_class_proba = best_model.predict_proba(X_test_scaled)[:, 1]
risk_scores = positive_class_proba * 100

def categorize_risk(score):
    """Map a risk score to 'Low', 'Medium', 'High' or 'Very High'.

    A score below 25 is 'Low', below 50 'Medium', below 75 'High'; anything
    that is not below 75 is 'Very High'.
    """
    # Exclusive upper bounds, checked from the lowest band upwards.
    for upper_bound, label in ((25, 'Low'), (50, 'Medium'), (75, 'High')):
        if score < upper_bound:
            return label
    return 'Very High'

# Severity order of the categories; the bar colours below follow this order.
risk_order = ['Low', 'Medium', 'High', 'Very High']

risk_categories = [categorize_risk(s) for s in risk_scores]
# value_counts() sorts by frequency and omits empty categories, which would
# pair the colours with the wrong bars. Reindexing fixes the order and keeps
# every category present (count 0 when no patient falls in it).
risk_dist = pd.Series(risk_categories).value_counts().reindex(risk_order, fill_value=0)

plt.figure(figsize=(10, 6))
risk_dist.plot(kind='bar', color=['green', 'yellow', 'orange', 'red'], edgecolor='black', alpha=0.7)
plt.xlabel('Risk Category')
plt.ylabel('Count')
plt.title('Malignancy Risk Distribution', fontsize=14, fontweight='bold')
plt.xticks(rotation=0)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('results/figures/06_risk_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("Risk Distribution:")
print(risk_dist)

In [None]:
# Two-component PCA projection of the scaled test features (used for plotting).
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_test_scaled)

# Three-cluster KMeans on the same scaled test features (full feature space).
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_test_scaled)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]

# Left: points coloured by cluster label. Right: coloured by risk score.
scatter1 = axes[0].scatter(pc1, pc2, c=clusters, cmap='viridis', alpha=0.6, s=30)
axes[0].set_title('Patient Clusters', fontweight='bold')
fig.colorbar(scatter1, ax=axes[0], label='Cluster')

scatter2 = axes[1].scatter(pc1, pc2, c=risk_scores, cmap='RdYlGn_r', alpha=0.6, s=30)
axes[1].set_title('Risk Scores', fontweight='bold')
fig.colorbar(scatter2, ax=axes[1], label='Risk Score')

# Both panels share the same axis labels.
for panel in axes:
    panel.set_xlabel('PC1')
    panel.set_ylabel('PC2')

plt.tight_layout()
plt.savefig('results/figures/07_clustering.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"PCA explained variance: {pca.explained_variance_ratio_.sum()*100:.1f}%")

In [None]:
def generate_recommendation(risk_score, tumor_features):
    """Build a follow-up plan dictionary for a malignancy risk score.

    Parameters
    ----------
    risk_score : number
        Compared against the thresholds 75, 50 and 25 to pick a tier.
    tumor_features : any
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    dict
        Keys 'risk_level', 'recommendations', 'screening', 'followup' and
        'lifestyle'. The lifestyle list is the same for every tier.
    """
    risk_level = categorize_risk(risk_score)

    # (minimum score, screening, follow-up, recommendations), highest tier first.
    tiers = (
        (75, 'Immediate biopsy recommended', 'Weekly monitoring',
         ('Urgent oncology consultation', 'Complete diagnostic workup')),
        (50, 'Biopsy within 2 weeks', 'Bi-weekly monitoring',
         ('Specialist evaluation required', 'Additional imaging recommended')),
        (25, 'Follow-up in 1 month', 'Monthly monitoring',
         ('Continue regular screenings',)),
    )
    # Used when the score reaches none of the thresholds above.
    fallback = ('Routine annual screening', 'Annual checkup', ('Maintain healthy lifestyle',))

    screening, followup, actions = next(
        (tier[1:] for tier in tiers if risk_score >= tier[0]),
        fallback,
    )

    return {
        'risk_level': risk_level,
        'recommendations': list(actions),
        'screening': screening,
        'followup': followup,
        'lifestyle': [
            'Maintain healthy BMI (18.5-25)',
            'Regular physical activity (150 min/week)',
            'Limit alcohol consumption',
            'Balanced diet rich in fruits/vegetables'
        ],
    }

print("Recommendation engine created")

In [None]:
# Positions (within the test split) of the patients to show.
sample_indices = [0, 10, 30, 50, 80]

print("\nSample Patient Recommendations:")
print("=" * 70)

for idx in sample_indices:
    # Skip positions that fall beyond the end of the test split.
    if idx >= len(X_test):
        continue

    patient = X_test.iloc[idx]
    risk = risk_scores[idx]
    plan = generate_recommendation(risk, patient)

    # Assemble the whole patient card, then emit it with a single print.
    card_lines = [
        f"\nPATIENT #{idx+1}",
        f"Risk Score: {risk:.1f}/100 ({plan['risk_level']} Risk)",
        f"Screening: {plan['screening']}",
        f"Follow-up: {plan['followup']}",
        f"\nRecommendations:",
    ]
    card_lines.extend(
        f"  {i}. {rec}" for i, rec in enumerate(plan['recommendations'], 1)
    )
    card_lines.append("-" * 70)
    print("\n".join(card_lines))

In [None]:
print("Calculating SHAP values...")

# Explain the first available tree ensemble, in this order of preference.
if 'Random Forest' in trained_models:
    explainer = shap.TreeExplainer(trained_models['Random Forest'])
elif 'XGBoost' in trained_models:
    explainer = shap.TreeExplainer(trained_models['XGBoost'])
else:
    explainer = shap.TreeExplainer(trained_models['LightGBM'])

shap_values = explainer.shap_values(X_test_scaled)

# Depending on the model and the shap version, a binary classifier can yield
# per-class output: a list [class 0, class 1] or an array shaped
# (samples, features, 2). summary_plot expects one (samples, features) matrix
# for a beeswarm plot, so keep only the positive (malignant) class.
if isinstance(shap_values, list):
    if len(shap_values) == 2:
        shap_values = shap_values[1]
elif np.ndim(shap_values) == 3 and np.shape(shap_values)[-1] == 2:
    shap_values = shap_values[:, :, 1]

plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_test_scaled, feature_names=feature_names, show=False)
plt.title('Feature Importance (SHAP)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('results/figures/08_shap_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print("SHAP analysis complete")

In [None]:
# Comparison of every evaluated model, highest test accuracy first.
final_results_df = pd.DataFrame(all_results).sort_values(by='Test_Acc', ascending=False)
final_results_df.to_csv('results/metrics/final_model_comparison.csv', index=False)

# One row per test sample; Patient_ID is the position within the test split.
report_columns = dict(
    Patient_ID=range(len(y_test)),
    True_Diagnosis=y_test.values,
    Predicted_Diagnosis=best_pred,
    Risk_Score=risk_scores,
    Risk_Category=risk_categories,
    Cluster=clusters,
)
patient_report = pd.DataFrame(report_columns)

patient_report.to_csv('results/reports/patient_risk_assessment.csv', index=False)

# Plain-text report. The status markers were stored as mojibake (UTF-8 bytes
# decoded with a single-byte code page) and are restored to the intended emoji.
summary = f"""
BREAST CANCER PREDICTION - FINAL REPORT
{'='*70}

DATASET:
  Total samples: {len(df)}
  Malignant: {(df['diagnosis']=='M').sum()} ({(df['diagnosis']=='M').mean()*100:.1f}%)
  Test set: {len(y_test)}

BEST MODEL: {best_model_name}
  Test Accuracy: {results_df.iloc[0]['Test_Acc']*100:.2f}%
  Precision: {results_df.iloc[0]['Precision']:.3f}
  Recall: {results_df.iloc[0]['Recall']:.3f}
  F1-Score: {results_df.iloc[0]['F1']:.3f}
  AUC-ROC: {results_df.iloc[0]['AUC']:.3f}

RISK STRATIFICATION:
  Low: {risk_dist.get('Low', 0)}
  Medium: {risk_dist.get('Medium', 0)}
  High: {risk_dist.get('High', 0)}
  Very High: {risk_dist.get('Very High', 0)}

FILES GENERATED:
  ✅ 8 visualizations
  ✅ Model comparison
  ✅ Patient assessments
  ✅ SHAP analysis

{'='*70}
PROJECT COMPLETE
"""

print(summary)

# Write as UTF-8 explicitly: the report contains non-ASCII characters, and the
# platform default encoding (e.g. cp1252 on Windows) cannot encode them.
with open('results/reports/final_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary)

print("\n✅ All results exported")
print("✅ Models saved")
print("🎉 PROJECT COMPLETE")