In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Machine learning models and metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import StandardScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility
# NOTE: this seeds NumPy's global (legacy) random generator only.
np.random.seed(42)

# Configure matplotlib for better plots
# The figure size set here is only a default; the plotting cells below pass
# their own explicit figsize.
plt.style.use('default')
plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
# Plain-text report for Logistic Regression, metrics shown to four decimals.
lr_report = classification_report(
    y_test,
    lr_predictions,
    target_names=data.target_names,
    digits=4,
)

# The same report as a nested dict, so single metrics can be read by key.
lr_report_dict = classification_report(
    y_test,
    lr_predictions,
    target_names=data.target_names,
    output_dict=True,
)

In [None]:
# Plain-text report for the Decision Tree, metrics shown to four decimals.
dt_report = classification_report(
    y_test,
    dt_predictions,
    target_names=data.target_names,
    digits=4,
)

# The same report as a nested dict, so single metrics can be read by key.
dt_report_dict = classification_report(
    y_test,
    dt_predictions,
    target_names=data.target_names,
    output_dict=True,
)

In [None]:
def plot_confusion_matrix(y_true, y_pred, model_name, class_names):
    """
    Draw an annotated heatmap of one model's confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    model_name : str
        Name shown in the figure title.
    class_names : sequence of str
        Tick labels used on both axes (presumably ordered like the sorted
        label values -- confirm against the caller).

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    counts = confusion_matrix(y_true, y_pred)

    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        counts,
        annot=True,
        fmt='d',  # cells hold integer counts
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    plt.show()

# Confusion matrix heatmap: Logistic Regression
plot_confusion_matrix(
    y_true=y_test,
    y_pred=lr_predictions,
    model_name="Logistic Regression",
    class_names=data.target_names,
)

# Confusion matrix heatmap: Decision Tree
plot_confusion_matrix(
    y_true=y_test,
    y_pred=dt_predictions,
    model_name="Decision Tree",
    class_names=data.target_names,
)

In [None]:
def plot_roc_curve(y_true, y_prob, model_name):
    """
    Plot a single model's ROC curve and return the area under it.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_prob : array-like
        Scores handed to ``roc_curve`` (presumably positive-class
        probabilities -- confirm against the caller).
    model_name : str
        Name used in the legend entry and the title.

    Returns
    -------
    float
        Area under the ROC curve, as computed by ``auc``.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    area = auc(false_pos_rate, true_pos_rate)

    _, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
            label=f'{model_name} (AUC = {area:.4f})')
    # Diagonal reference line: a classifier that guesses at random.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
            label='Random Classifier')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])  # small headroom so the curve's top edge stays visible
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve - {model_name}')
    ax.legend(loc="lower right")
    ax.grid(True, alpha=0.3)
    plt.show()

    return area

# One ROC figure per model; each call also hands back that model's AUC.
lr_auc = plot_roc_curve(y_true=y_test, y_prob=lr_probabilities,
                        model_name="Logistic Regression")
dt_auc = plot_roc_curve(y_true=y_test, y_prob=dt_probabilities,
                        model_name="Decision Tree")

# Curve coordinates for the combined figure (thresholds are not needed here).
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probabilities)
dt_fpr, dt_tpr, _ = roc_curve(y_test, dt_probabilities)

# Overlay both models on a single set of axes.
plt.figure(figsize=(10, 8))
plt.plot(lr_fpr, lr_tpr, color='blue', lw=2,
         label=f'Logistic Regression (AUC = {lr_auc:.4f})')
plt.plot(dt_fpr, dt_tpr, color='red', lw=2,
         label=f'Decision Tree (AUC = {dt_auc:.4f})')
# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--',
         label='Random Classifier (AUC = 0.5)')

plt.title('ROC Curves Comparison')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.grid(True, alpha=0.3)
plt.legend(loc="lower right")
plt.show()

Comprehensive Model Comparison

In [None]:
def create_model_comparison(models_data):
    """
    Build a comparison table from per-model metrics and print it.

    Parameters
    ----------
    models_data : dict or other DataFrame-compatible input
        Passed straight to ``pd.DataFrame``; each key becomes a column.

    Returns
    -------
    pandas.DataFrame
        The table that was printed.
    """
    table = pd.DataFrame(models_data)

    rule = "=" * 60
    banner = (rule, "COMPREHENSIVE MODEL COMPARISON", rule)
    for line in banner:
        print(line)
    # Plain-text rendering without the positional index column.
    print(table.to_string(index=False))

    return table

# Collect the headline metrics for both models, one column per metric.
# Every list is ordered [Logistic Regression, Decision Tree] to match 'Model'.
models_comparison = {
    'Model': ['Logistic Regression', 'Decision Tree'],
    'Precision (Malignant)': [
        report['malignant']['precision'] for report in (lr_report_dict, dt_report_dict)
    ],
    'Recall (Malignant)': [
        report['malignant']['recall'] for report in (lr_report_dict, dt_report_dict)
    ],
    'F1-Score (Malignant)': [
        report['malignant']['f1-score'] for report in (lr_report_dict, dt_report_dict)
    ],
    'Overall Accuracy': [
        report['accuracy'] for report in (lr_report_dict, dt_report_dict)
    ],
    'AUC Score': [lr_auc, dt_auc],
}

comparison_df = create_model_comparison(models_comparison)

In [None]:
# Grouped bar chart: one pair of bars per metric, one bar per model
metrics = ['Precision (Malignant)', 'Recall (Malignant)', 'F1-Score (Malignant)', 'Overall Accuracy', 'AUC Score']
# Row 0 of the comparison table is Logistic Regression, row 1 is Decision Tree.
lr_scores, dt_scores = (
    [comparison_df.iloc[row_idx][metric] for metric in metrics]
    for row_idx in (0, 1)
)

width = 0.35
x = np.arange(len(metrics))

fig, ax = plt.subplots(figsize=(12, 8))
# Shift each model's bars half a bar-width either side of the tick position.
bars1 = ax.bar(x - width/2, lr_scores, width, label='Logistic Regression', color='skyblue')
bars2 = ax.bar(x + width/2, dt_scores, width, label='Decision Tree', color='lightcoral')

ax.set(
    xlabel='Metrics',
    ylabel='Scores',
    title='Model Performance Comparison',
    ylim=(0, 1.1),  # headroom above 1.0 for the value labels
)
ax.set_xticks(x)
ax.set_xticklabels(metrics, rotation=45, ha='right')
ax.legend()

def add_value_labels(bars):
    """Write each bar's height (three decimals) just above it on the cell-level ``ax``."""
    for rect in bars:
        bar_height = rect.get_height()
        x_center = rect.get_x() + rect.get_width()/2.
        ax.text(x_center, bar_height + 0.01,
                f'{bar_height:.3f}', ha='center', va='bottom')

add_value_labels(bars1)
add_value_labels(bars2)

plt.tight_layout()
plt.show()

In [None]:
def recommend_best_model(comparison_df):
    """
    Print a per-metric and overall comparison of the two models.

    Row 0 of ``comparison_df`` is reported as Logistic Regression and row 1
    as Decision Tree. A model "wins" a metric only when its value is strictly
    greater; equal values are reported as a tie and credited to neither model.
    The overall winner is the model with more metric wins, or "Tie" when both
    have the same number.

    Parameters
    ----------
    comparison_df : pandas.DataFrame
        Must contain the columns 'Precision (Malignant)',
        'Recall (Malignant)', 'F1-Score (Malignant)', 'Overall Accuracy'
        and 'AUC Score', with at least two rows.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    metric_names = [
        'Precision (Malignant)',
        'Recall (Malignant)',
        'F1-Score (Malignant)',
        'Overall Accuracy',
        'AUC Score',
    ]
    total = len(metric_names)

    print("\n" + "=" * 50)
    print("MODEL SELECTION RECOMMENDATION")
    print("=" * 50)

    # Analyze each metric
    lr_row = comparison_df.iloc[0]
    dt_row = comparison_df.iloc[1]

    print("Metric Analysis:")
    lr_wins = 0
    dt_wins = 0
    for metric in metric_names:
        lr_value = lr_row[metric]
        dt_value = dt_row[metric]
        if lr_value > dt_value:
            verdict = 'Logistic Regression wins'
            lr_wins += 1
        elif dt_value > lr_value:
            verdict = 'Decision Tree wins'
            dt_wins += 1
        else:
            # Equal (or non-comparable, e.g. NaN) values favour neither model.
            verdict = 'Tie'
        print(f"• {metric}: {verdict}")

    # Overall recommendation: compare win counts directly so that ties do not
    # count in Decision Tree's favour.
    if lr_wins > dt_wins:
        overall = 'Logistic Regression'
    elif dt_wins > lr_wins:
        overall = 'Decision Tree'
    else:
        overall = 'Tie'

    print(f"\nOverall Winner: {overall}")
    print(f"Logistic Regression wins in {lr_wins}/{total} metrics")
    print(f"Decision Tree wins in {dt_wins}/{total} metrics")
    ties = total - lr_wins - dt_wins
    if ties:
        print(f"Tied in {ties}/{total} metrics")

    # Context-specific recommendations
    print("\nContext-Specific Recommendations:")
    print("• For medical diagnosis (high recall important): Choose the model with higher recall")
    print("• For balanced performance: Choose the model with higher F1-score")
    print("• For probability estimates: Choose the model with higher AUC score")

# Print the per-metric winners and the overall recommendation for the
# comparison table built earlier in the notebook.
recommend_best_model(comparison_df)

Advanced Analysis and Insights

In [None]:
# Pair every feature name with the fitted Decision Tree's importance score,
# then rank from most to least important.
feature_importance = pd.DataFrame(
    {'feature': feature_names, 'importance': dt_model.feature_importances_}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("Top 10 Most Important Features (Decision Tree):")
print(feature_importance.head(10))

# Horizontal bar chart of the leading features
top_features = feature_importance.head(15)
plt.figure(figsize=(10, 8))
plt.barh(range(len(top_features)), top_features['importance'])
plt.yticks(range(len(top_features)), top_features['feature'])
plt.title('Top 15 Feature Importances - Decision Tree')
plt.xlabel('Feature Importance')
# Flip the y-axis so the highest-ranked feature sits at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# Analyze misclassified samples
def analyze_errors(y_true, y_pred, model_name):
    """
    Print a summary of a model's misclassifications and return the error mask.

    Malignant is treated as the positive class, so a false positive is a
    benign sample predicted as malignant and a false negative is a malignant
    sample predicted as benign.

    Parameters
    ----------
    y_true : numpy.ndarray or pandas.Series
        Ground-truth labels.
    y_pred : numpy.ndarray or pandas.Series
        Predicted labels, aligned element-wise with ``y_true``.
    model_name : str
        Name used in the printed heading.

    Returns
    -------
    Boolean array-like (same kind as ``y_true != y_pred``)
        True where the prediction differs from the ground truth.
    """
    # NOTE(review): assumes the scikit-learn breast-cancer encoding
    # (0 = malignant, 1 = benign) -- confirm against data.target_names.
    malignant_label = 0

    errors = y_true != y_pred
    error_count = np.sum(errors)
    sample_count = len(y_true)
    # Avoid a 0/0 division when there are no samples to evaluate.
    error_rate = error_count / sample_count if sample_count else 0.0

    print(f"\n{model_name} Error Analysis:")
    print(f"Total misclassified samples: {error_count}")
    print(f"Error rate: {error_rate:.4f}")

    # False positives and false negatives, with malignant as the positive class
    truly_malignant = y_true == malignant_label
    predicted_malignant = y_pred == malignant_label
    false_positives = np.sum(~truly_malignant & predicted_malignant)
    false_negatives = np.sum(truly_malignant & ~predicted_malignant)

    print(f"False Positives (Benign predicted as Malignant): {false_positives}")
    print(f"False Negatives (Malignant predicted as Benign): {false_negatives}")

    return errors

# Print the error summary for each model on the test set; each returned value
# is the boolean mask of misclassified test samples.
lr_errors = analyze_errors(y_test, lr_predictions, "Logistic Regression")
dt_errors = analyze_errors(y_test, dt_predictions, "Decision Tree")

Troubleshooting Common Issues

Issue 1: Import Errors
If you encounter import errors, ensure all required packages are installed:
# Run this if you get import errors
# Invoking pip through sys.executable runs it with the same Python interpreter
# as this notebook's kernel.
import sys
!{sys.executable} -m pip install scikit-learn pandas numpy matplotlib seaborn

Issue 2: Memory Issues with Large Datasets
For larger datasets, consider working with a random subsample of the data:
# If working with large datasets, sample the data
if X.shape[0] > 10000:
    from sklearn.utils import resample
    # replace=False draws 5000 distinct rows. resample() defaults to sampling
    # WITH replacement (a bootstrap), which would duplicate rows and let the
    # same sample land in both the train and test splits.
    X_sample, y_sample = resample(X, y, replace=False, n_samples=5000, random_state=42)
    X, y = X_sample, y_sample

Issue 3: Convergence Warnings
If you get convergence warnings with Logistic Regression:
# Increase max_iter or change solver
# NOTE: this line only constructs a new, unfitted estimator; presumably it has
# to be fit again (as in the training cell, not shown here) before predicting.
lr_model = LogisticRegression(random_state=42, max_iter=2000, solver='liblinear')