# In [None]:
# Problem 1: Logistic Regression Model
# Classification of Cancer Dataset (Malignant vs. Benign)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, f1_score

# Pin NumPy's global random state so repeated runs behave the same
np.random.seed(42)

# Pull the breast cancer dataset bundled with scikit-learn and separate the
# feature matrix from the class labels
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Quick summary of what was loaded
n_samples, n_features = X.shape
print(f"Dataset shape: {X.shape}")
print(f"Number of features: {n_features}")
print(f"Number of samples: {n_samples}")
print(f"Number of classes: {np.unique(y).size}")
print(f"Class distribution: {np.bincount(y)}")

# Hold out 20% of the samples for testing; the remaining 80% are used to train
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler learns its statistics from the training
# split only and is then applied unchanged to both splits, so nothing about
# the test split influences the transformation.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Parts (i) and (ii): fit logistic regression on all 30 features, first with
# no weight penalty and then with an L2 penalty, and evaluate both the same way.

# Display names for the two classes, ordered by label value (0, 1)
_CLASS_LABELS = ['Malignant', 'Benign']


def _evaluate_classifier(model, X_eval, y_true, label_suffix=""):
    """Predict with a fitted classifier, print its test metrics and return them.

    Precision, recall and F1 are computed with scikit-learn's default positive
    class (label 1, displayed here as 'Benign').

    Args:
        model: Fitted estimator exposing ``predict``.
        X_eval: Feature matrix to predict on.
        y_true: Ground-truth labels matching ``X_eval``.
        label_suffix: Text appended to every printed metric name,
            e.g. ``" (L2)"``.

    Returns:
        Tuple ``(predictions, accuracy, precision, recall, f1)``.
    """
    predictions = model.predict(X_eval)
    scores = (
        accuracy_score(y_true, predictions),
        precision_score(y_true, predictions),
        recall_score(y_true, predictions),
        f1_score(y_true, predictions),
    )

    metric_names = ("Accuracy", "Precision", "Recall", "F1 Score")
    for metric_name, value in zip(metric_names, scores):
        print(f"{metric_name}{label_suffix}: {value:.4f}")
    print(f"\nClassification Report{label_suffix}:")
    print(classification_report(y_true, predictions, target_names=_CLASS_LABELS))

    return (predictions, *scores)


def _plot_confusion_matrix(matrix, title, filename):
    """Draw a confusion matrix as an annotated heatmap, save it and show it.

    Args:
        matrix: Confusion matrix with actual classes on rows and predicted
            classes on columns.
        title: Figure title.
        filename: Path the figure is written to before it is displayed.
    """
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=_CLASS_LABELS,
        yticklabels=_CLASS_LABELS,
    )
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    plt.savefig(filename)
    plt.show()


# Part (i): Build a logistic regression model with all 30 features
print("\n--- Part (i): Logistic Regression with all features ---")

# LogisticRegression applies an L2 penalty with C=1.0 unless told otherwise, so
# the baseline has to switch it off explicitly to really be "no weight penalty".
# C is the inverse regularization strength: an infinite C removes the penalty.
# NOTE(review): without a penalty the solver may need many iterations on
# near-separable data; watch for a ConvergenceWarning.
logreg = LogisticRegression(random_state=42, max_iter=1000, C=np.inf)
logreg.fit(X_train_scaled, y_train)

# Score on the held-out test set
y_pred, accuracy, precision, recall, f1 = _evaluate_classifier(
    logreg, X_test_scaled, y_test
)

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
_plot_confusion_matrix(
    cm,
    'Confusion Matrix - Logistic Regression (No Weight Penalty)',
    'confusion_matrix_logreg.png',
)

# Part (ii): Add weight penalty (L2 regularization) and repeat training
print("\n--- Part (ii): Logistic Regression with weight penalty (L2 regularization) ---")

# L2 ('ridge') is the default penalty of LogisticRegression; C controls its
# strength (lower C = stronger regularization).
logreg_l2 = LogisticRegression(random_state=42, max_iter=1000, C=0.1)
logreg_l2.fit(X_train_scaled, y_train)

# Score on the held-out test set
y_pred_l2, accuracy_l2, precision_l2, recall_l2, f1_l2 = _evaluate_classifier(
    logreg_l2, X_test_scaled, y_test, label_suffix=" (L2)"
)

# Plot confusion matrix
cm_l2 = confusion_matrix(y_test, y_pred_l2)
_plot_confusion_matrix(
    cm_l2,
    'Confusion Matrix - Logistic Regression (With L2 Regularization)',
    'confusion_matrix_logreg_l2.png',
)

# Side-by-side summary of the test metrics: the first model's score is printed
# on the left, the L2-penalised model's score on the right
print("\n--- Comparison: No Regularization vs. L2 Regularization ---")
for metric_label, first_score, l2_score in (
    ("Accuracy", accuracy, accuracy_l2),
    ("Precision", precision, precision_l2),
    ("Recall", recall, recall_l2),
    ("F1 Score", f1, f1_l2),
):
    print(f"{metric_label}: {first_score:.4f} vs {l2_score:.4f}")

# Compare the learned coefficients of both models in one figure with two
# panels. Within each panel the features are ordered by absolute coefficient
# size, largest first.
feature_names = cancer.feature_names

# Coefficient vector of each model (row 0 of coef_) and its ordering
coef = logreg.coef_[0]
coef_l2 = logreg_l2.coef_[0]
indices = np.argsort(np.abs(coef))[::-1]
indices_l2 = np.argsort(np.abs(coef_l2))[::-1]

plt.figure(figsize=(14, 8))

panels = (
    (coef, indices, 'Feature Importance - No Regularization'),
    (coef_l2, indices_l2, 'Feature Importance - L2 Regularization'),
)
for panel_number, (weights, order, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_number)
    bar_positions = range(len(order))
    # Signed coefficient values are drawn, so bars can point left or right
    plt.barh(bar_positions, weights[order])
    plt.yticks(bar_positions, list(feature_names[order]))
    plt.title(panel_title)
    plt.xlabel('Coefficient Value')

plt.tight_layout()
plt.savefig('feature_importance_comparison.png')
plt.show()

# Show the first rows of the raw (unscaled) data together with the numeric
# label and a readable diagnosis (label 0 -> 'Malignant', otherwise 'Benign')
print("\nSample of the dataset:")
df_cancer = pd.DataFrame(data=X, columns=cancer.feature_names)
df_cancer['target'] = y
df_cancer['diagnosis'] = np.where(y == 0, 'Malignant', 'Benign')
print(df_cancer.head())