In [1]:
# Install dataset utility
!pip install ucimlrepo


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
import numpy as np
import pandas as pd

from ucimlrepo import fetch_ucirepo

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)


In [4]:
# Download the Breast Cancer Wisconsin (Diagnostic) dataset (UCI id 17)
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Unpack the feature table and the target table from the fetched payload
X, y = (
    breast_cancer_wisconsin_diagnostic.data.features,
    breast_cancer_wisconsin_diagnostic.data.targets,
)

# Print the dataset-level metadata followed by the per-variable description
print(
    breast_cancer_wisconsin_diagnostic.metadata,
    breast_cancer_wisconsin_diagnostic.variables,
    sep="\n",
)


{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'ID': 230, 'type': 'NATIVE', 'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'venue': 'Electronic imaging', 'year': 1993, 'journal': None, 'DOI': '1

In [5]:
# Report the dimensions of the feature table and the target table
for label, table in (("Feature shape:", X), ("Target shape:", y)):
    print(label, table.shape)

# Count how many samples belong to each target class
print("\nClass distribution:", y.value_counts(), sep="\n")


Feature shape: (569, 30)
Target shape: (569, 1)

Class distribution:
Diagnosis
B            357
M            212
Name: count, dtype: int64


In [6]:
# Hold out 20% of the samples for testing. Stratifying on the target keeps the
# class proportions equal across both splits, and the fixed seed makes the
# partition repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [7]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits so that no test-set statistics
# influence training.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)


In [8]:
# Fit logistic regression on the standardized training features.
# y_train is a single-column DataFrame (shape (n, 1)); scikit-learn expects a
# 1-D label vector and otherwise emits a DataConversionWarning, so the labels
# are flattened before fitting.
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, np.ravel(y_train))


  y = column_or_1d(y, warn=True)


In [9]:
# Predictions
y_train_pred = log_model.predict(X_train_scaled)
y_test_pred = log_model.predict(X_test_scaled)

# Errors
train_error = 1 - accuracy_score(y_train, y_train_pred)
test_error = 1 - accuracy_score(y_test, y_test_pred)

print("Logistic Regression:")
print("Train Error:", train_error)
print("Test Error:", test_error)
print("Generalization Gap:", test_error - train_error)


Logistic Regression:
Train Error: 0.01318681318681314
Test Error: 0.03508771929824561
Generalization Gap: 0.021900906111432472


In [10]:
# Per-class precision/recall/F1 for the logistic regression test predictions
print(
    "\nClassification Report (Test Set):",
    classification_report(y_test, y_test_pred),
    sep="\n",
)

# Confusion matrix (rows = true class, columns = predicted class)
print("Confusion Matrix:", confusion_matrix(y_test, y_test_pred), sep="\n")



Classification Report (Test Set):
              precision    recall  f1-score   support

           B       0.96      0.99      0.97        72
           M       0.97      0.93      0.95        42

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Confusion Matrix:
[[71  1]
 [ 3 39]]


In [11]:
# Unconstrained decision tree: no depth limit, fixed seed for repeatability
tree_params = {"max_depth": None, "random_state": 42}
tree_model = DecisionTreeClassifier(**tree_params)

# Trained on the raw (unscaled) training features
tree_model.fit(X_train, y_train)


In [12]:
# Predictions
y_train_pred_tree = tree_model.predict(X_train)
y_test_pred_tree = tree_model.predict(X_test)

# Errors
train_error_tree = 1 - accuracy_score(y_train, y_train_pred_tree)
test_error_tree = 1 - accuracy_score(y_test, y_test_pred_tree)

print("Decision Tree:")
print("Train Error:", train_error_tree)
print("Test Error:", test_error_tree)
print("Generalization Gap:", test_error_tree - train_error_tree)


Decision Tree:
Train Error: 0.0
Test Error: 0.07017543859649122
Generalization Gap: 0.07017543859649122


In [13]:
# Per-class precision/recall/F1 for the decision tree test predictions
print(
    "\nClassification Report (Test Set):",
    classification_report(y_test, y_test_pred_tree),
    sep="\n",
)

# Confusion matrix (rows = true class, columns = predicted class)
print("Confusion Matrix:", confusion_matrix(y_test, y_test_pred_tree), sep="\n")



Classification Report (Test Set):
              precision    recall  f1-score   support

           B       0.94      0.94      0.94        72
           M       0.90      0.90      0.90        42

    accuracy                           0.93       114
   macro avg       0.92      0.92      0.92       114
weighted avg       0.93      0.93      0.93       114

Confusion Matrix:
[[68  4]
 [ 4 38]]


| Model               | Train Error | Test Error      | Overfitting |
| ------------------- | ----------- | --------------- | ----------- |
| Logistic Regression | Low         | Slightly higher |  No        |
| Decision Tree       | ~0          | Much higher     |  Yes       |


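Logistic Regression shows good generalization: the training error (≈1.3%) and test error (≈3.5%) are close. As a linear model it has comparatively low variance, and on this dataset its bias is also small, so it performs well on unseen data. Feature scaling helps because the solver converges more reliably on standardized inputs and the default L2 regularization then penalizes all coefficients on a comparable scale; note that logistic regression is not a distance-based algorithm.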

Decision Tree achieves zero training error but a noticeably worse test error (≈7.0%, about twice that of Logistic Regression), indicating overfitting. The model memorizes the training data due to its high flexibility and lack of regularization (no depth limit).

# Logistic Regression

Training accuracy ≈ Test accuracy

Small generalization gap

 No overfitting

# Decision Tree

Very high training accuracy (~100%)

Lower test accuracy

Large generalization gap

 Clear overfitting

| Model               | Training Performance | Test Performance | Observation         |
| ------------------- | -------------------- | ---------------- | ------------------- |
| Logistic Regression | High                 | High (similar)   | Good generalization |
| Decision Tree       | Very High            | Lower            | Overfitting         |


# ML Issues Relevant to This Dataset

**1. Feature Scaling**

Logistic Regression benefits from standardized features: the solver converges more reliably, and large-scale variables do not dominate the regularization penalty.

**2. Class Imbalance**

There are more Benign (357) than Malignant (212) samples → accuracy alone can be misleading. Precision, Recall, and F1-score are necessary, especially for the Malignant class.

**3. Data Leakage**

Scaling before train–test split would leak test information into training.

**4. Feature Correlation**

Many size-related features (radius, perimeter, area) are highly correlated; this multicollinearity makes the linear model's coefficients unstable and harder to interpret.