In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
from ucimlrepo import fetch_ucirepo

# Download dataset id 17 from the UCI ML Repository.
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Feature table and target table (both pandas DataFrames).
X, y = (
    breast_cancer_wisconsin_diagnostic.data.features,
    breast_cancer_wisconsin_diagnostic.data.targets,
)

# Print the dataset-level metadata first, then the per-variable information.
for section in ("metadata", "variables"):
    print(getattr(breast_cancer_wisconsin_diagnostic, section))

{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'ID': 230, 'type': 'NATIVE', 'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'venue': 'Electronic imaging', 'year': 1993, 'journal': None, 'DOI': '1

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
confusion_matrix
)

In [4]:
# Encode the first target column as integers: 'M' -> 1 (the positive class), 'B' -> 0.
# The labels are read from the fetched dataset object instead of from `y` itself,
# so this cell can be re-run safely: once `y` is a Series, `y.iloc[:, 0]` would raise.
y = breast_cancer_wisconsin_diagnostic.data.targets.iloc[:, 0].map({'M': 1, 'B': 0})

# Series.map leaves NaN for any label missing from the dictionary; fail loudly
# here rather than letting NaN targets reach the split and the models.
assert y.notna().all(), "Found diagnosis labels other than 'M' or 'B'"

print(y.value_counts())

Diagnosis
0    357
1    212
Name: count, dtype: int64


In [5]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions similar in both partitions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Training samples:", len(X_train))
print("Test samples:", len(X_test))

Training samples: 455
Test samples: 114


In [6]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
def report_classifier(title, model, X_fit, y_fit, X_eval, y_eval):
    """Print train/test error and held-out metrics for an already fitted classifier.

    Parameters
    ----------
    title : str
        Heading printed before the metrics.
    model : estimator
        Fitted classifier exposing ``predict``.
    X_fit, y_fit
        Features and labels the model was trained on (used for the train error).
    X_eval, y_eval
        Held-out features and labels (used for every other metric).

    Returns
    -------
    tuple
        ``(fit_pred, eval_pred, fit_error, eval_error)``; each error is
        ``1 - accuracy`` on the corresponding split.
    """
    fit_pred = model.predict(X_fit)
    eval_pred = model.predict(X_eval)

    # Error rate is the complement of accuracy; accuracy is computed once per split.
    fit_error = 1 - accuracy_score(y_fit, fit_pred)
    eval_accuracy = accuracy_score(y_eval, eval_pred)
    eval_error = 1 - eval_accuracy

    print(title)
    print("Train Error:", round(fit_error, 4))
    print("Test Error:", round(eval_error, 4))
    print("Accuracy:", round(eval_accuracy, 4))
    # Precision / recall / F1 use scikit-learn's defaults (binary, positive label 1).
    print("Precision:", round(precision_score(y_eval, eval_pred), 4))
    print("Recall:", round(recall_score(y_eval, eval_pred), 4))
    print("F1-score:", round(f1_score(y_eval, eval_pred), 4))
    print("Confusion Matrix:\n", confusion_matrix(y_eval, eval_pred))

    return fit_pred, eval_pred, fit_error, eval_error


# Logistic regression is trained and evaluated on the standardized features.
log_model = LogisticRegression(max_iter=500)
log_model.fit(X_train_scaled, y_train)

# Predictions and errors are kept as globals because later cells read them.
y_train_pred_log, y_test_pred_log, train_error_log, test_error_log = report_classifier(
    "LOGISTIC REGRESSION RESULTS",
    log_model,
    X_train_scaled,
    y_train,
    X_test_scaled,
    y_test,
)

LOGISTIC REGRESSION RESULTS
Train Error: 0.0132
Test Error: 0.0351
Accuracy: 0.9649
Precision: 0.975
Recall: 0.9286
F1-score: 0.9512
Confusion Matrix:
 [[71  1]
 [ 3 39]]


In [8]:
# Fit a decision tree (default hyperparameters, fixed seed) on the unscaled features.
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the training and test splits.
y_train_pred_tree = tree_model.predict(X_train)
y_test_pred_tree = tree_model.predict(X_test)

# Error rate is the complement of accuracy on each split.
train_error_tree = 1 - accuracy_score(y_train, y_train_pred_tree)
test_error_tree = 1 - accuracy_score(y_test, y_test_pred_tree)

print("DECISION TREE RESULTS")
print("Train Error:", round(train_error_tree, 4))
print("Test Error:", round(test_error_tree, 4))

# The remaining scores are all computed on the held-out test split.
for caption, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(caption, round(scorer(y_test, y_test_pred_tree), 4))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_tree))

DECISION TREE RESULTS
Train Error: 0.0
Test Error: 0.0702
Accuracy: 0.9298
Precision: 0.9048
Recall: 0.9048
F1-score: 0.9048
Confusion Matrix:
 [[68  4]
 [ 4 38]]


In [9]:
# One row per model, with its train and test error side by side.
comparison = pd.DataFrame(
    [
        ("Logistic Regression", train_error_log, test_error_log),
        ("Decision Tree", train_error_tree, test_error_tree),
    ],
    columns=["Model", "Train Error", "Test Error"],
)

In [10]:
# Bare last expression: the notebook renders the model comparison table as this cell's output.
comparison

Unnamed: 0,Model,Train Error,Test Error
0,Logistic Regression,0.013187,0.035088
1,Decision Tree,0.0,0.070175
