In [5]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

# Wisconsin Diagnostic Breast Cancer (WDBC) data. The file is read without a
# header row, so the column names are supplied here: an ID, the diagnosis
# label, and 30 anonymously numbered feature columns.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
name = ['ID', 'Diagnosis']
name.extend(f'Feature_{i}' for i in range(1, 31))
data = pd.read_csv(url, names=name, header=None)

# Encode the label numerically: 'M' -> 1 (the positive class), 'B' -> 0.
# Per the dataset description these stand for malignant and benign.
diagnosis_codes = {'M': 1, 'B': 0}
data['Diagnosis'] = data['Diagnosis'].map(diagnosis_codes)

# b is the target vector; a is the feature matrix (every column except the
# ID and the label, i.e. the 30 Feature_* columns in their original order).
b = data['Diagnosis']
a = data[name[2:]]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(a, b, test_size=0.2, random_state=42)
a_train, a_test, b_train, b_test = split_parts

def _fit_tree_and_score(train_features, test_features, train_labels, test_labels):
    """Fit the shared decision tree and score it on the held-out split.

    All experiments use the same hyperparameters (gini, depth 2, fixed
    seed) so that only the feature representation differs between them.

    Args:
        train_features: Feature matrix used for fitting.
        test_features: Feature matrix used for prediction.
        train_labels: Binary labels aligned with ``train_features``.
        test_labels: Binary labels aligned with ``test_features``.

    Returns:
        A tuple ``(model, predictions, f1, precision, recall)`` where the
        three scores compare ``predictions`` against ``test_labels``.
    """
    model = DecisionTreeClassifier(
        criterion='gini',
        min_samples_leaf=2,
        min_samples_split=5,
        max_depth=2,
        random_state=42,
    )
    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)
    f1 = f1_score(test_labels, predictions)
    precision = precision_score(test_labels, predictions)
    recall = recall_score(test_labels, predictions)
    return model, predictions, f1, precision, recall


def _project_with_pca(n_components, train_features, test_features):
    """Fit PCA on the training split and project both splits with it.

    The projection is learned from the training data only and then applied
    unchanged to the test data, so no test information reaches the fit.

    Returns:
        A tuple ``(pca, projected_train, projected_test)``.
    """
    pca = PCA(n_components=n_components)
    projected_train = pca.fit_transform(train_features)
    projected_test = pca.transform(test_features)
    return pca, projected_train, projected_test


# NOTE(review): the features are passed to PCA unscaled. PCA centers but does
# not rescale its input, so large-valued columns dominate the components.
# Standardizing first may be preferable, but it would change every number
# reported below, so it is deliberately left as-is here.

# Baseline: decision tree on all original feature columns.
c_or, b_pr_or, f1_or, pr_or, recall_original = _fit_tree_and_score(
    a_train, a_test, b_train, b_test
)

# Same tree on the first principal component only.
pca_1, a_tr_pca1, a_te_pca1 = _project_with_pca(1, a_train, a_test)
c_pca1, y_pred_pca1, f1_pca1, pr_pca1, recall_pca1 = _fit_tree_and_score(
    a_tr_pca1, a_te_pca1, b_train, b_test
)

# Same tree on the first two principal components.
pca_2, a_tr_pca2, a_te_pca2 = _project_with_pca(2, a_train, a_test)
c_pca2, y_pr_pca2, f1_pca2, pr_pca2, recall_pca2 = _fit_tree_and_score(
    a_tr_pca2, a_te_pca2, b_train, b_test
)

# Confusion matrix for the two-component PCA model only (y_pr_pca2).
# labels=[0, 1] pins the layout to [[TN, FP], [FN, TP]] even when a class is
# missing from both the true labels and the predictions; without it the matrix
# can shrink below 2x2 and the cell lookups would raise IndexError.
conf_matrix = confusion_matrix(b_test, y_pr_pca2, labels=[0, 1])

# Row-major flattening of the 2x2 matrix yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = conf_matrix.ravel()

# FPR = share of actual negatives predicted positive;
# TPR = share of actual positives predicted positive (same value as recall).
FPR = FP / (FP + TN)
TPR = TP / (TP + FN)

# Side-by-side scores for the three feature representations, 4 decimals each.
comparison_lines = [
    "Model Performance Comparison",
    f"Original Data: F1 Score = {f1_or:.4f}, Precision = {pr_or:.4f}, Recall = {recall_original:.4f}",
    f"PCA-1: F1 Score = {f1_pca1:.4f}, Precision = {pr_pca1:.4f}, Recall = {recall_pca1:.4f}",
    f"PCA-2: F1 Score = {f1_pca2:.4f}, Precision = {pr_pca2:.4f}, Recall = {recall_pca2:.4f}",
]
print("\n".join(comparison_lines))

# Confusion-matrix figures; the leading "\n" in the heading leaves a blank
# line between the two report sections.
confusion_lines = [
    "\nConfusion Matrix Results",
    f"False Positives (FP): {FP}",
    f"True Positives (TP): {TP}",
    f"False Positive Rate (FPR): {FPR:.4f}",
    f"True Positive Rate (TPR): {TPR:.4f}",
]
print("\n".join(confusion_lines))


Model Performance Comparison
Original Data: F1 Score = 0.9024, Precision = 0.9487, Recall = 0.8605
PCA-1: F1 Score = 0.9250, Precision = 1.0000, Recall = 0.8605
PCA-2: F1 Score = 0.9250, Precision = 1.0000, Recall = 0.8605

Confusion Matrix Results
False Positives (FP): 0
True Positives (TP): 37
False Positive Rate (FPR): 0.0000
True Positive Rate (TPR): 0.8605
