# 0. Setting Up The Data

In [79]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [80]:
from ucimlrepo import fetch_ucirepo 
  
# Download the Breast Cancer Wisconsin (Diagnostic) dataset (UCI repository id 17).
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Keep the `original` pandas DataFrame of the fetched data as the working frame.
df = breast_cancer_wisconsin_diagnostic.data.original

# Print the dataset metadata, then the per-variable information, one after the other.
print(
    breast_cancer_wisconsin_diagnostic.metadata,
    breast_cancer_wisconsin_diagnostic.variables,
    sep="\n",
)


{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'ID': 230, 'type': 'NATIVE', 'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'venue': 'Electronic imaging', 'year': 1993, 'journal': None, 'DOI': '1

# 1. Business Understanding

**Problem:** Predict if a breast cancer tumor is malignant or benign based on diagnostic measurements.
**Objective:** Learn to apply the kNN algorithm to classify tumors and evaluate performance.

# 2. Data Understanding

In [81]:
# Column names, non-null counts and dtypes (info() prints its report directly).
df.info()

# Summary statistics, one row per numeric column. Left as the cell's last expression so
# the notebook renders it as a rich table instead of the plain text produced by print().
df.describe().T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  569 non-null    int64  
 1   radius1             569 non-null    float64
 2   texture1            569 non-null    float64
 3   perimeter1          569 non-null    float64
 4   area1               569 non-null    float64
 5   smoothness1         569 non-null    float64
 6   compactness1        569 non-null    float64
 7   concavity1          569 non-null    float64
 8   concave_points1     569 non-null    float64
 9   symmetry1           569 non-null    float64
 10  fractal_dimension1  569 non-null    float64
 11  radius2             569 non-null    float64
 12  texture2            569 non-null    float64
 13  perimeter2          569 non-null    float64
 14  area2               569 non-null    float64
 15  smoothness2         569 non-null    float64
 16  compactn

The dataset contains 569 instances and 32 columns:
- ID: Unique identifier
- Diagnosis: Target variable (M = Malignant, B = Benign)
- 30 numeric features: measurements of breast tumors

**Observations:**
- No null values
- Distribution: 212 malignant, 357 benign.
- Values are not normalized

# 3. Data Preparation

In [82]:
# Letter codes found in the raw Diagnosis column and their binary encoding:
# Malignant = 1, Benign = 0.
diagnosis_codes = {"M": 1, "B": 0}

# Drop the ID column. errors="ignore" keeps this cell re-runnable: on a second run the
# column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=["ID"], errors="ignore")

# Encode the diagnosis only while raw letter codes are still present. Running .map()
# again on already-encoded 0/1 values would silently replace every label with NaN.
if df["Diagnosis"].isin(list(diagnosis_codes)).any():
    df["Diagnosis"] = df["Diagnosis"].map(diagnosis_codes)
assert df["Diagnosis"].notna().all(), "unexpected Diagnosis value: expected only 'M' or 'B'"

# Split features and target variable
features = df.drop(columns=["Diagnosis"])
labels = df["Diagnosis"]

# Normalize each feature to a z-score (zero mean, unit sample standard deviation).
# NOTE(review): the statistics are computed over the full dataset; if a train/test split
# is added later, consider computing them on the training part only to avoid leakage.
features = (features - features.mean()) / features.std()
features.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
radius1,569.0,-1.311195e-16,1.0,-2.027864,-0.688779,-0.214893,0.46898,3.967796
texture1,569.0,6.243785e-17,1.0,-2.227289,-0.725325,-0.104544,0.583662,4.647799
perimeter1,569.0,-1.123881e-16,1.0,-1.982759,-0.691347,-0.235773,0.499238,3.972634
area1,569.0,-2.185325e-16,1.0,-1.453164,-0.666609,-0.294927,0.363188,5.245913
smoothness1,569.0,-8.366672e-16,1.0,-3.109349,-0.710338,-0.03486,0.63564,4.766717
compactness1,569.0,1.873136e-16,1.0,-1.608721,-0.746429,-0.221745,0.493423,4.564409
concavity1,569.0,2.497514e-17,1.0,-1.113893,-0.743094,-0.341939,0.525599,4.239858
concave_points1,569.0,-4.995028e-17,1.0,-1.26071,-0.737295,-0.397372,0.646366,3.924477
symmetry1,569.0,1.74826e-16,1.0,-2.741705,-0.702621,-0.071564,0.530313,4.480808
fractal_dimension1,569.0,4.838933e-16,1.0,-1.818265,-0.722004,-0.178123,0.470569,4.906602


# 4. Modeling

In [83]:
# Space for code or markdown cell
# Modeling code and operations here

# 5. Evaluation

In [84]:
# Space for code or markdown cell
# Rambling about how well we did here

# 6. Deployment

In [85]:
# Space for code or markdown cell
# Rambling about what we did here.