In [1]:
# --- Step 1: Import necessary libraries ---
import pandas as pd
import numpy as np

In [2]:
# --- Step 2: Define column names ---
# Header labels for the raw data file, listed in file column order.
# (Names follow the UCI dataset documentation.)
column_names = [
    'id',
    'clump_thickness',
    'uniformity_of_cell_size',
    'uniformity_of_cell_shape',
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'class',
]


In [3]:
# --- Step 3: Load the dataset ---
# The path is relative to the notebook's current location.
data_path = '../dataset/breast-cancer-wisconsin.data'

try:
    # Column labels are supplied via `names`, so every line of the file is read as data.
    df = pd.read_csv(data_path, names=column_names)
except FileNotFoundError:
    print(f"Error: The file '{data_path}' was not found. Please ensure it's in the correct 'dataset' folder.")
    print("Your current working directory for this notebook is likely in 'notebooks/'.")
    print("Check if the file is in 'classification_project/dataset/breast-cancer-wisconsin.data'")
    # Re-raise rather than calling exit(): in a notebook exit() ends the kernel
    # session instead of reporting a failure, while re-raising halts "Run All"
    # at this cell with the original traceback and leaves the kernel usable.
    raise
else:
    print("Dataset loaded successfully!")

Dataset loaded successfully!


In [4]:
# --- Step 4: Initial Data Inspection ---
# Show the heading and the first rows of the frame in a single print call.
print("\n--- First 5 rows of the dataset ---", df.head(), sep="\n")


--- First 5 rows of the dataset ---
        id  clump_thickness  uniformity_of_cell_size  \
0  1000025                5                        1   
1  1002945                5                        4   
2  1015425                3                        1   
3  1016277                6                        8   
4  1017023                4                        1   

   uniformity_of_cell_shape  marginal_adhesion  single_epithelial_cell_size  \
0                         1                  1                            2   
1                         4                  5                            7   
2                         1                  1                            2   
3                         8                  1                            3   
4                         1                  3                            2   

  bare_nuclei  bland_chromatin  normal_nucleoli  mitoses  class  
0           1                3                1        1      2  
1          10      

In [5]:
# Summary statistics from DataFrame.describe(), printed under a heading.
print("\n--- Descriptive Statistics ---", df.describe(), sep="\n")


--- Descriptive Statistics ---
                 id  clump_thickness  uniformity_of_cell_size  \
count  6.990000e+02       699.000000               699.000000   
mean   1.071704e+06         4.417740                 3.134478   
std    6.170957e+05         2.815741                 3.051459   
min    6.163400e+04         1.000000                 1.000000   
25%    8.706885e+05         2.000000                 1.000000   
50%    1.171710e+06         4.000000                 1.000000   
75%    1.238298e+06         6.000000                 5.000000   
max    1.345435e+07        10.000000                10.000000   

       uniformity_of_cell_shape  marginal_adhesion  \
count                699.000000         699.000000   
mean                   3.207439           2.806867   
std                    2.971913           2.855379   
min                    1.000000           1.000000   
25%                    1.000000           1.000000   
50%                    1.000000           1.000000   
75% 

In [6]:
print("\n--- Dataset Info (data types, non-null counts) ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line to the output.
df.info()


--- Dataset Info (data types, non-null counts) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id                           699 non-null    int64 
 1   clump_thickness              699 non-null    int64 
 2   uniformity_of_cell_size      699 non-null    int64 
 3   uniformity_of_cell_shape     699 non-null    int64 
 4   marginal_adhesion            699 non-null    int64 
 5   single_epithelial_cell_size  699 non-null    int64 
 6   bare_nuclei                  699 non-null    object
 7   bland_chromatin              699 non-null    int64 
 8   normal_nucleoli              699 non-null    int64 
 9   mitoses                      699 non-null    int64 
 10  class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB
None


In [7]:
# Tally how many rows carry each raw label in the 'class' column.
print("\n--- Class Distribution ---", df['class'].value_counts(), sep="\n")


--- Class Distribution ---
2    458
4    241
Name: class, dtype: int64


In [8]:
# --- Step 5: Handle Missing Values in 'bare_nuclei' ---
# The raw file marks missing 'bare_nuclei' entries with the placeholder '?'.
# Swap that placeholder for NaN so pandas recognises the entries as missing.
df['bare_nuclei'] = df['bare_nuclei'].replace(to_replace='?', value=np.nan)

In [9]:
# Cast 'bare_nuclei' to a numeric dtype so it can be imputed afterwards.
# With errors='coerce', any entry that cannot be parsed as a number becomes NaN
# instead of raising.
df['bare_nuclei'] = pd.to_numeric(
    df['bare_nuclei'],
    errors='coerce',
)

In [10]:
# Report the per-column count of NaN entries after the '?' cleanup and conversion.
print(
    "\n--- Missing values after '?' replacement and type conversion ---",
    df.isnull().sum(),
    sep="\n",
)


--- Missing values after '?' replacement and type conversion ---
id                              0
clump_thickness                 0
uniformity_of_cell_size         0
uniformity_of_cell_shape        0
marginal_adhesion               0
single_epithelial_cell_size     0
bare_nuclei                    16
bland_chromatin                 0
normal_nucleoli                 0
mitoses                         0
class                           0
dtype: int64


In [11]:
# Impute missing values with the median (a robust choice for potentially skewed data)
median_bare_nuclei = df['bare_nuclei'].median()
# Assign the filled Series back to the column instead of calling
# fillna(..., inplace=True) on df['bare_nuclei']. The in-place form operates on
# an intermediate Series (chained assignment); recent pandas versions warn about
# it, and under Copy-on-Write it leaves `df` unchanged.
df['bare_nuclei'] = df['bare_nuclei'].fillna(median_bare_nuclei)

print(f"\nMissing values in 'bare_nuclei' filled with median: {median_bare_nuclei}")
print("--- Missing values after imputation ---")
print(df.isnull().sum())


Missing values in 'bare_nuclei' filled with median: 1.0
--- Missing values after imputation ---
id                             0
clump_thickness                0
uniformity_of_cell_size        0
uniformity_of_cell_shape       0
marginal_adhesion              0
single_epithelial_cell_size    0
bare_nuclei                    0
bland_chromatin                0
normal_nucleoli                0
mitoses                        0
class                          0
dtype: int64


In [12]:
# --- Step 6: Drop the 'id' column ---
# The 'id' column is a unique identifier and not a predictive feature.
# errors='ignore' keeps this cell safe to re-run: once 'id' is gone, a second
# execution is a no-op instead of raising KeyError. The result is rebound to
# `df` rather than mutating the frame in place.
df = df.drop(columns='id', errors='ignore')
print("\n'id' column dropped.")
print(df.head())


'id' column dropped.
   clump_thickness  uniformity_of_cell_size  uniformity_of_cell_shape  \
0                5                        1                         1   
1                5                        4                         4   
2                3                        1                         1   
3                6                        8                         8   
4                4                        1                         1   

   marginal_adhesion  single_epithelial_cell_size  bare_nuclei  \
0                  1                            2          1.0   
1                  5                            7         10.0   
2                  1                            2          2.0   
3                  1                            3          4.0   
4                  3                            2          1.0   

   bland_chromatin  normal_nucleoli  mitoses  class  
0                3                1        1      2  
1                3                

In [13]:
# --- Step 8: Transform the 'class' target variable to 0 and 1 ---
# 2 = benign (map to 0)
# 4 = malignant (map to 1)
# The identity entries for 0 and 1 make this cell idempotent: re-running it on
# already-mapped labels leaves them unchanged instead of turning them into NaN.
# Any other label still maps to NaN, exactly as with the plain {2: 0, 4: 1} mapping.
df['class'] = df['class'].map({2: 0, 4: 1, 0: 0, 1: 1})
print("\n--- Class distribution after mapping (0 for benign, 1 for malignant) ---")
print(df['class'].value_counts())


print("\n--- Preprocessing steps completed. Data is now clean and ready for splitting and scaling. ---")
print(df.head())
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" line that print(df.info()) would emit.
df.info()


--- Class distribution after mapping (0 for benign, 1 for malignant) ---
0    458
1    241
Name: class, dtype: int64

--- Preprocessing steps completed. Data is now clean and ready for splitting and scaling. ---
   clump_thickness  uniformity_of_cell_size  uniformity_of_cell_shape  \
0                5                        1                         1   
1                5                        4                         4   
2                3                        1                         1   
3                6                        8                         8   
4                4                        1                         1   

   marginal_adhesion  single_epithelial_cell_size  bare_nuclei  \
0                  1                            2          1.0   
1                  5                            7         10.0   
2                  1                            2          2.0   
3                  1                            3          4.0   
4                 