In [1]:
import numpy as np
import pandas
import matplotlib as mat
import matplotlib.pyplot as plt
import seaborn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier

import kagglehub
from kagglehub import KaggleDatasetAdapter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
##################################################
##### STEP 1: PREPARING FEATURES AND TARGET ######
##################################################

file_path = "breast-cancer.csv"

# Load the dataset
data_frame = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "yasserh/breast-cancer-dataset",
  file_path,
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

# print("First 5 records:\n", data_frame.head())

data_frame.dropna(inplace=True)

X = data_frame.drop(columns=["id", "diagnosis"])
y = data_frame["diagnosis"] # if this doesn't work use y = data_frame.drop(columns=X.columns)
# y = pd.Seriesy.str.replace("M", "1").str.replace("B", "0")
y.replace("M", "1", inplace=True)
y.replace("B", "0", inplace=True)

y = (pandas.DataFrame)(y)

print(f"X: {X}")
print(f"y: {y}")

X:      radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     compactness_mean  concavity_mean  concave points_mean  

In [3]:
##################################################
############ STEP 2: TRAIN-TEST SPLIT ############
##################################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [4]:
##################################################
############# STEP 3: NORMALIZATION ##############
##################################################

scaler = StandardScaler().fit(X_train, y_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(f"X_train: {X_train}")
print(f"y_train: {y_train}")
print(f"X_val: {X_test}")
print(f"y_val: {y_test}")

X_train: [[-0.60962129  1.28536633 -0.59518281 ...  0.08359549 -0.47405963
  -0.05694559]
 [ 0.71545732  0.5235626   0.72119446 ...  1.02573644  3.19483073
   0.15479597]
 [-0.14654119 -1.42249968 -0.17907956 ... -0.52763965 -0.82895671
   0.20191124]
 ...
 [-0.55382851 -1.39248922 -0.53562372 ... -0.5074399   0.93580548
   0.49180871]
 [-0.59009381 -0.09742287 -0.63367337 ... -0.98452404 -0.01868941
  -0.15006753]
 [-1.33074301  0.54895605 -1.32488286 ... -1.18852608 -0.03165368
  -0.02756783]]
y_train:     diagnosis
445         0
35          1
220         0
207         1
96          0
..        ...
40          1
190         1
130         0
346         0
416         0

[426 rows x 1 columns]
X_val: [[ 1.15343067 -0.17129475  1.06558371 ...  0.65258078  0.50960488
  -0.7847379 ]
 [-0.63751768 -0.22439016 -0.60693256 ... -0.0150903   0.81912699
   0.0062443 ]
 [ 0.17705696 -1.06699126  0.09562152 ... -0.5432135  -0.9035013
  -1.22595852]
 ...
 [-1.09222887 -1.63949832 -1.0757071  ... -0

## Why might imbalanced data be a problem?

Imbalanced data can result in the model overfitting to the training data, as it develops a significant bias.

In [5]:
##################################################
######## STEP 4: HANDLING CLASS IMBALANCE ########
##################################################

X_resampled, y_resampled = SMOTE().fit_resample(X, y)
X_resampled

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.990000,10.380000,122.800000,1001.000000,0.118400,0.277600,0.300100,0.147100,0.241900,0.078710,...,25.380000,17.330000,184.600000,2019.000000,0.162200,0.665600,0.711900,0.265400,0.460100,0.118900
1,20.570000,17.770000,132.900000,1326.000000,0.084740,0.078640,0.086900,0.070170,0.181200,0.056670,...,24.990000,23.410000,158.800000,1956.000000,0.123800,0.186600,0.241600,0.186000,0.275000,0.089020
2,19.690000,21.250000,130.000000,1203.000000,0.109600,0.159900,0.197400,0.127900,0.206900,0.059990,...,23.570000,25.530000,152.500000,1709.000000,0.144400,0.424500,0.450400,0.243000,0.361300,0.087580
3,11.420000,20.380000,77.580000,386.100000,0.142500,0.283900,0.241400,0.105200,0.259700,0.097440,...,14.910000,26.500000,98.870000,567.700000,0.209800,0.866300,0.686900,0.257500,0.663800,0.173000
4,20.290000,14.340000,135.100000,1297.000000,0.100300,0.132800,0.198000,0.104300,0.180900,0.058830,...,22.540000,16.670000,152.200000,1575.000000,0.137400,0.205000,0.400000,0.162500,0.236400,0.076780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709,16.632198,21.277858,109.134613,861.165496,0.095389,0.125658,0.128829,0.061178,0.188023,0.056134,...,19.971385,28.302396,132.743781,1228.034613,0.155286,0.354699,0.499388,0.176586,0.450693,0.082716
710,20.556514,20.026690,133.237155,1324.426611,0.086280,0.085104,0.102297,0.073144,0.173018,0.056501,...,24.873120,26.550035,159.676603,1937.119331,0.123463,0.206717,0.280485,0.179324,0.267965,0.087730
711,17.442338,38.154227,113.282255,921.103687,0.098064,0.127193,0.138358,0.086616,0.181279,0.059408,...,22.449165,43.853468,141.154210,1409.831589,0.137762,0.365860,0.328339,0.206064,0.287675,0.084354
712,18.293760,18.632329,118.618045,1038.473754,0.087970,0.089556,0.087340,0.061994,0.165456,0.054681,...,21.405636,26.114593,139.506758,1423.533459,0.127009,0.250238,0.359484,0.160799,0.313490,0.071654


In [6]:
##################################################
###### STEP 5: VISUALIZE CLASS DISTRIBUTIONS #####
##################################################

# plt.figure(figsize=(12, 5))
figure, axes = plt.subplots(X_train)
plt.legend()
plt.show()

ValueError: Number of rows must be a positive integer, not array([[-0.60962129,  1.28536633, -0.59518281, ...,  0.08359549,
        -0.47405963, -0.05694559],
       [ 0.71545732,  0.5235626 ,  0.72119446, ...,  1.02573644,
         3.19483073,  0.15479597],
       [-0.14654119, -1.42249968, -0.17907956, ..., -0.52763965,
        -0.82895671,  0.20191124],
       ...,
       [-0.55382851, -1.39248922, -0.53562372, ..., -0.5074399 ,
         0.93580548,  0.49180871],
       [-0.59009381, -0.09742287, -0.63367337, ..., -0.98452404,
        -0.01868941, -0.15006753],
       [-1.33074301,  0.54895605, -1.32488286, ..., -1.18852608,
        -0.03165368, -0.02756783]], shape=(426, 30))

<Figure size 640x480 with 0 Axes>

# Step 6: Evaluation Metrics Reflection



In [7]:
##################################################
######### STEP 7: TRAIN-VALIDATION SPLIT #########
##################################################

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20)

In [8]:
##################################################
### STEP 8: MODEL SELECTION WITH MANUAL SEARCH ###
##################################################

# MODELS
# 1. KNN Classifier

knn = KNeighborsClassifier(n_neighbors=5)
knn_model = knn.fit(X_train, np.asarray(y_train))

# 2. Decision Tree
# 3. Support Vector Machine
# 4. Random Forest
# 5. AdaBoost

  return self._fit(X, y)
