In [None]:
import pandas as pd
import numpy as np

# a) Download and upload the data set breast-cancer-wisconsin.data.txt enclosed with this exercise

In [None]:
# Read the comma-separated data file into a DataFrame, then preview it.
df = pd.read_csv("breast-cancer-wisconsin.data%20%281%29.txt", sep=",")
df

Unnamed: 0,id,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epi_cell_size,bare_nuclei,bland_chromation,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [None]:
# (rows, columns) of the freshly loaded frame
df.shape

(699, 11)

In [None]:
# Per-column dtype and non-null count, to spot columns that were not parsed as numbers
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    699 non-null    int64 
 1   clump_thickness       699 non-null    int64 
 2   uniform_cell_size     699 non-null    int64 
 3   uniform_cell_shape    699 non-null    int64 
 4   marginal_adhesion     699 non-null    int64 
 5   single_epi_cell_size  699 non-null    int64 
 6   bare_nuclei           699 non-null    object
 7   bland_chromation      699 non-null    int64 
 8   normal_nucleoli       699 non-null    int64 
 9   mitoses               699 non-null    int64 
 10  class                 699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


The dataset information above shows that the `bare_nuclei` column, which is expected to be numeric, was read as `object` (non-numeric). Let's inspect its unique values to see whether the column contains an invalid entry:

In [None]:
# List the distinct raw values to find whatever is preventing numeric parsing
df['bare_nuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', '?', '5', '8', '6'],
      dtype=object)

As suspected, missing values are represented with a question mark (`"?"`). Therefore, I will first replace all occurrences of `?` with `NaN`, then convert the column to a numeric data type:

In [None]:
# Turn every '?' placeholder in the frame into NaN
df = df.replace('?', np.nan)
# Cast bare_nuclei to a numeric dtype (the column keeps its position)
df = df.assign(bare_nuclei=pd.to_numeric(df['bare_nuclei']))

# Inspect the distinct values again to confirm the conversion
df['bare_nuclei'].unique()

array([ 1., 10.,  2.,  4.,  3.,  9.,  7., nan,  5.,  8.,  6.])

In [None]:
# Re-check dtypes and non-null counts after the conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    699 non-null    int64  
 1   clump_thickness       699 non-null    int64  
 2   uniform_cell_size     699 non-null    int64  
 3   uniform_cell_shape    699 non-null    int64  
 4   marginal_adhesion     699 non-null    int64  
 5   single_epi_cell_size  699 non-null    int64  
 6   bare_nuclei           683 non-null    float64
 7   bland_chromation      699 non-null    int64  
 8   normal_nucleoli       699 non-null    int64  
 9   mitoses               699 non-null    int64  
 10  class                 699 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 60.2 KB


Perfect!

# b) Drop the missing or non-numeric values

In [None]:
# Count missing values per column
df.isna().sum()

id                       0
clump_thickness          0
uniform_cell_size        0
uniform_cell_shape       0
marginal_adhesion        0
single_epi_cell_size     0
bare_nuclei             16
bland_chromation         0
normal_nucleoli          0
mitoses                  0
class                    0
dtype: int64

* All columns are numeric
* Only `bare_nuclei` has missing records (16), and according to the instructions, the rows with a missing `bare_nuclei` value will be dropped:

In [None]:
# Remove, in place, every row that contains at least one NaN
df.dropna(axis='index', how='any', inplace=True)

# Per-column NaN counts should now all be zero
df.isnull().sum()

id                      0
clump_thickness         0
uniform_cell_size       0
uniform_cell_shape      0
marginal_adhesion       0
single_epi_cell_size    0
bare_nuclei             0
bland_chromation        0
normal_nucleoli         0
mitoses                 0
class                   0
dtype: int64

In [None]:
# Shape after dropping the incomplete rows. A bare expression is used
# (rather than print) so the cell displays like the other shape checks.
df.shape

(683, 11)


* Notice that the data has shrunk from 699 to 683 rows after removing the rows with a missing `bare_nuclei` value

# c) Drop the ID column

In [None]:
# Remove the identifier column in place
df.drop(labels='id', axis='columns', inplace=True)

# List the remaining column names
df.columns

Index(['clump_thickness', 'uniform_cell_size', 'uniform_cell_shape',
       'marginal_adhesion', 'single_epi_cell_size', 'bare_nuclei',
       'bland_chromation', 'normal_nucleoli', 'mitoses', 'class'],
      dtype='object')

# d) Create features and labels arrays X and y

In [None]:
# Names of the nine predictor columns (every remaining column except the target)
feat_cols = [
    'clump_thickness',
    'uniform_cell_size',
    'uniform_cell_shape',
    'marginal_adhesion',
    'single_epi_cell_size',
    'bare_nuclei',
    'bland_chromation',
    'normal_nucleoli',
    'mitoses',
]
# Feature matrix: one row per sample, one column per predictor
X = df.loc[:, feat_cols].to_numpy()
# Target vector taken from the 'class' column
y = df.loc[:, 'class'].to_numpy()

In [None]:
# Display the feature matrix
X

array([[ 5.,  1.,  1., ...,  3.,  1.,  1.],
       [ 5.,  4.,  4., ...,  3.,  2.,  1.],
       [ 3.,  1.,  1., ...,  3.,  1.,  1.],
       ...,
       [ 5., 10., 10., ...,  8., 10.,  2.],
       [ 4.,  8.,  6., ..., 10.,  6.,  1.],
       [ 4.,  8.,  8., ..., 10.,  4.,  1.]])

In [None]:
# Display the raw class labels
y

array([2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 4, 2, 4, 4,
       2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 4, 4, 4, 4, 4, 2,
       4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4, 2, 4,
       4, 2, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4, 2, 4,
       4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 4, 2, 4, 2, 4, 2, 2, 2, 4, 2,
       2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 4, 2, 4, 4, 2, 2, 4, 2, 2,
       4, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 4, 2, 4, 2, 4, 2, 2,
       2, 4, 4, 2, 4, 4, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2,
       2, 4, 4, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 2, 4, 2, 2, 4, 4, 4, 4, 2,
       4, 4, 2, 4, 4, 4, 2, 4, 2, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2,
       2, 4, 2, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4, 4, 2, 4, 4, 4, 2, 4, 2,
       4, 4, 2, 2, 2, 2, 4, 2, 2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 2, 4, 4, 2,
       2, 4, 4, 2, 4, 2, 4, 4, 2, 2, 4, 2, 2, 2, 4,

In [None]:
# (samples, features) of the feature matrix
X.shape

(683, 9)

In [None]:
# Distinct label values present in y
np.unique(y)

array([2, 4])

# e) Scale/Standardize the features array X (use Minmax or Standard scaling)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Build the scaler, learn each column's min/max from X, then rescale X
# (MinMaxScaler's default output range is [0, 1]).
# NOTE(review): the scaler is fit on all rows, before the train/test split
# further down — confirm this is acceptable for the exercise.
scaler = MinMaxScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Display the rescaled feature matrix
X_scaled

array([[0.44444444, 0.        , 0.        , ..., 0.22222222, 0.        ,
        0.        ],
       [0.44444444, 0.33333333, 0.33333333, ..., 0.22222222, 0.11111111,
        0.        ],
       [0.22222222, 0.        , 0.        , ..., 0.22222222, 0.        ,
        0.        ],
       ...,
       [0.44444444, 1.        , 1.        , ..., 0.77777778, 1.        ,
        0.11111111],
       [0.33333333, 0.77777778, 0.55555556, ..., 1.        , 0.55555556,
        0.        ],
       [0.33333333, 0.77777778, 0.77777778, ..., 1.        , 0.33333333,
        0.        ]])


# f) Transform the label array y to a binary array (0 or 1)

According to the data dictionary, the target class uses 2 for benign and 4 for malignant. We will transform the variable so that benign is represented by `0` and malignant by `1`:

In [None]:
# Map label 2 -> 0 and every other label (here only 4) -> 1
y_binary = (y != 2).astype(int)

In [None]:
# Display the binary labels
y_binary

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,

# g) Split the arrays into training and test arrays (4 resulting arrays)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. `stratify` keeps the 0/1 class
# proportions the same in both splits; a fixed `random_state` makes the
# shuffle (and therefore every downstream result) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_binary,
    test_size=0.2,
    stratify=y_binary,
    random_state=42,
)

In [None]:
# (training samples, features) after the 80/20 split
X_train.shape

(546, 9)