# Imports

In [2]:
import pandas as pd
import numpy

In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

# Read data to dataframe

In [4]:
df = pd.read_csv("breast-cancer-wisconsin")

In [5]:
df

Unnamed: 0,id,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epi_cell_size,bare_nuclei,bland_chromation,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


# Clean up
- Check nulls
- Check dtypes
- Remove weird values from column bare_nuclei
- Change dtype of bare_nuclei from object to int64

In [6]:
# Check if there is null values
df.isnull().values.any()

False

In [7]:
# Drop missing values (there is not but lets do this anyway)
df.dropna(inplace = True)

In [8]:
# Drop column "id"
del df['id']

In [9]:
df

Unnamed: 0,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epi_cell_size,bare_nuclei,bland_chromation,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [10]:
df.dtypes

clump_thickness          int64
uniform_cell_size        int64
uniform_cell_shape       int64
marginal_adhesion        int64
single_epi_cell_size     int64
bare_nuclei             object
bland_chromation         int64
normal_nucleoli          int64
mitoses                  int64
class                    int64
dtype: object

In [11]:
# There is weird values in bare_nuclei column. Lets drop them out. (Weird value is "?")
df.drop(df.loc[df['bare_nuclei']=="?"].index, inplace=True)

In [12]:
df = df.astype({"bare_nuclei":'int64'})

# Create features and labels arrays X and y

In [13]:
x = df[df.columns[:-1]]
y = df["class"]

# Scale/Standardize the features array X (use Minmax or Standard scaling)

In [14]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(x)
print(scaled)

[[0.44444444 0.         0.         ... 0.22222222 0.         0.        ]
 [0.44444444 0.33333333 0.33333333 ... 0.22222222 0.11111111 0.        ]
 [0.22222222 0.         0.         ... 0.22222222 0.         0.        ]
 ...
 [0.44444444 1.         1.         ... 0.77777778 1.         0.11111111]
 [0.33333333 0.77777778 0.55555556 ... 1.         0.55555556 0.        ]
 [0.33333333 0.77777778 0.77777778 ... 1.         0.33333333 0.        ]]


# Transform the feature array y to a binary array 0 or 1

In [15]:
lb = preprocessing.LabelBinarizer()

In [16]:
lb.fit(y)

LabelBinarizer()

In [17]:
lb.transform(y)

array([[0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
    

# Split the arrays into training and test arrays (4 resulting arrays)


In [18]:
from sklearn.model_selection import train_test_split

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=426, test_size=143, random_state=0)