In [1]:
#Import Data and Visualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
#Load and have a brief look at the data
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,id,clump_thickness,cell_size,cell_shape,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
#Get information of data
data.shape

(699, 11)

## Data Preprocessing

#### We are classifying on class column. Let us shed more light on this

In [5]:
data['class'].value_counts()

2    458
4    241
Name: class, dtype: int64

In [6]:
#Next let us check the data types
data.dtypes

id                       int64
clump_thickness          int64
cell_size                int64
cell_shape               int64
marginal_adhesion        int64
epithelial_cell_size     int64
bare_nuclei             object
bland_chromatin          int64
nucleoli                 int64
mitoses                  int64
class                    int64
dtype: object

#### The bare nuclei field is shown as an object. This could be because it holds some non numeric values like NaN. We need to have a glance at it and see more

In [8]:
data['bare_nuclei']

0       1
1      10
2       2
3       4
4       1
       ..
694     2
695     1
696     3
697     4
698     5
Name: bare_nuclei, Length: 699, dtype: object

In [11]:
#List all non numeric rows
data[data['bare_nuclei'] == '?']

Unnamed: 0,id,clump_thickness,cell_size,cell_shape,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,nucleoli,mitoses,class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


<p> 19 rows affected out of 699. We can replace with median or mean

In [14]:
#Create a data frame from these 19 rows in order to explore them further. Where there is no digit, i returns false
digits_in_bare_nuclei = pd.DataFrame(data.bare_nuclei.str.isdigit())

In [13]:
digits_in_bare_nuclei

Unnamed: 0,bare_nuclei
0,True
1,True
2,True
3,True
4,True
...,...
694,True
695,True
696,True
697,True


In [15]:
# In order to handle '?', we change it to NaN
df = data.replace('?', np.nan)

In [16]:
#See the result
df['bare_nuclei']

0       1
1      10
2       2
3       4
4       1
       ..
694     2
695     1
696     3
697     4
698     5
Name: bare_nuclei, Length: 699, dtype: object

In [17]:
#Replace Nan with the median value
df = df.fillna(df.median())

In [21]:
df['bare_nuclei']

id                      0.0
clump_thickness         0.0
cell_size               0.0
cell_shape              0.0
marginal_adhesion       0.0
epithelial_cell_size    0.0
bare_nuclei             0.0
bland_chromatin         0.0
nucleoli                0.0
mitoses                 0.0
class                   0.0
dtype: float64


In [19]:
df.dtypes

id                       int64
clump_thickness          int64
cell_size                int64
cell_shape               int64
marginal_adhesion        int64
epithelial_cell_size     int64
bare_nuclei             object
bland_chromatin          int64
nucleoli                 int64
mitoses                  int64
class                    int64
dtype: object