## Import libraries

In [1]:
import numpy as np
import pandas as pd 

## Load dataset for data preprocessing

In [2]:
## Read the raw breast-cancer CSV from the local Dataset folder
data = pd.read_csv(filepath_or_buffer="Dataset/breast-cancer-dataset.csv")
## Work on a copy so the frame held in `data` is left as loaded
df = data.copy(deep=True)

## Data Preprocessing
- Normalize the column names
- Drop the s/n column
- Replace special characters (#) with NaN
- Fill all NaN values
- Replace categorical columns (diagnosis_result) with integers


In [3]:
## Normalize column names: lower-case, then spaces and hyphens become underscores
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_')
    .str.replace('-', '_')
)
df.columns

Index(['s/n', 'year', 'age', 'menopause', 'tumor_size_(cm)', 'inv_nodes',
       'breast', 'metastasis', 'breast_quadrant', 'history',
       'diagnosis_result'],
      dtype='object')

In [4]:
## Strip the "_(cm)" unit suffix from the column names.
## regex=False forces a literal match: interpreted as a regular expression,
## "_(cm)" is a capture group that matches "_cm" and would leave
## "tumor_size_(cm)" unchanged. The default of `regex` differs between
## pandas versions, so it is spelled out here.
df.columns = df.columns.str.replace('_(cm)', '', regex=False)
df.columns

Index(['s/n', 'year', 'age', 'menopause', 'tumor_size', 'inv_nodes', 'breast',
       'metastasis', 'breast_quadrant', 'history', 'diagnosis_result'],
      dtype='object')

In [5]:
## Drop the serial-number column
df = df.drop(columns=['s/n'])

In [6]:
## Print the distinct values of every column to spot unusual entries
for _, values in df.items():
    print(values.unique())

['2019' '#' '2020']
[40 39 45 26 21 50 58 35 42 41 43 30 33 38 60 47 57 34 31 36 56 48 61 20
 37 27 25 17 23 59 67 46 24 49 53 28 68 64 18 14 15 54 71 55 22 65 19 52
 51 44 63 29 32 62 69 13 16 77]
[1 0]
['2' '4' '3' '1' '5' '6' '7' '10' '8' '9' '#' '14' '12']
['0' '1' '#' '3']
['Right' 'Left' '#']
['0' '1' '#']
['Upper inner' 'Upper outer' 'Lower outer' 'Lower inner' '#'
 'Upper outer ']
['0' '1' '#']
['Benign' 'Malignant']


In [7]:
## List the column names, one per line
for name in df.columns:
    print(name)

year
age
menopause
tumor_size
inv_nodes
breast
metastasis
breast_quadrant
history
diagnosis_result


In [8]:
## Replace the unusual '#' character in year with NaN
df['year'] = df['year'].replace(to_replace='#', value=np.nan)

In [9]:
## Replace the '#' character in tumor_size with NaN
df['tumor_size'] = df['tumor_size'].replace(to_replace='#', value=np.nan)

In [10]:
## Replace the '#' character in inv_nodes with NaN
df['inv_nodes'] = df['inv_nodes'].replace(to_replace='#', value=np.nan)

In [11]:
## Replace the '#' character in breast with NaN
df['breast'] = df['breast'].replace(to_replace='#', value=np.nan)

In [12]:
## Replace the '#' character in metastasis with NaN
df['metastasis'] = df['metastasis'].replace(to_replace='#', value=np.nan)

In [13]:
## Replace the '#' character in breast_quadrant with NaN
df['breast_quadrant'] = df['breast_quadrant'].replace(to_replace='#', value=np.nan)

In [14]:
## Replace the '#' character in history with NaN
df['history'] = df['history'].replace(to_replace='#', value=np.nan)

In [15]:
## Count the missing values per column
df.isna().sum()

year                1
age                 0
menopause           0
tumor_size          1
inv_nodes           1
breast              6
metastasis          1
breast_quadrant     2
history             2
diagnosis_result    0
dtype: int64

In [16]:
## Frequency of each year value (NaN rows are not counted)
df['year'].value_counts()

year
2020    111
2019    101
Name: count, dtype: int64

In [17]:
## Cast year from string to float so the remaining NaN can be handled numerically
df['year'] = df['year'].astype(float)

In [18]:
## Fill the missing year with the most frequent year.
## The column only takes the values 2019 and 2020, so the mean (about 2019.5)
## would introduce a year that does not exist; the mode keeps it a valid year.
df['year'] = df['year'].fillna(df['year'].mode().iloc[0])

In [19]:
## Frequency of each tumor_size value
df['tumor_size'].value_counts()

tumor_size
3     34
2     32
4     32
1     29
5     27
6     17
7     14
8     11
10     7
9      7
14     1
12     1
Name: count, dtype: int64

In [20]:
## Cast tumor_size from string to float
df['tumor_size'] = df['tumor_size'].astype(float)

In [21]:
## Fill the missing tumor_size with the column mean
df['tumor_size'] = df['tumor_size'].fillna(value=df['tumor_size'].mean())

In [22]:
## Frequency of each inv_nodes value
df['inv_nodes'].value_counts()

inv_nodes
0    139
1     72
3      1
Name: count, dtype: int64

In [23]:
## Cast inv_nodes from string to float
df['inv_nodes'] = df['inv_nodes'].astype(float)

In [24]:
## Fill the missing inv_nodes with 0, the most frequent value in the column
df['inv_nodes'] = df['inv_nodes'].fillna(value=0)

In [25]:
## Frequency of each breast value
df['breast'].value_counts()

breast
Left     107
Right    100
Name: count, dtype: int64

In [26]:
## Fill the missing breast entries with 'Right'
## NOTE(review): 'Left' is the more frequent value (107 vs 100) - confirm 'Right' is the intended fill
df['breast'] = df['breast'].fillna(value='Right')

In [27]:
## Frequency of each metastasis value
df['metastasis'].value_counts()

metastasis
0    139
1     73
Name: count, dtype: int64

In [28]:
## Fill the missing metastasis entries with the most frequent value and make the column numeric.
## The existing values are the strings '0' and '1', so filling with the integer 0
## would leave a column mixing str and int; fill with '0' and cast once instead.
df['metastasis'] = df['metastasis'].fillna('0').astype('int64')

In [29]:
## Frequency of each breast_quadrant value
df['breast_quadrant'].value_counts()

breast_quadrant
Upper outer     67
Lower outer     54
Upper inner     45
Lower inner     44
Upper outer      1
Name: count, dtype: int64

In [30]:
## Strip surrounding whitespace so 'Upper outer ' is merged into 'Upper outer'
## instead of surviving as a separate category, then fill the missing entries
## with the most frequent quadrant.
df['breast_quadrant'] = df['breast_quadrant'].str.strip().fillna('Upper outer')

In [31]:
## Frequency of each history value
df['history'].value_counts()

history
0    124
1     87
Name: count, dtype: int64

In [32]:
## Fill the missing history entries with the most frequent value and make the column numeric.
## The existing values are the strings '0' and '1', so filling with the integer 0
## would leave a column mixing str and int; fill with '0' and cast once instead.
df['history'] = df['history'].fillna('0').astype('int64')

In [33]:
## Confirm that no missing values remain
df.isna().sum()

year                0
age                 0
menopause           0
tumor_size          0
inv_nodes           0
breast              0
metastasis          0
breast_quadrant     0
history             0
diagnosis_result    0
dtype: int64

In [34]:
## Preview the first five rows after imputation
df.head(n=5)

Unnamed: 0,year,age,menopause,tumor_size,inv_nodes,breast,metastasis,breast_quadrant,history,diagnosis_result
0,2019.0,40,1,2.0,0.0,Right,0,Upper inner,0,Benign
1,2019.0,39,1,2.0,0.0,Left,0,Upper outer,0,Benign
2,2019.0,45,0,4.0,0.0,Left,0,Lower outer,0,Benign
3,2019.0,26,1,3.0,0.0,Left,0,Lower inner,1,Benign
4,2019.0,21,1,1.0,0.0,Right,0,Upper outer,1,Benign


In [35]:
## convert the metastasis column type to int64 (currently disabled)
## df['metastasis'] = df['metastasis'].astype('int64')

In [36]:
## convert the history column type to category (currently disabled)
## df['history'] = df['history'].astype('category')

In [37]:
## convert the year column type to int64 (currently disabled)
## df['year'] = df['year'].astype('int64')

In [38]:
## convert the menopause column type to category (currently disabled)
## df['menopause'] = df['menopause'].astype('category')

In [39]:
## convert the breast column type to category (currently disabled)
## df['breast'] = df['breast'].astype('category')

In [40]:
## Preview the target column before encoding
df['diagnosis_result'].head()

0    Benign
1    Benign
2    Benign
3    Benign
4    Benign
Name: diagnosis_result, dtype: object

In [41]:
## Class balance of the target column
df['diagnosis_result'].value_counts()

diagnosis_result
Benign       120
Malignant     93
Name: count, dtype: int64

In [42]:
## Encode the target as integers: 1 for 'Malignant', 0 for every other value
df['diagnosis_result'] = df['diagnosis_result'].eq('Malignant').astype(int)

In [43]:
## Preview the target column after encoding
df['diagnosis_result'].head()

0    0
1    0
2    0
3    0
4    0
Name: diagnosis_result, dtype: int32

In [44]:
## Same preview of the encoded target column (first five rows)
df['diagnosis_result'].head(5)

0    0
1    0
2    0
3    0
4    0
Name: diagnosis_result, dtype: int32

In [45]:
## Preview the first five rows of the fully cleaned frame
df.head(5)

Unnamed: 0,year,age,menopause,tumor_size,inv_nodes,breast,metastasis,breast_quadrant,history,diagnosis_result
0,2019.0,40,1,2.0,0.0,Right,0,Upper inner,0,0
1,2019.0,39,1,2.0,0.0,Left,0,Upper outer,0,0
2,2019.0,45,0,4.0,0.0,Left,0,Lower outer,0,0
3,2019.0,26,1,3.0,0.0,Left,0,Lower inner,1,0
4,2019.0,21,1,1.0,0.0,Right,0,Upper outer,1,0


In [46]:
## Save the cleaned breast cancer dataset.
## index=False keeps the row index out of the file; otherwise it is written as an
## extra unnamed first column that comes back as "Unnamed: 0" when the CSV is reloaded.
df.to_csv("Dataset/cleaned_breast_cancer_dataset.csv", index=False)