In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

In [3]:
col_names=['class','age','menopause','tumor_size','inv_nodes','node-caps','deg-malig','breast','breaset-quad','target']
#missing_value_formats = ["n.a.","?","NA","n/a", "na", "--"]
df = pd.read_csv('breast-cancer.data',names=col_names,na_values = "?")


In [4]:
df

Unnamed: 0,class,age,menopause,tumor_size,inv_nodes,node-caps,deg-malig,breast,breaset-quad,target
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no


# Handling Missing Values

In [5]:
#Checking for missing values
df.isnull().sum()

class           0
age             0
menopause       0
tumor_size      0
inv_nodes       0
node-caps       8
deg-malig       0
breast          0
breaset-quad    1
target          0
dtype: int64

In [6]:
#We use method = 'pad’ for taking values from the previous row.
df['node-caps'].fillna(method='pad',inplace=True)

In [7]:
#We use method = 'bfill’ for taking values from the next row.
df['breaset-quad'].fillna(method='bfill',inplace=True)

In [8]:
df.isnull().sum()

class           0
age             0
menopause       0
tumor_size      0
inv_nodes       0
node-caps       0
deg-malig       0
breast          0
breaset-quad    0
target          0
dtype: int64

In [9]:
categorical_features = [feature for feature in df.columns if df[feature].dtype=='O']
categorical_features

['class',
 'age',
 'menopause',
 'tumor_size',
 'inv_nodes',
 'node-caps',
 'breast',
 'breaset-quad',
 'target']

In [10]:
df['class'].unique()

array(['no-recurrence-events', 'recurrence-events'], dtype=object)

In [11]:
ordinal_features = [feature for feature in categorical_features if len(df[feature].unique())==2]
ordinal_features

['class', 'node-caps', 'breast', 'target']

In [12]:
df['breast'].unique()

array(['left', 'right'], dtype=object)

# Label Encoding


In [13]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for feature in ordinal_features:
    df[feature] = label_encoder.fit_transform(df[feature])
df
    

Unnamed: 0,class,age,menopause,tumor_size,inv_nodes,node-caps,deg-malig,breast,breaset-quad,target
0,0,30-39,premeno,30-34,0-2,0,3,0,left_low,0
1,0,40-49,premeno,20-24,0-2,0,2,1,right_up,0
2,0,40-49,premeno,20-24,0-2,0,2,0,left_low,0
3,0,60-69,ge40,15-19,0-2,0,2,1,left_up,0
4,0,40-49,premeno,0-4,0-2,0,2,1,right_low,0
...,...,...,...,...,...,...,...,...,...,...
281,1,30-39,premeno,30-34,0-2,0,2,0,left_up,0
282,1,30-39,premeno,20-24,0-2,0,3,0,left_up,1
283,1,60-69,ge40,20-24,0-2,0,1,1,left_up,0
284,1,40-49,ge40,30-34,3-5,0,3,0,left_low,0


# One Hot Encoding

In [16]:

for feature in categorical_features:
    if feature not in ordinal_features:
        dummy = pd.get_dummies(df[feature],drop_first=True)
        df = pd.concat([df,dummy],axis=1)
        dummies=[]
df.head()

Unnamed: 0,class,age,menopause,tumor_size,inv_nodes,node-caps,deg-malig,breast,breaset-quad,target,...,12-14,15-17,24-26,3-5,6-8,9-11,left_low,left_up,right_low,right_up
0,0,30-39,premeno,30-34,0-2,0,3,0,left_low,0,...,0,0,0,0,0,0,1,0,0,0
1,0,40-49,premeno,20-24,0-2,0,2,1,right_up,0,...,0,0,0,0,0,0,0,0,0,1
2,0,40-49,premeno,20-24,0-2,0,2,0,left_low,0,...,0,0,0,0,0,0,1,0,0,0
3,0,60-69,ge40,15-19,0-2,0,2,1,left_up,0,...,0,0,0,0,0,0,0,1,0,0
4,0,40-49,premeno,0-4,0-2,0,2,1,right_low,0,...,0,0,0,0,0,0,0,0,1,0


In [17]:
for feature in categorical_features:
    if feature not in ordinal_features:
        df = df.drop([feature],axis=1)
df.head()


Unnamed: 0,class,node-caps,deg-malig,breast,target,30-39,40-49,50-59,60-69,70-79,...,12-14,15-17,24-26,3-5,6-8,9-11,left_low,left_up,right_low,right_up
0,0,0,3,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,2,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,2,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,2,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,2,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [18]:
y=df['class']
X=df.drop('class',axis=1)

In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2,random_state=0)


In [20]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, 
                               bootstrap = True,
                               max_features = 'sqrt')
rf_model.fit(X_train,y_train)
y_pred=rf_model.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print ("Accuracy : ", 
    accuracy_score(y_test,y_pred)*100) 


[[36  2]
 [12  8]]
              precision    recall  f1-score   support

           0       0.75      0.95      0.84        38
           1       0.80      0.40      0.53        20

    accuracy                           0.76        58
   macro avg       0.78      0.67      0.69        58
weighted avg       0.77      0.76      0.73        58

Accuracy :  75.86206896551724
