In [1]:
# Standard libraries we always include
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set(style="ticks", color_codes=True)

# Locate and load the data file
df = pd.read_csv('breast_cancer_raw.csv')
print(f'#rows={len(df)} #columns={len(df.columns)}')

# Print some info and plots to have a feeling about the dataset
print(df.dtypes)

#rows=298 #columns=10
age            float64
menopause       object
tumor-size     float64
inv-nodes      float64
node-caps       object
deg-malig        int64
breast          object
breast-quad     object
irradiat        object
recurrence      object
dtype: object


In [3]:
# Check for duplicates, this adds a new column to the dataset
df["is_duplicate"]= df.duplicated()

# Drop the duplicate rows using index - best way to drop in pandas
index_to_drop = df[df['is_duplicate']==True].index
df.drop(index_to_drop, inplace=True)

# Remove the duplicate marker column
df.drop(columns='is_duplicate', inplace=True)
print(f'#total= {len(df)}')

#total= 293


In [4]:
# Impute
df['age'] = df['age'].fillna(df['age'].mean())
df['tumor-size'] = df['tumor-size'].fillna(df['tumor-size'].mean())
df['inv-nodes'] = df['inv-nodes'].fillna(df['inv-nodes'].mean())

In [5]:
# Replace '?' with mode - value/level with highest frequency in the feature
df['node-caps'] = df['node-caps'].replace({'?':'no'})
df['breast-quad'] = df['breast-quad'].replace({'?':'left_low'})

In [6]:
# Remove that line with the incorrect age=250 and age=-5
display(df[df['age']==250])
index_to_drop = df[df['age']==250].index
df.drop(index_to_drop, inplace=True)
index_to_drop = df[df['age']==-5].index
df.drop(index_to_drop, inplace=True)

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,recurrence
10,250.0,premeno,30.0,3.0,no,2,left,right_low,yes,no-recurrence-events


In [7]:
# Let's reset the indices to the dataframe after dropping a few rows
df = df.reset_index(drop=True)

In [8]:
# pandas get_dummies function is the one-hot-encoder
def encode_onehot(_df, f):
    _df2 = pd.get_dummies(_df[f], prefix='', prefix_sep='').max(level=0, axis=1).add_prefix(f+' - ')
    df3 = pd.concat([_df, _df2], axis=1)
    df3 = df3.drop([f], axis=1)
    return df3

In [9]:
# Apply the onehot-encoding method
df_o = encode_onehot(df, 'menopause')

In [10]:
# Apply the rest of the nominal features too
df_o = encode_onehot(df_o, 'node-caps')
df_o = encode_onehot(df_o, 'breast')
df_o = encode_onehot(df_o, 'breast-quad')
df_o = encode_onehot(df_o, 'irradiat')

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split

# We will reuse the classifier function below
def rf_train_test(_X_tr, _X_ts, _y_tr, _y_ts):
    # Create a new random forest classifier, with working 4 parallel cores
    rf = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=None, n_jobs=4)
    # Train on training data
    model = rf.fit(_X_tr, _y_tr)
    # Test on training data
    y_pred = rf.predict(_X_ts)
    # Return accuracy
    return accuracy_score(_y_ts, y_pred)

In [12]:
# Prepare the input X matrix and target y vector
X = df_o.loc[:, df_o.columns != 'recurrence'].values
y = df_o.loc[:, df_o.columns == 'recurrence'].values.ravel()

In [13]:
# 80% split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=None)
rf_train_test(X_train, X_test, y_train, y_test)

0.7966101694915254

In [14]:
# Run 10 times
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=None)
    print(rf_train_test(X_train, X_test, y_train, y_test))

0.7796610169491526
0.711864406779661
0.6949152542372882
0.7627118644067796
0.8135593220338984
0.7457627118644068
0.6271186440677966
0.7288135593220338
0.8135593220338984
0.711864406779661


In [15]:
%%time
# Run 100 times and collect statistics
accuracies = []
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=None)
    accuracies += [rf_train_test(X_train, X_test, y_train, y_test)]

print(f'80% train-test split accuracy is {np.mean(accuracies):.3f} {chr(177)}{np.std(accuracies):.4f}')

80% train-test split accuracy is 0.736 ±0.0476
Wall time: 25.1 s


In [16]:
%%time
# 10-fold cross validation
accuracies = []
kf = KFold(n_splits=10,shuffle=False,random_state=None)
for train_index, test_index in kf.split(X, y):
    acc = rf_train_test(X[train_index], X[test_index], y[train_index], y[test_index])
    accuracies += [acc]

print(f'10-fold cross validation accuracy is {np.mean(accuracies):.3f} {chr(177)}{np.std(accuracies):.4f}')

10-fold cross validation accuracy is 0.732 ±0.0726
Wall time: 2.51 s


In [19]:
%%time
# 3-fold cross validation
accuracies = []
kf = KFold(n_splits=3,shuffle=False,random_state=None)
for train_index, test_index in kf.split(X, y):
    acc = rf_train_test(X[train_index], X[test_index], y[train_index], y[test_index])
    accuracies += [acc]

print(f'3-fold cross validation accuracy is {np.mean(accuracies):.3f} {chr(177)}{np.std(accuracies):.4f}')

3-fold cross validation accuracy is 0.729 ±0.0212
Wall time: 746 ms


In [17]:
%%time
def eval_classifier(X, y, niter):
    accuracies = []
    kf = StratifiedKFold(n_splits=10,shuffle=False,random_state=None)
    for train_index, test_index in kf.split(X, y):
        acc = rf_train_test(X[train_index], X[test_index], y[train_index], y[test_index])
        accuracies += [acc]

    print( (f'Stratified 10-fold cross validation accuracy is '
            f'{np.mean(accuracies):.3f} {chr(177)}{np.std(accuracies):.4f} with {niter} total iterations')
         )

eval_classifier(X, y, 100)

Stratified 10-fold cross validation accuracy is 0.731 ±0.0495 with 100 total iterations
Wall time: 2.5 s
