In [1]:
import numpy as np
import math
from sklearn.model_selection import KFold
import random

In [4]:
#Read the data
file=open('breast-cancer.data','r')
rows=file.read()
rows=rows.split('\n')
#deleting unwanted dat strings
rows.pop()
print(rows[0:10])
print('data has been loaded successfully')

['no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no', 'no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no', 'no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no', 'no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no', 'no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no', 'no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no', 'no-recurrence-events,50-59,premeno,25-29,0-2,no,2,left,left_low,no', 'no-recurrence-events,60-69,ge40,20-24,0-2,no,1,left,left_low,no', 'no-recurrence-events,40-49,premeno,50-54,0-2,no,2,left,left_low,no', 'no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,left_up,no']
data has been loaded successfully


In [6]:
# initialising list for pre-defined values for indices
cls=['no-recurrence-events','recurrence-events']
age=['10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-99']
menopause=['lt40', 'ge40', 'premeno']
size_of_tumer=['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44','45-49', '50-54', '55-59']
inv_nodes=['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '18-20', '21-23', '24-26','27-29', '30-32', '33-35', '36-39']
node_caps=['yes','no']
breast=['left','right']
breast_quad=['left_up', 'left_low', 'right_up','right_low', 'central']

In [19]:
def data_preprocess(row):
    row_data=list(row.split(','))
    row_data[0]=cls.index(row_data[0])+1
    row_data[1]=age.index(row_data[1])+1
    row_data[2]=menopause.index(row_data[2])+1
    row_data[3]=size_of_tumer.index(row_data[3])+1
    row_data[4]=inv_nodes.index(row_data[4])+1
    
    if row_data[5] != '?':

        row_data[5]=node_caps.index(row_data[5])+1

    else:

        row_data[5]=0

    row_data[6]=int(row_data[6])

    row_data[7]=breast.index(row_data[7])+1

    if row_data[8] != '?':

        row_data[8]=breast_quad.index(row_data[8])+1

    else:

        row_data[8]=0

    row_data[9]=node_caps.index(row_data[9])+1
    return row_data

In [8]:
#preprocessing the data by row
dataset = []
for row in rows:
    dataset.append(data_preprocess(row))
dataset[0:10]

[[1, 3, 3, 7, 1, 2, 3, 1, 2, 2],
 [1, 4, 3, 5, 1, 2, 2, 2, 3, 2],
 [1, 4, 3, 5, 1, 2, 2, 1, 2, 2],
 [1, 6, 2, 4, 1, 2, 2, 2, 1, 2],
 [1, 4, 3, 1, 1, 2, 2, 2, 4, 2],
 [1, 6, 2, 4, 1, 2, 2, 1, 2, 2],
 [1, 5, 3, 6, 1, 2, 2, 1, 2, 2],
 [1, 6, 2, 5, 1, 2, 1, 1, 2, 2],
 [1, 4, 3, 11, 1, 2, 2, 1, 2, 2],
 [1, 4, 3, 5, 1, 2, 2, 2, 1, 2]]

In [9]:
#separate data by class
def separate_data_by_class(dataset):
    data_seg = {}
    for dt in dataset: 
        cls = dt[0]
        if cls not in data_seg:
            data_seg[cls] = []
        data_seg[cls].append(dt)
    return data_seg

In [21]:
# data summarization
def summarize_data(dataset):
    summary = [(np.mean(col),np.std(col),len(col)) for col in zip(*dataset)]
    del summary[0]
    return summary

In [11]:
# data summarizatioinstd by class
def summarizing_data_by_cls(dataset):
    data_seg = separate_data_by_class(dataset)
    data_summaries ={}
    for cls , row in data_seg.items():
        data_summaries[cls] = summarize_data(row)
    return data_summaries

In [12]:
# gaussian implementation
def gaussian(x,mean,std):
    try:
        gaussian=(1/(std*math.sqrt(2*math.pi)))*(math.exp(-(x-mean)**2/(2*std**2)))
        return gaussian
    except:
        return 0.01


In [13]:
# gaussian by class 
def gaussian_by_cls(data_summaries,row):
    probs={}
    no_of_rows= sum([data_summaries[x][0][2] for x in data_summaries])
    for cls,cls_summaries in data_summaries.items():
        probs[cls]= 1
        for i in range(len(cls_summaries)):
            probs[cls]+=math.log(gaussian(row[i],cls_summaries[i][0],cls_summaries[i][1]))
    return probs


In [14]:
# predicting cls row wise
def predict(cls_summaries, row):
    probs=gaussian_by_cls(cls_summaries,row)
    best_cls, best_prob = None ,-1
    for cls, prob in probs.items():
        if best_cls==None or prob>best_prob:
            best_cls,best_prob=cls,prob 
    return best_cls

In [15]:
#acuuracy of the value
def accuracy(actual,predict):
    crct_count= len([i for i,j in zip(actual,predict) if i==j])
    acc=crct_count/float(len(actual))*100
    return acc

In [16]:
#navie_bayes
def navie_bayes(train, test):
    cls_summaries = summarizing_data_by_cls(train)
    predictions=[]
    for r in test:
        predictions.append(predict(cls_summaries,row))
    return predictions
        

In [17]:
#K_folds
def K_fold_splits(dataset, no_of_splits):
    size_of_fold=int(len(dataset)/no_of_splits)
    copy_dataset= list(dataset)
    folds=[]
    for _ in range(no_of_splits):
        fold=[]
        while len(fold)<size_of_fold:
            r_idx= random.randrange(len(copy_dataset))
            fold.append(copy_dataset.pop(r_idx))
        folds.append(fold)
    return folds

In [20]:
print('dataset length:',len(dataset))
no_of_folds =10
print('In',no_of_folds,'folds test')
folds =K_fold_splits(dataset, no_of_folds)
accuracies =[]
for fold in folds:
    train = list(folds)
    train.remove (fold)
    train_set = []
    for idx in range(len(train)):
        train_set= train_set + train[idx]
    actual_set= []
    for row in fold:
        actual_set.append(row[-1])
    test_set=[]
    for row in fold:
        test_set.append(row)
    print('length of train data:',len(train_set))
    print('length of test data:',len(test_set))
    predict_set =  navie_bayes(train_set, test_set)
    accuracies.append(accuracy(actual_set,predict_set))
print('model_accuracy:', np.mean(accuracies))

dataset length: 286
In 10 folds test
length of train data: 252
length of test data: 28
length of train data: 252
length of test data: 28
length of train data: 252
length of test data: 28
length of train data: 252
length of test data: 28
length of train data: 252
length of test data: 28
length of train data: 252
length of test data: 28
length of train data: 252
length of test data: 28
length of train data: 252
length of test data: 28
length of train data: 252
length of test data: 28
length of train data: 252
length of test data: 28
model_accuracy: 76.42857142857142
