In [1]:
import numpy as np
import tensorflow as tf
from sklearn import preprocessing as p
import pandas as pd

In [2]:
data_source = "https://archive.ics.uci.edu/ml/machine-learning-databases/restricted/breast-cancer/breast-cancer.data"

names = ["class", "age", "menopause", "tumor-size", "inv-nodes", "node-caps", "deg-malig", "breast", "breast-quad", "irradiat"]

df = pd.read_csv(data_source, names=names)
df.dtypes

class          object
age            object
menopause      object
tumor-size     object
inv-nodes      object
node-caps      object
deg-malig       int64
breast         object
breast-quad    object
irradiat       object
dtype: object

In [3]:
df[:5]

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [15]:
df_category_only = df.drop(["irradiat"] , axis=1)
df_category_only[:5]

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low


In [17]:
for col in ["class", "age", "menopause", "tumor-size", "inv-nodes", "node-caps", "breast", "breast-quad"]:
    df_category_only[col] = df_category_only[col].astype('category')
df_category_only.dtypes

class          category
age            category
menopause      category
tumor-size     category
inv-nodes      category
node-caps      category
deg-malig         int64
breast         category
breast-quad    category
dtype: object

In [18]:
df_category_only = df_category_only.dropna()

In [19]:
df_dummies = pd.get_dummies(df_category_only, drop_first = True)
df_dummies[:5]

Unnamed: 0,deg-malig,class_recurrence-events,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,menopause_lt40,menopause_premeno,tumor-size_10-14,...,inv-nodes_6-8,inv-nodes_9-11,node-caps_no,node-caps_yes,breast_right,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up
0,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [20]:
df_dummies.shape

(286, 33)

In [21]:
x_data = df_dummies.ix[:, 1:].values
y_data = df_dummies.ix[:, 0].reshape(-1,1)

y_data.shape, x_data.shape

((286, 1), (286, 32))

In [22]:
def train_navie_bayes_possiblities(x_data, y_data):
    num_total_case = len(x_data)
    num_attributes = len(x_data[0])
    num_class_series =  np.unique(y_data, return_counts=True)
    num_class_possbilities = num_class_series[1] / num_total_case
    
    case_indexes = []
    for case in num_class_series[0]:
        case_indexes.append(np.where(y_data == case)[0])
        
    possbiliteis_vector = []
    for i, case_index in enumerate(case_indexes):
        possbiliteis_vector.append( (np.sum(x_data[case_index], axis=0)+1  ) / (num_class_series[1][i]+2) )
    return num_class_possbilities, possbiliteis_vector

def infsum(data, **args):
    return np.ma.filled(np.ma.masked_array(data,np.isinf(data)).sum(**args), fill_value=np.nan)

In [23]:
from sklearn.cross_validation import KFold
kf = KFold(len(y_data), n_folds=2)

for train_index, test_index in kf:
    x_train, x_test = x_data[train_index], x_data[test_index]
    y_train, y_test = y_data[train_index], y_data[test_index]
    
    num_class_possbilities, possbiliteis_vector = train_navie_bayes_possiblities(x_train,y_train)

    expected_possbilites = []
 
    for i, case in enumerate(num_class_possbilities): 
        sum_vector = np.log(x_test * possbiliteis_vector[i])
        sum_possbilities = infsum(sum_vector, axis=1)
        expected_possbilites.append(sum_possbilities)
    
    clasfiy_vector = np.argmax(np.array(expected_possbilites), axis=0)
    print(sum(clasfiy_vector == y_data[test_index].ravel())/ len(y_data[test_index].ravel()))

0.181818181818
0.293706293706


In [24]:
from sklearn import linear_model, datasets


for train_index, test_index in kf:
    x_train, x_test = x_data[train_index], x_data[test_index]
    y_train, y_test = y_data[train_index], y_data[test_index]

    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(x_train, y_train.ravel())
    
    print(sum(logreg.predict(x_test) == y_test.ravel()) / y_data.shape[0])

0.202797202797
0.160839160839
