In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import edward as ed
import numpy as np
import pandas as pd
import tensorflow as tf

from edward.models import Bernoulli, MultivariateNormalTriL, Normal
from edward.util import rbf

In [2]:
ed.set_seed(42)

In [3]:
data=pd.read_csv("C:\\Users\\Rhea\\Documents\\Bayesian Modeling\\Datasets\\BreastCancerWisconsin\\breast-cancer-wisconsin.data",
names=['id','diagnosis','radius_mean','texture_mean','perimeter_mean','area_mean','smoothness_mean','compactness_mean','concavity_mean',
       'concave_points_mean','symmetry_mean','fractal_dimension_mean','radius_se','texture_se','perimeter_se','area_se','smoothness_se',
       'compactness_se','concavity_se','concave_points_se','symmetry_se','fractal_dimension_se','radius_worst','texture_worst',
       'perimeter_worst','area_worst','smoothness_worst','compactness_worst','concavity_worst','concave_points_worst',
       'symmetry_worst','fractal_dimension_worst'])

In [4]:
data.shape

(569, 32)

In [5]:
data['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

#### Convert relevant columns to categorical

In [6]:
def change_to_category(df):
    i=0;
    while i<df.columns.size:
        if ((df.dtypes[df.columns[i]] == 'object' or df.dtypes[df.columns[i]] == 'bool')):
            col = df.dtypes.index[i]
            df[col] = df[col].astype('category')
        i = i+1 

#### Generate X and y variables

In [7]:
def generateXY(df,target_col,var_list):
    X=df[varToUse]
    y=df[target_col]
    return X,y

Since we aim to classify patients as having Malignant and Benign tumor, we decide **'diagnosis'** to be our target variable.
<br>
Also, we replace 'M' with 1 and 'B' with 0, in the column

In [8]:
# data['diagnosis'] = data['diagnosis'].replace('M', 1)
# data['diagnosis'] = data['diagnosis'].replace('B', 0)
data['diagnosis'] = data['diagnosis'].map({'M':1,'B':0})

In [9]:
# Convert columns with binary values to categorical
change_to_category(data)

In [10]:
data.diagnosis.dtype

dtype('int64')

In [11]:
data.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave_points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [12]:
#drop irrelevant columns
data.drop(['id'], axis = 1, inplace = True)

In [42]:
data2 = pd.get_dummies(data)
data2.shape

(569, 31)

In [63]:
data_y[N:].shape

(169, 1)

In [73]:
data_x = data2.iloc[0:,1:].as_matrix().astype(np.float32)
data_y = data2.iloc[0:,0:1].as_matrix().astype(np.float32)

N = 300
train_x, test_x = data_x[:N], data_x[N:]
train_y, test_y = data_y[:N], data_y[N:]

in_size = train_x.shape[1]
out_size = train_y.shape[1]

EPOCH_NUM = 3

# for bayesian neural network
train_y2 = np.argmax(train_y, axis=1)
test_y2 = np.argmax(test_y, axis=1)

In [74]:
import sys
from tqdm import tqdm
import tensorflow as tf

BATCH_SIZE=300

x_ = tf.placeholder(tf.float32, shape=[None, in_size])
y_ = tf.placeholder(tf.float32, shape=[None, out_size])

w = tf.Variable(tf.truncated_normal([in_size, out_size], stddev=0.1), dtype=tf.float32)
b = tf.Variable(tf.constant(0.1, shape=[out_size]), dtype=tf.float32)
y_pre = tf.matmul(x_, w) + b

cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_pre))
train_step = tf.train.AdamOptimizer().minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_pre, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

sess = tf.Session()
sess.run(tf.global_variables_initializer())
for epoch in tqdm(range(EPOCH_NUM), file=sys.stdout):
    perm = np.random.permutation(N)
    for i in range(0, N, BATCH_SIZE):
        batch_x = train_x[perm[i:i+BATCH_SIZE]]
        batch_y = train_y[perm[i:i+BATCH_SIZE]]
        train_step.run(session=sess, feed_dict={x_: batch_x, y_: batch_y})
    acc = accuracy.eval(session=sess, feed_dict={x_: train_x, y_: train_y})
    test_acc = accuracy.eval(session=sess, feed_dict={x_: test_x, y_: test_y})
    if (epoch+1) % 1 == 0:
        tqdm.write('epoch:\t{}\taccuracy:\t{}\tvaridation accuracy:\t{}'.format(epoch+1, acc, test_acc))

epoch:	1	accuracy:	1.0	varidation accuracy:	1.0                                                                        
epoch:	2	accuracy:	1.0	varidation accuracy:	1.0                                                                        
epoch:	3	accuracy:	1.0	varidation accuracy:	1.0                                                                        
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 30.13it/s]


In [76]:
import edward as ed
from edward.models import Normal, Categorical

x_ = tf.placeholder(tf.float32, shape=(None, in_size))
y_ = tf.placeholder(tf.int32, shape=(BATCH_SIZE))

w = Normal(loc=tf.zeros([in_size, out_size]), scale=tf.ones([in_size, out_size]))
b = Normal(loc=tf.zeros([out_size]), scale=tf.ones([out_size]))
y_pre = Categorical(tf.matmul(x_, w) + b)

qw = Normal(loc=tf.Variable(tf.random_normal([in_size, out_size])), scale=tf.Variable(tf.random_normal([in_size, out_size])))
qb = Normal(loc=tf.Variable(tf.random_normal([out_size])), scale=tf.Variable(tf.random_normal([out_size])))

y = Categorical(tf.matmul(x_, qw) + qb)

inference = ed.KLqp({w: qw, b: qb}, data={y_pre: y_})
inference.initialize()

sess = tf.Session()
sess.run(tf.global_variables_initializer())

with sess:
    samples_num = 300
    for epoch in tqdm(range(EPOCH_NUM), file=sys.stdout):
        perm = np.random.permutation(N)
        for i in range(0, N, BATCH_SIZE):
            batch_x = train_x[perm[i:i+BATCH_SIZE]]
            batch_y = train_y2[perm[i:i+BATCH_SIZE]]
            inference.update(feed_dict={x_: batch_x, y_: batch_y})
        y_samples = y.sample(samples_num).eval(feed_dict={x_: train_x})
        acc = (np.round(y_samples.sum(axis=0) / samples_num) == train_y2).mean()
        y_samples = y.sample(samples_num).eval(feed_dict={x_: test_x})
        test_acc = (np.round(y_samples.sum(axis=0) / samples_num) == test_y2).mean()
        if (epoch+1) % 1 == 0:
            tqdm.write('epoch:\t{}\taccuracy:\t{}\tvaridation accuracy:\t{}'.format(epoch+1, acc, test_acc))

epoch:	1	accuracy:	1.0	varidation accuracy:	1.0                                                                        
epoch:	2	accuracy:	1.0	varidation accuracy:	1.0                                                                        
epoch:	3	accuracy:	1.0	varidation accuracy:	1.0                                                                        
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  3.52it/s]
