In [17]:
import sys

In [6]:
import numpy as np
import pandas as pd

def load_data(path=None):
    """Load the breast-cancer CSV and split it into features and labels.

    The file is read without a header row. Single quotes are removed from
    every cell, the last column is taken as the label and all remaining
    columns as the features.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. When omitted, the file
        ``Breast_Cancer_dataset.txt`` inside ``../Breast Cancer dataset`` is
        used (the path is built with ``os.path.join`` so it resolves on any OS).

    Returns
    -------
    X : numpy.ndarray
        2-D array of ``str`` holding every column except the last.
    Y : numpy.ndarray
        1-D object array holding the last column (quotes removed).
    """
    import os

    if path is None:
        # Same location as before, but without the invalid "\B" escape and
        # without hard-coding the Windows path separator.
        path = os.path.join('..', 'Breast Cancer dataset', 'Breast_Cancer_dataset.txt')

    data = pd.read_csv(path, header=None)

    # Cast each column to str before stripping quotes: pandas may parse an
    # unquoted column as numeric, and numbers have no ``.replace`` method.
    cleaned = data.apply(
        lambda col: col.astype(str).str.replace("'", "", regex=False)
    )

    # retrieve numpy array
    dataset = cleaned.to_numpy(dtype=object)

    X = dataset[:, :-1].astype(str)
    Y = dataset[:, -1]

    return X, Y


In [7]:
# train_test_split is imported from scikit-learn directly: the former
# tensorflow.contrib path was a private module and no longer exists in
# TensorFlow 2.x.
from sklearn.model_selection import train_test_split

# load the dataset
X, Y = load_data()
# split into train and test sets: 30% of the rows are held out for testing and
# the fixed random_state makes the split reproducible between runs
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.30, random_state=1)


In [8]:
def features(path=None):
    """Return the feature (column) names declared in the ARFF-style header.

    Every line whose first whitespace-separated token is ``@attribute``
    contributes its second token as a name. Single quotes are removed from
    the names, and the last declared attribute is dropped (callers use the
    result as column names for the feature matrix, which excludes the final
    label column).

    Parameters
    ----------
    path : str, optional
        Header file to parse. When omitted, the file ``breast_cancer`` inside
        ``../Breast Cancer dataset`` is used.

    Returns
    -------
    list of str
        Attribute names in file order, without the last one. Empty when the
        file declares fewer than two attributes.
    """
    import os

    if path is None:
        path = os.path.join('..', 'Breast Cancer dataset', 'breast_cancer')

    names = []
    with open(path, 'r', encoding="utf-8") as filehandle:
        for line in filehandle:
            tokens = line.split()
            # Compare the first token instead of using str.lstrip, which
            # strips a *set of characters* rather than a prefix; this also
            # skips comment lines that merely mention "@attribute".
            if len(tokens) >= 2 and tokens[0] == '@attribute':
                names.append(tokens[1].replace("'", ""))

    # the final attribute is not a feature, so leave it out
    return names[:-1]


In [9]:
# prepare input data
def prepare_inputs(xtrain, xtest, feature_names=None):
    """One-hot encode the train and test feature matrices with a shared layout.

    Both inputs are wrapped in DataFrames, every column is converted to a
    categorical whose categories are the values seen in train *and* test
    combined, and the frames are expanded with ``pd.get_dummies``. Sharing
    the categories guarantees that both encoded matrices have the same
    columns in the same order, even when a value occurs in only one split.

    Parameters
    ----------
    xtrain, xtest : array-like, 2-D
        Raw feature values; both must have one column per feature name.
    feature_names : sequence of str, optional
        Column names to assign. When omitted they are read once via
        ``features()``.

    Returns
    -------
    xtrain_enc, xtest_enc : numpy.ndarray
        ``uint8`` 0/1 matrices with identical column counts.

    Raises
    ------
    ValueError
        If the two encoded matrices end up with different widths.
    """
    # Resolve the names once; previously the header file was re-read for
    # every use inside this function.
    names = list(features()) if feature_names is None else list(feature_names)

    # convert to pd dataframes
    xtrain = pd.DataFrame(xtrain, index=None)
    xtest = pd.DataFrame(xtest, index=None)

    xtrain.columns = names
    xtest.columns = names

    all_data = pd.concat([xtrain, xtest])

    for column in names:
        # ``astype('category', categories=...)`` and ``np.object`` were removed
        # from pandas / NumPy; an explicit CategoricalDtype is the supported way
        # to fix the category set.
        shared = pd.CategoricalDtype(categories=all_data[column].dropna().unique())
        xtrain[column] = xtrain[column].astype(shared)
        xtest[column] = xtest[column].astype(shared)

    # dtype is pinned so the result is a numeric 0/1 matrix regardless of the
    # pandas version's default dummy dtype.
    xtrain_enc = pd.get_dummies(xtrain, columns=names, dtype=np.uint8)
    xtest_enc = pd.get_dummies(xtest, columns=names, dtype=np.uint8)

    # convert back to np array
    xtrain_enc = xtrain_enc.to_numpy()
    xtest_enc = xtest_enc.to_numpy()

    # explicit form of the shape check the former np.reshape call performed
    if xtest_enc.shape[-1] != xtrain_enc.shape[-1]:
        raise ValueError(
            'encoded train/test widths differ: %d vs %d'
            % (xtrain_enc.shape[-1], xtest_enc.shape[-1])
        )

    return xtrain_enc, xtest_enc




In [10]:
# prepare target
def prepare_targets(ytrain, ytest):
    """Encode the class labels of both splits as integers.

    A ``LabelEncoder`` is fitted on ``ytrain`` only and then applied to
    ``ytrain`` and ``ytest``.

    Parameters
    ----------
    ytrain, ytest : array-like, 1-D
        Raw class labels of the training and test rows.

    Returns
    -------
    tuple
        ``(ytrain_enc, ytest_enc)`` as produced by ``LabelEncoder.transform``.
    """
    from sklearn.preprocessing import LabelEncoder

    # the mapping is learned from the training labels alone
    encoder = LabelEncoder().fit(ytrain)

    return encoder.transform(ytrain), encoder.transform(ytest)

In [11]:
# prepare input data: one-hot encode the feature columns of both splits
# (prepare_inputs expands them with pd.get_dummies and returns numpy arrays)
xtrain_enc, xtest_enc = prepare_inputs(xtrain, xtest)
# prepare output data: map the class labels to integers with a LabelEncoder
# that prepare_targets fits on the training labels
ytrain_enc, ytest_enc = prepare_targets(ytrain, ytest)

  


In [12]:
class nn:
    """Single-hidden-layer sigmoid network for binary classification.

    The network has no bias terms: ``input -> sigmoid(hidden) -> sigmoid(1 output)``.
    It is trained with full-batch gradient descent on the error
    ``target - output``.

    Constructing an instance seeds NumPy's global RNG with 4, initialises the
    weights uniformly in [-1, 1), trains for ``epochs`` iterations and then
    evaluates on the test data; both steps print their result.

    Parameters
    ----------
    xtrain, xtest : array-like, shape (n_samples, n_features)
        Numeric feature matrices.
    ytrain, ytest : array-like, shape (n_samples,)
        Binary labels (0 or 1).
    hidNodes : int
        Number of hidden units.
    lrRate : float
        Learning rate applied to each weight update.
    epochs : int
        Number of full-batch training iterations.
    """

    def __init__(self, xtrain, ytrain, xtest, ytest, hidNodes=12, lrRate=0.1, epochs=500):
        self.ytrain = ytrain
        self.xtrain = xtrain

        self.ytest = ytest
        self.xtest = xtest

        # params
        np.random.seed(4)
        self.input_nodes = len(xtrain[0])
        self.hid_nodes = hidNodes
        # one output unit: the network predicts a single 0/1 label per sample
        # (it was previously sized by the number of training samples)
        self.output_nodes = 1
        self.lrRate = lrRate

        # init weights
        self.w1 = 2 * np.random.random((self.input_nodes, self.hid_nodes)) - 1
        self.w2 = 2 * np.random.random((self.hid_nodes, self.output_nodes)) - 1

        self.train(epochs)
        self.test()

    # activation func

    def sigmoid(self, x):
        """Logistic function ``1 / (1 + exp(-x))`` applied element-wise."""
        return 1 / (1 + np.exp(-x))

    def dsigmoid(self, x):
        """Sigmoid derivative expressed through the sigmoid's *output* ``x``."""
        return x * (1 - x)

    def _forward(self, inputs):
        """Return ``(hidden_activations, output_activations)`` for ``inputs``."""
        hid_layer = self.sigmoid(np.dot(inputs, self.w1))
        out_layer = self.sigmoid(np.dot(hid_layer, self.w2))
        return hid_layer, out_layer

    def train(self, epochs):
        """Run ``epochs`` full-batch gradient steps on the training data.

        Prints and returns the mean absolute error measured in the last
        epoch's forward pass (or of the untrained network if ``epochs`` is 0).
        """
        inputs = np.asarray(self.xtrain, dtype=float)
        # column vector so it lines up with the (n_samples, 1) network output
        targets = np.asarray(self.ytrain, dtype=float).reshape(-1, 1)

        err = None
        for _ in range(epochs):
            hid_layer, out_layer = self._forward(inputs)

            # backpropagation
            err = targets - out_layer

            out_layer_delta = err * self.dsigmoid(out_layer)
            # propagate the output *delta* (not the raw output) back through w2
            hid_layer_delta = out_layer_delta.dot(self.w2.T) * self.dsigmoid(hid_layer)

            # update weights: the learning rate scales only the gradient step,
            # and both layers are updated
            self.w2 = self.w2 + self.lrRate * hid_layer.T.dot(out_layer_delta)
            self.w1 = self.w1 + self.lrRate * inputs.T.dot(hid_layer_delta)

        if err is None:
            # no epoch ran; report the error of the current weights instead of
            # failing on an unbound variable
            _, out_layer = self._forward(inputs)
            err = targets - out_layer

        train_error = abs(err).mean()
        print('training error: ', train_error)
        return train_error

    def test(self):
        """Classify the test data and print/return the accuracy.

        An output of at least 0.5 is predicted as class 1, otherwise class 0.
        The predictions are kept in ``self.predictions``.
        """
        inputs = np.asarray(self.xtest, dtype=float)
        labels = np.asarray(self.ytest).ravel()

        _, out_layer = self._forward(inputs)
        self.predictions = (out_layer.ravel() >= 0.5).astype(int)

        correct = int(np.sum(self.predictions == labels))
        # divide by this instance's own test set, not a notebook-level global
        accuracy = correct / len(labels)

        print('accuracy: ', accuracy)
        return accuracy

    #TODO : plot the data and result

In [16]:
#n = nn(xtrain_enc,ytrain_enc,xtest_enc,ytest_enc,epochs = 500)