# Classification of malignant and benign tumors using LogisticRegression

In [32]:
import pandas as pd
import numpy as np
from mllab.LogisticRegression import LogisticRegression
from mllab.preprocessing import normalize as nm
import seaborn as sn
import matplotlib.pyplot as plt
# Alternatively, import the preprocessing module under an alias and call normalize through it:
#from supertring import preprocessing as pp
#pp.normalize(inputX)

In [2]:
# Load the tumor dataset; the path is relative to the notebook's working directory.
cancer = pd.read_csv('datasets/logistic_regression_cancer.csv')

In [3]:
# Preview the first rows of the raw table.
cancer.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
# Turn the diagnosis labels into a pandas categorical column.
cancer["diagnosis"] = cancer["diagnosis"].astype('category')

In [5]:
# Replace each label with its integer category code (in the previews, 'M' becomes 1).
# Not re-runnable as-is: after this assignment the column is no longer categorical,
# so the `.cat` accessor is unavailable on a second run.
cancer['diagnosis'] = cancer['diagnosis'].cat.codes

In [6]:
# Discard the identifier column and the trailing unnamed column (empty in the preview rows).
cancer = cancer.drop(columns=['id', 'Unnamed: 32'])

In [7]:
# Preview again after encoding the labels and dropping the unused columns.
cancer.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
# Features: every column except the label.
X = cancer.drop(columns=['diagnosis'])
# Labels: kept as a one-column DataFrame named 'diagnosis'.
Y = cancer[['diagnosis']]

__Normalization of the input features (applied to the whole dataset, before the train/test split)__

In [9]:
# Scale the features with mllab's `normalize` helper (imported as `nm`).
# NOTE(review): this runs on the full feature table before the train/test split; if `nm`
# derives its scaling statistics from the data, the test rows contribute to them — confirm.
X_norm = nm(X)

In [10]:
# Sequential hold-out split: the first `n_train` rows are used for training,
# every remaining row is used for testing.
n_train = 400

# training datasets
trainx, trainy = X_norm[:n_train], Y[:n_train]

# test datasets
testx, testy = X_norm[n_train:], Y[n_train:]

In [47]:
# Check the size of the held-out feature set.
testx.shape

(169, 30)

In [11]:
# Create the mllab LogisticRegression model with its constructor defaults.
logreg = LogisticRegression()

__Initialize optimizer, loss function and activation function for the algorithm__
* This is optional
* If not initialized, gradient_descent, binary_cross_entropy and sigmoid are used as the defaults

In [12]:
# Positional arguments, in order: optimizer, loss function, activation function
# (order taken from the markdown note above — TODO confirm against mllab's `compile` signature).
logreg.compile('gradient_descent','binary_cross_entropy','sigmoid')

__Training input x and y are mandatory__
* __optional parameters__
    * weights.........: default : np.array([np.zeros(int)]), 
    * bias...............: default : 0
    * lr....................: default : 1e-3
    * batch_size....: default : 1
    * epoches........: default : 100
    

* __returns:  weight, loss_history__

In [13]:
# Start from an all-zero weight row with 30 entries and a zero bias.
initial_weights = np.zeros((1, 30))

weight, loss_history = logreg.train(
    trainx,
    trainy,
    weights=initial_weights,
    bias=0,
    lr=1e-5,
    batch_size=1,
    epoches=1000,
)

__Infer takes testx and testy__

* __returns: predicted values__

In [25]:
# Predict labels for the held-out rows.
# NOTE(review): `infer` is also handed the true labels; what it does with them is not visible here.
pred_y = logreg.infer(testx, testy)

In [26]:
from sklearn import metrics

# Share of held-out rows whose predicted label equals the true label, printed as a percentage.
accuracy = metrics.accuracy_score(testy, pred_y)
print('Accuracy: %2.2f %%' % (100. * accuracy))

Accuracy: 97.63 %


__Confusion Matrix__

In [43]:
# Flatten the true labels and the predictions into 1-D arrays for the tabulation cells below.
testy = np.array(testy).ravel()
predy = np.array(pred_y).ravel()

In [1]:
# Needs the 1-D `testy` / `predy` arrays created in the previous cell.
data = {'actual' : testy,
        'pred' : predy}

df = pd.DataFrame(data, columns=['actual', 'pred'])
# Cross-tabulate true labels (rows) against predictions (columns);
# margins=True appends the "All" row/column totals.
confusion_matrix = pd.crosstab(df['actual'], df['pred'], rownames=['actual'], colnames=['predicted'], margins=True)

# annot=True writes each count into its cell and fmt='d' renders it as an integer.
# The earlier `cmp=True` is not a heatmap option: seaborn forwards unknown keywords
# to matplotlib, which rejects them.
sn.heatmap(confusion_matrix, annot=True, fmt='d')
plt.show()

NameError: name 'testy' is not defined

In [46]:
# Tally the four outcomes of a binary classifier, treating label 1 as the positive class.
tp = tn = 0
fp = fn = 0

for actual_value, predicted_value in zip(testy, predy):
    is_correct = predicted_value == actual_value
    if predicted_value == 1:
        # predicted positive: right -> true positive, wrong -> false positive
        if is_correct:
            tp += 1
        else:
            fp += 1
    else:
        # predicted negative: right -> true negative, wrong -> false negative
        if is_correct:
            tn += 1
        else:
            fn += 1

# Rows are the actual class (0, 1), columns the predicted class (0, 1);
# a numpy array is used so the notebook prints it as a matrix.
our_confusion_matrix = np.array([
    [tn, fp],
    [fn, tp],
])
our_confusion_matrix

array([[126,   4],
       [  0,  39]])

In [48]:
currentDataClass = [1, 3, 3, 2, 5, 5, 3, 2, 1, 4, 3, 2, 1, 1, 2]
predictedClass = [1, 2, 3, 4, 2, 3, 3, 2, 1, 2, 3, 1, 5, 1, 1]

def comp_confmat(actual, predicted):
    """Build a multi-class confusion matrix.

    Parameters
    ----------
    actual : array-like
        True class label of every sample.
    predicted : array-like
        Predicted class label of every sample, in the same order as `actual`.

    Returns
    -------
    numpy.ndarray
        Square float matrix whose entry ``[i, j]`` is the number of samples with
        true class ``classes[i]`` and predicted class ``classes[j]``, where
        ``classes`` is the sorted set of labels found in either input.

    Raises
    ------
    ValueError
        If `actual` and `predicted` do not contain the same number of samples.
    """
    # Work on flat arrays so the element-wise comparisons below do not depend on
    # how a plain list happens to compare against a numpy scalar.
    actual = np.asarray(actual).ravel()
    predicted = np.asarray(predicted).ravel()

    if actual.size != predicted.size:
        # Without this check a length-1 input would silently broadcast.
        raise ValueError(
            "actual and predicted must have the same number of samples "
            "(got %d and %d)" % (actual.size, predicted.size)
        )

    # Take the labels from BOTH inputs: a label that only ever appears among the
    # predictions still needs its own column, otherwise those samples are lost.
    classes = np.unique(np.concatenate([actual, predicted]))
    matrix = np.zeros((len(classes), len(classes)))

    for i, actual_class in enumerate(classes):
        for j, predicted_class in enumerate(classes):
            matrix[i, j] = np.sum((actual == actual_class) & (predicted == predicted_class))

    return matrix

comp_confmat(currentDataClass, predictedClass)




array([[3., 0., 0., 0., 1.],
       [2., 1., 0., 1., 0.],
       [0., 1., 3., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 1., 0., 0.]])