In [1]:
import pandas as pd
import numpy as np
import scipy.stats as s

In [2]:
# Load the tumor measurements from "tumorsdata.csv" (path is relative to the
# notebook's working directory) into a pandas DataFrame.
data = pd.read_csv("tumorsdata.csv")

In [3]:
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
# Keep a reference to the column index as it is right after loading, before
# any column is dropped; later cells look columns up by position in it.
column_names = data.columns

In [5]:
# Remove the two columns that carry no features: the first column (`id`) and
# the 33rd column (`Unnamed: 32`, empty in the preview above).
# Reassigning instead of mutating in place, and ignoring labels that are
# already gone, keeps this cell safe to re-run on the same kernel (the
# previous in-place drop raised KeyError on a second run).
data = data.drop(columns=[column_names[0], column_names[32]], errors="ignore")

In [6]:
# Class labels from the `diagnosis` column as a column vector, shape (n_rows, 1).
labels = np.array(data['diagnosis']).reshape(-1, 1)

In [7]:
# Feature matrix: every column after the first one (`diagnosis`) as a NumPy array.
x = np.array(data.iloc[:,1:])

# Applying Principal Component Analysis

In [8]:
# Calculating the per-feature (column-wise) mean of the data.
mu = x.mean(axis=0)

In [9]:
# Turn the per-feature mean into a single row of shape (1, n_features).
# reshape(1, -1) gives the same (1, 30) result as before on the first run and
# leaves an already-reshaped `mu` unchanged, whereas the previous
# reshape(-1, mu.shape[0]) turned it into a (30, 1) column if this cell was
# executed a second time.
mu = mu.reshape(1, -1)

In [10]:
mu.shape

(1, 30)

In [11]:
# Center the data: subtract the mean row from every sample (broadcast over
# rows) so that each feature column has zero mean.
x_dash = x-mu

In [12]:
x_dash.shape

(569, 30)

In [13]:
# Covariance estimate of the centered data, (x_dash^T x_dash) / n, normalized
# by the number of rows n (1/n rather than 1/(n-1)).
sigma_hat = (1/data.shape[0])*np.matmul(x_dash.T,x_dash)

In [14]:
sigma_hat.shape

(30, 30)

In [15]:
# Applying Singular Value Decomposition to the covariance matrix.
# np.linalg.svd returns three arrays: the left singular vectors, the singular
# values, and the (transposed) right singular vectors.
sigma_hat_decomposed = np.linalg.svd(sigma_hat)

In [16]:
len(sigma_hat_decomposed)

3

In [17]:
# Q holds the left singular vectors, one per column. sigma_hat is a symmetric
# covariance matrix, so these columns are its eigenvectors (the principal
# directions), not the eigenvalues.
Q = sigma_hat_decomposed[0]

In [18]:
# lamda holds the singular values, which for the covariance matrix are its
# eigenvalues (the variance along each principal direction), largest first.
lamda = sigma_hat_decomposed[1]

In [19]:
# Q_tilda keeps only the first 15 columns of Q, i.e. the 15 leading principal
# directions; 15 is the number of features we want after projection.
Q_tilda = Q[:,0:15]

In [20]:
# Project the centered data onto the retained principal directions.
x_new = x_dash @ Q_tilda

In [21]:
x_new.shape

(569, 15)

In [22]:
# Converting the projected data back to a pandas DataFrame
# (columns get the default integer names 0..14).
new_data = pd.DataFrame(x_new)

In [23]:
# Attaching the class labels as a `diagnosis` column; rows are in the same
# order as in the original data.
new_data['diagnosis'] = labels

In [24]:
new_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,diagnosis
0,-1160.142574,-293.917544,48.578398,8.711975,32.000486,-1.265415,0.931337,-0.148167,-0.745463,0.589359,0.307804,-0.043452,0.034777,-0.065069,0.012934,M
1,-1269.122443,15.630182,-35.394534,-17.861283,-4.334874,0.225872,-0.046037,-0.200804,0.485828,-0.084035,-0.080642,-0.033042,0.045485,0.005534,-0.021368,M
2,-995.793889,39.156743,-1.709753,-4.199340,-0.466529,2.652811,-0.779745,0.274026,0.173874,-0.186994,-0.279174,0.020464,0.083505,-0.024824,0.026887,M
3,407.180803,-67.380320,8.672848,11.759867,7.115461,-1.299436,-1.267304,0.060555,0.330639,-0.144155,-0.927471,0.174720,0.282556,-0.080057,-0.043201,M
4,-930.341180,189.340742,1.374801,-8.499183,7.613289,-1.021160,-0.335522,-0.289109,-0.036087,-0.138502,-0.042228,0.062721,-0.114247,-0.002274,0.019548,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,-1414.126684,110.222492,40.065944,-6.562240,-5.102856,0.395424,-0.786751,-0.037082,0.452530,-0.235185,-0.163649,-0.052543,-0.075032,0.015211,0.061390,M
565,-1045.018854,77.057589,0.036669,4.753245,-12.417863,0.059637,0.449831,-0.509154,0.449986,0.493247,-0.007625,-0.055832,-0.015163,-0.009985,-0.003312,M
566,-314.501756,47.553525,-10.442407,9.771881,-6.156213,0.870726,-2.166493,0.442279,0.097398,-0.144667,0.109147,-0.076263,-0.004448,0.055285,0.012459,M
567,-1124.858115,34.129225,-19.742087,23.660881,3.565133,-4.086390,-1.705401,0.359964,-0.385030,0.615467,-0.307166,0.028224,0.060561,0.037742,0.031873,M


# Separating Data into Training, Cross Validation and Testing Data

In [25]:
# Number of Benign Tumors in the data
new_data[new_data['diagnosis'] == 'B'].shape

(357, 16)

In [26]:
# Number of Malignant Tumors in the data
new_data[new_data['diagnosis'] == 'M'].shape

(212, 16)

In [27]:
# Encode malignant ('M') as 1.
# The result is assigned back to the column: replace(..., inplace=True) called
# on the `new_data['diagnosis']` selection acts on an intermediate Series and
# is not guaranteed to update `new_data` (it is a no-op under pandas
# Copy-on-Write).
new_data['diagnosis'] = new_data['diagnosis'].replace(to_replace='M', value=1)

In [28]:
# Encode benign ('B') as 0.
# The result is assigned back to the column: replace(..., inplace=True) called
# on the `new_data['diagnosis']` selection acts on an intermediate Series and
# is not guaranteed to update `new_data` (it is a no-op under pandas
# Copy-on-Write).
new_data['diagnosis'] = new_data['diagnosis'].replace(to_replace='B', value=0)

In [29]:
# Training set size: 70% of all rows, rounded down.
training_data_len = int(0.7 * len(new_data))

In [30]:
training_data_len

398

In [31]:
# Benign half of the training set: the first training_data_len // 2 benign
# (label 0) rows, in their original order.
benign_tumor_training_data = new_data.loc[new_data['diagnosis'] == 0].head(training_data_len // 2)

In [32]:
# Malignant half of the training set: the first training_data_len // 2
# malignant (label 1) rows, in their original order.
malignant_tumor_training_data = new_data.loc[new_data['diagnosis'] == 1].head(training_data_len // 2)

In [33]:
benign_tumor_training_data.shape

(199, 16)

In [34]:
# Training set: the benign rows followed by the malignant rows; the original
# row index is kept.
training_data = pd.concat([benign_tumor_training_data,malignant_tumor_training_data])

In [35]:
# Taking 20% of the data as cross validation data; the rows left after that
# (about 10%) are the testing data.
# The size is derived from `new_data`, the same frame used for the training
# size and for the actual slicing.
cv_data_len = int(0.2*new_data.shape[0])

In [36]:
cv_data_len

113

In [37]:
# Benign (label 0) rows that were not used for training.
benign_remaining_data = new_data.loc[new_data['diagnosis'] == 0].iloc[training_data_len // 2:]

In [38]:
# Malignant (label 1) rows that were not used for training.
malignant_remaining_data = new_data.loc[new_data['diagnosis'] == 1].iloc[training_data_len // 2:]

In [39]:
# Held-out pool: all remaining benign rows first, then the remaining malignant
# rows (no shuffling).
remaining_data = pd.concat([benign_remaining_data,malignant_remaining_data])

In [40]:
remaining_data.shape

(171, 16)

In [41]:
# Split the held-out pool by position: the first cv_data_len rows are the
# cross validation set, everything after them is the testing set.
# NOTE(review): remaining_data is ordered benign-first and is not shuffled, so
# the cross validation slice contains benign rows only (its labels shown
# further down are all 0); consider shuffling or stratifying before slicing.
cross_validation_data = remaining_data.head(cv_data_len)
testing_data = remaining_data.iloc[cv_data_len:]

In [42]:
cross_validation_data.shape

(113, 16)

In [43]:
testing_data.shape

(58, 16)

In [44]:
training_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,diagnosis
19,191.621045,12.259178,-6.241020,-3.588034,6.692721,0.037497,0.336992,0.159502,-0.013716,0.010386,0.078203,-0.036511,-0.029271,-0.039346,-0.017123,0
20,285.051275,14.557362,-10.177971,-1.483015,5.828780,0.790474,0.080541,0.454472,-0.268240,0.092501,0.032368,0.122332,0.037006,0.000394,-0.017677,0
21,683.583952,-32.576126,15.095343,-12.576780,1.707262,-1.104473,-0.309995,-0.328012,-0.135464,-0.232627,-0.039800,0.013665,-0.016789,0.007654,0.010671,0
37,355.877850,61.535566,-6.242785,-4.970090,-3.076071,1.750950,-0.091251,-0.511769,-0.102631,1.011803,0.024114,-0.059378,-0.000325,0.018064,-0.035210,0
46,783.838668,-56.826556,13.657869,-10.929661,-7.112941,-2.742609,-0.747497,0.247352,-0.345514,-0.233113,-0.080757,0.095598,0.009075,-0.027129,-0.006755,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,-1099.486889,112.597421,-27.476067,-0.145937,-2.458353,-1.537728,1.274740,-0.224613,-0.469098,-0.102762,0.047910,0.093248,-0.100222,0.029009,0.056247,1
501,109.464701,-2.229269,3.854344,10.871600,-3.023712,2.215435,0.365275,0.020654,-0.201790,-0.171657,-0.100881,0.175142,0.032250,-0.029611,0.042311,1
503,-2166.185165,-108.422615,21.170655,-9.870265,4.600230,-6.029129,-3.231749,-0.385159,0.459618,0.086349,0.186573,-0.070107,0.065125,0.036807,0.019955,1
509,-64.596512,50.650874,-13.205191,13.519044,-3.370923,1.640083,2.685771,-0.600741,-0.693918,-0.225630,0.032308,-0.042296,-0.018556,0.033451,0.003069,1


In [45]:
# Mean vector of the 15 projected features over the malignant training rows.
mu_hat_m = training_data.loc[training_data['diagnosis'] == 1].iloc[:, :15].mean().to_numpy()

In [46]:
# Covariance matrix of the 15 projected features over the malignant training rows.
sigma_hat_m = training_data.loc[training_data['diagnosis'] == 1].iloc[:, :15].cov().to_numpy()

In [47]:
np.linalg.det(sigma_hat_m)

19723.271206449655

In [48]:
sigma_hat_m.diagonal()

array([4.83251406e+05, 1.80244546e+04, 1.71425537e+03, 7.92821249e+01,
       4.99687724e+01, 4.96625902e+00, 2.33253962e+00, 5.72687305e-01,
       3.00085039e-01, 8.46220743e-02, 4.76165511e-02, 1.09368344e-02,
       5.88573773e-03, 3.24464641e-03, 2.37829604e-03])

In [49]:
mu_hat_m

array([-6.19149059e+02, -9.10604406e+00, -3.00251408e+00,  3.21867937e+00,
        6.62722209e-01, -1.49641755e-03,  1.33728677e-01, -6.56051800e-03,
        4.44266350e-02, -2.27845949e-02, -5.28254216e-02,  2.13821710e-03,
        2.13592658e-03, -4.87610367e-03,  1.12627757e-04])

In [50]:
# Prior of the malignant class: its share of the training rows.
malignant_prior = len(training_data[training_data['diagnosis'] == 1]) / len(training_data)

In [51]:
# Mean vector of the 15 projected features over the benign training rows.
mu_hat_b = training_data.loc[training_data['diagnosis'] == 0].iloc[:, :15].mean().to_numpy()

In [52]:
# Covariance matrix of the 15 projected features over the benign training rows.
sigma_hat_b = training_data.loc[training_data['diagnosis'] == 0].iloc[:, :15].cov().to_numpy()

In [53]:
np.linalg.det(sigma_hat_b)

1.8304458896984253e-06

In [54]:
# Prior of the benign class: its share of the training rows.
benign_prior = len(training_data[training_data['diagnosis'] == 0]) / len(training_data)

# Testing Model

In [55]:
def mock_test(data):
    """Classify each row of ``data`` as benign (0) or malignant (1).

    Each class is modelled as a multivariate Gaussian over the first 15
    columns. A row is labelled malignant when its malignant score (likelihood
    times prior) is strictly greater than its benign score.

    The comparison is done in log space (``logpdf`` plus log prior). This is
    the same decision rule as comparing ``pdf * prior``, but it does not
    underflow to ``0 > 0`` for rows far from both class means, which the
    plain density product silently labelled as benign.

    Reads the notebook-level ``mu_hat_m``, ``sigma_hat_m``,
    ``malignant_prior``, ``mu_hat_b``, ``sigma_hat_b`` and ``benign_prior``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first 15 columns are the features; any further columns
        (e.g. the class label) are ignored.

    Returns
    -------
    numpy.ndarray
        1-D int64 array with one 0/1 prediction per input row.
    """
    inputs = np.array(data.iloc[:, 0:15])  # features only, class label excluded

    # A zero prior maps to -inf, which still compares correctly; only the
    # divide-by-zero warning from log(0) is silenced.
    with np.errstate(divide="ignore"):
        log_prior_m = np.log(malignant_prior)
        log_prior_b = np.log(benign_prior)

    # Unnormalized log posteriors of each class.
    log_posterior_m = s.multivariate_normal.logpdf(inputs, mu_hat_m, sigma_hat_m) + log_prior_m
    log_posterior_b = s.multivariate_normal.logpdf(inputs, mu_hat_b, sigma_hat_b) + log_prior_b

    # logpdf returns a scalar for a single-row input; atleast_1d keeps the
    # result one-dimensional in that case too.
    is_malignant = np.atleast_1d(log_posterior_m > log_posterior_b)

    return is_malignant.astype(np.int64)

In [56]:
# Predicted labels (0/1) for the cross validation rows.
cv_results = mock_test(cross_validation_data)

In [57]:
cv_results

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1], dtype=int64)

In [58]:
# True labels of the cross validation rows, in the same order as the predictions.
acutal_results = np.array(cross_validation_data['diagnosis'])

In [59]:
acutal_results

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int64)

In [60]:
# True wherever the cross validation prediction matches the true label.
boolean_mask = cv_results == acutal_results

In [61]:
# Fraction of cross validation rows that were classified correctly.
cv_accuracy = np.count_nonzero(boolean_mask) / len(boolean_mask)

In [62]:
# Cross Validation Accuracy
cv_accuracy

0.9292035398230089

In [63]:
# Predicted labels (0/1) for the testing rows.
test_results = mock_test(testing_data)

In [64]:
test_results

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1], dtype=int64)

In [65]:
# True labels of the testing rows, in the same order as the predictions.
actual_test_results = np.array(testing_data['diagnosis'])

In [66]:
actual_test_results

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [67]:
# True wherever the testing prediction matches the true label.
boolean_mask_1 = test_results == actual_test_results

In [68]:
# Fraction of testing rows that were classified correctly.
test_accuracy = np.count_nonzero(boolean_mask_1) / len(boolean_mask_1)

In [69]:
# Testing Accuracy
test_accuracy

0.9827586206896551

In [70]:
def perf_measure(y_actual, y_pred):
    """Report precision, recall and F1 score for binary (0/1) predictions.

    The positive class is 1. Entries are compared position by position over
    ``range(len(y_pred))``; pairs whose values are not 0/1 are not counted.

    A metric whose denominator is zero (no predicted positives, no actual
    positives, or precision + recall == 0) is reported as 0.0 instead of
    raising ZeroDivisionError, so the function also works on a set that
    contains a single class.

    Parameters
    ----------
    y_actual : sequence
        True labels, indexable by position.
    y_pred : sequence
        Predicted labels, indexable by position.

    Returns
    -------
    tuple of str
        ``("Precision is <p>", "Recall is <r>", "F1_Score is <f>")``.
    """
    TP = 0  # predicted 1, actually 1
    FP = 0  # predicted 1, actually 0
    FN = 0  # predicted 0, actually 1

    for i in range(len(y_pred)):
        actual, predicted = y_actual[i], y_pred[i]
        if actual == 1 and predicted == 1:
            TP += 1
        elif predicted == 1 and actual == 0:
            FP += 1
        elif predicted == 0 and actual == 1:
            FN += 1
        # True negatives are not needed for any of the reported metrics.

    Precision = (TP/(TP+FP)) if (TP+FP) else 0.0
    Recall = (TP/(TP+FN)) if (TP+FN) else 0.0
    if Precision+Recall:
        F1_Score = (2*(Precision*Recall))/(Precision+Recall)
    else:
        F1_Score = 0.0

    return "Precision is "+str(Precision), "Recall is "+str(Recall), "F1_Score is "+str(F1_Score)

In [71]:
perf_measure(actual_test_results,test_results)

('Precision is 1.0',
 'Recall is 0.9230769230769231',
 'F1_Score is 0.9600000000000001')