# Classification of Breast Cancer (NAIVE BAYES CLASSIFIER)

### Classifying whether the Cancer is Benign or Malignant on the basis of Tumor Size 

In [1]:
import pandas as pd
from math import e
import numpy as np
import scipy.stats as s
import random

In [2]:
dataset = pd.read_csv("data.csv")

In [3]:
dataset

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.990,10.38,122.80,1001.0,0.11840,0.27760,0.300100,0.147100,...,17.33,184.60,2019.0,0.16220,0.66560,0.71190,0.26540,0.4601,0.11890,
1,842517,M,20.570,17.77,132.90,1326.0,0.08474,0.07864,0.086900,0.070170,...,23.41,158.80,1956.0,0.12380,0.18660,0.24160,0.18600,0.2750,0.08902,
2,84300903,M,19.690,21.25,130.00,1203.0,0.10960,0.15990,0.197400,0.127900,...,25.53,152.50,1709.0,0.14440,0.42450,0.45040,0.24300,0.3613,0.08758,
3,84348301,M,11.420,20.38,77.58,386.1,0.14250,0.28390,0.241400,0.105200,...,26.50,98.87,567.7,0.20980,0.86630,0.68690,0.25750,0.6638,0.17300,
4,84358402,M,20.290,14.34,135.10,1297.0,0.10030,0.13280,0.198000,0.104300,...,16.67,152.20,1575.0,0.13740,0.20500,0.40000,0.16250,0.2364,0.07678,
5,843786,M,12.450,15.70,82.57,477.1,0.12780,0.17000,0.157800,0.080890,...,23.75,103.40,741.6,0.17910,0.52490,0.53550,0.17410,0.3985,0.12440,
6,844359,M,18.250,19.98,119.60,1040.0,0.09463,0.10900,0.112700,0.074000,...,27.66,153.20,1606.0,0.14420,0.25760,0.37840,0.19320,0.3063,0.08368,
7,84458202,M,13.710,20.83,90.20,577.9,0.11890,0.16450,0.093660,0.059850,...,28.14,110.60,897.0,0.16540,0.36820,0.26780,0.15560,0.3196,0.11510,
8,844981,M,13.000,21.82,87.50,519.8,0.12730,0.19320,0.185900,0.093530,...,30.73,106.20,739.3,0.17030,0.54010,0.53900,0.20600,0.4378,0.10720,
9,84501001,M,12.460,24.04,83.97,475.9,0.11860,0.23960,0.227300,0.085430,...,40.68,97.65,711.4,0.18530,1.05800,1.10500,0.22100,0.4366,0.20750,


In [4]:
print(dataset.shape)
dataset.iloc[0]

(569, 33)


id                           842302
diagnosis                         M
radius_mean                   17.99
texture_mean                  10.38
perimeter_mean                122.8
area_mean                      1001
smoothness_mean              0.1184
compactness_mean             0.2776
concavity_mean               0.3001
concave points_mean          0.1471
symmetry_mean                0.2419
fractal_dimension_mean      0.07871
radius_se                     1.095
texture_se                   0.9053
perimeter_se                  8.589
area_se                       153.4
smoothness_se              0.006399
compactness_se              0.04904
concavity_se                0.05373
concave points_se           0.01587
symmetry_se                 0.03003
fractal_dimension_se       0.006193
radius_worst                  25.38
texture_worst                 17.33
perimeter_worst               184.6
area_worst                     2019
smoothness_worst             0.1622
compactness_worst           

In [5]:
data = dataset.iloc[:,1:3]
data

Unnamed: 0,diagnosis,radius_mean
0,M,17.990
1,M,20.570
2,M,19.690
3,M,11.420
4,M,20.290
5,M,12.450
6,M,18.250
7,M,13.710
8,M,13.000
9,M,12.460


## Using Bayes Theorem For Classification


#### Bayes' theorem is stated mathematically as the following equation:

$ P(A/B) =  \dfrac{P(B/A)P(A)}{P(B)} $

#### where $A$ and $B$ are events and $P(B) \not= 0 $. 

####  $P(A/B)$ is a conditional probability: the likelihood of event $A$ occurring given that $B$ is true. Also $P(A/B)$ is called the Posterior Probability 

#### $P(B/A)$ is also a conditional probability: the likelihood of event $B$ occurring given that  $A$ is true. 

####  $P(A)$ and $P(B)$ are the probabilities of observing $A$and $B$ independently of each other; this is known as the marginal probability.

#### $P(A)$ is called the Prior Probability and $P(B)$ is called the Evidence.

####  We will find $ P(A/B)$ i.e Probability that the Tumor is Benign on the basis of Tumor Size 

#### We can easily find $P(B/A)$ i.e Probability of size when all the cancers are Benign. And Similary $P(A)$ (Probability that it is Benign) and $P(B)$ (Probabilty of a Size $S0$ in complete training set).

### Splitting the Data and Calculating all the R.H.S values for Training Data 

In [6]:
traintestsplitindex = int(len(data)*0.70)
traintestsplitindex

398

######  Splitting will be done evenly. Therefore selecting equal B and M

In [7]:
#Bdata is the data set of benign 
#data['diagnosis'] == 'B' is boolean mapping
Bdata = data[data['diagnosis'] == 'B']
print(Bdata.shape)

#Training Data of Benign length
Tb = int(traintestsplitindex/2)

#Training Data for Benign
TBdata = Bdata.iloc[0:(Tb)]
TBdata

(357, 2)


Unnamed: 0,diagnosis,radius_mean
19,B,13.540
20,B,13.080
21,B,9.504
37,B,13.030
46,B,8.196
48,B,12.050
49,B,13.490
50,B,11.760
51,B,13.640
52,B,11.940


In [8]:
#Mdata is Malignant Data
Mdata = data[data['diagnosis'] == 'M']
print(Mdata.shape)
#Trainig data length for Malignant
Tm= int(traintestsplitindex/2)

#Malignant Training Data
TMdata = Mdata[0:Tm]
TMdata

(212, 2)


Unnamed: 0,diagnosis,radius_mean
0,M,17.99
1,M,20.57
2,M,19.69
3,M,11.42
4,M,20.29
5,M,12.45
6,M,18.25
7,M,13.71
8,M,13.00
9,M,12.46


In [9]:
TrainData = Bdata[0:Tb].append(Mdata[0:Tm])
TrainData

Unnamed: 0,diagnosis,radius_mean
19,B,13.540
20,B,13.080
21,B,9.504
37,B,13.030
46,B,8.196
48,B,12.050
49,B,13.490
50,B,11.760
51,B,13.640
52,B,11.940


In [10]:
BTestData = Bdata[Tb:]
MTestData = Mdata[Tb:]

###  Calculating  $\hat{u_b}$ , $\hat{\sigma_b}$ , $\hat{u_m}$  and  $\hat{\sigma_m}$

In [11]:
#Mean Cap of Benign Data(with minimum error)
MuBCap = TBdata['radius_mean'].mean()
#Note
#If you want to decrease the error between Mean Cap and Actual Mean of Population
#Then do the same on different datasets atleast 25 times
#Then calculate the maen of that means


In [12]:
#Standard Deviation Cap of Benign Data
SigmaBCap = TBdata['radius_mean'].std()


In [13]:
print(MuBCap,SigmaBCap)

11.984396984924626 1.714910993417559


In [14]:
#P(A) => Prior Probability
#Probability that it is Benign cancer
#Number of Benign Cancer / Total Data

PCapB = TBdata.shape[0]/TrainData.shape[0]
PCapB

0.5

In [15]:
#P(ABar) = P(A_Compliment)
PCapBCompliment = 1 - PCapB
PCapBCompliment

0.5

In [16]:
#Mean Cap of Malignant Data(with minimum error)
MuMCap = TMdata['radius_mean'].mean()
#Standard Deviation of Malignant Data(with minimum error)
SigmaMCap = TMdata['radius_mean'].std()

In [17]:
print(MuMCap,SigmaMCap)

17.356381909547743 3.1938348082988552


###  Testing the Model

####  We will pass the Test Data and check $P(A/B)$ and Compare the result with actual result

In [19]:
#P(Stest/Tumor is Benign)
#P(B/A) => (Likely Hood)
# P_BA = (1/((2*3.14*SigmaBcap)**0.5)) * (exp(-(Stest-MuBCap)**2/(2*SigmaBCap**2)))

P_BABenign = s.norm.pdf(BTestData['radius_mean'],MuBCap,SigmaBCap)
P_BAMalignant = s.norm.pdf(MTestData['radius_mean'],MuBCap,SigmaBCap)

In [20]:
P_BAComplementBenign = s.norm.pdf(BTestData['radius_mean'],MuMCap,SigmaMCap)
P_BAComplementMalignant = s.norm.pdf(MTestData['radius_mean'],MuMCap,SigmaMCap)

In [21]:
num1 = P_BABenign*PCapB
num2 = P_BAComplementBenign*PCapBCompliment
num3 = P_BAMalignant * PCapB
num4 = P_BAComplementMalignant*PCapBCompliment

##### $P(B)$ = $P(B/A).P(A)+ P(B/\bar{A}).P(\bar{A})$ 

In [22]:
P_B = num1 + num2

In [23]:
PTumorisbenign = num1/P_B
PTumorisbenign

array([0.84637498, 0.75660059, 0.82565658, 0.05684475, 0.7404716 ,
       0.87117977, 0.29004891, 0.68700957, 0.09214812, 0.9268454 ,
       0.73023855, 0.69301404, 0.91300359, 0.9187763 , 0.88093662,
       0.85850021, 0.7596905 , 0.87117977, 0.64641654, 0.91300359,
       0.92987263, 0.92199874, 0.87802958, 0.603971  , 0.72138548,
       0.82147681, 0.91832664, 0.89372757, 0.88742343, 0.80336286,
       0.80575391, 0.86221684, 0.92088518, 0.09607591, 0.81606061,
       0.86716494, 0.91036135, 0.9187763 , 0.93014881, 0.34535714,
       0.88944002, 0.93020001, 0.83167932, 0.91593244, 0.90324895,
       0.43187462, 0.90173115, 0.69301404, 0.93143248, 0.9310853 ,
       0.92789691, 0.92346671, 0.91667673, 0.82970407, 0.85773805,
       0.38259734, 0.8138331 , 0.60887096, 0.65311877, 0.61372611,
       0.92027647, 0.66833208, 0.9268454 , 0.88426409, 0.39996217,
       0.47794345, 0.89042224, 0.8837214 , 0.47794345, 0.83926727,
       0.74378915, 0.90095136, 0.77015775, 0.79846751, 0.93144

In [24]:
PTumorisMalignant = num3/(num3+num4)
PTumorisMalignant

array([3.28454134e-01, 2.15800548e-03, 6.19468585e-05, 3.88167649e-11,
       1.44480566e-05, 1.17444269e-05, 5.49989662e-01, 2.82047728e-01,
       4.41503131e-06, 7.51629069e-07, 3.42536363e-05, 4.87043591e-02,
       1.03098928e-05])

In [26]:
truenegative = 0
truepositive = 0
falsenegative = 0
falsepositive = 0

for i in PTumorisbenign:
    if i >= 0.5:
        truenegative += 1
    else:
        falsepositive += 1
        

for i in PTumorisMalignant:
    if i >=  0.5:
        truepositive += 1
    else:
        falsenegative += 1

In [27]:
print("truenegative: ",truenegative)
print("truepositive: ",truepositive)
print("falsepositive: ",falsepositive)
print("falsenegative: ",falsenegative)

print('Total Benign :', len(BTestData))
print('Total Malignant :', len(MTestData))

truenegative:  136
truepositive:  1
falsepositive:  22
falsenegative:  12
Total Benign : 158
Total Malignant : 13
