In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('cancer.csv')
df.drop(['Unnamed: 32','id'],inplace=True,axis=1)

In [3]:
df['texture_mean']=np.sqrt(np.sqrt(np.sqrt(df['texture_mean'])))
df['perimeter_mean']=np.log(np.log(np.log(df['perimeter_mean'])))
df['smoothness_mean']=np.sqrt(np.sqrt(np.sqrt(df['smoothness_mean'])))
df['compactness_mean']=np.log(df['compactness_mean'])
df['symmetry_mean']=np.log(df['symmetry_mean'])
df['fractal_dimension_mean']=np.log(np.sqrt(df['fractal_dimension_mean']))
df['texture_se']=np.log(df['texture_se'])
df['smoothness_se']=np.log(np.sqrt(df['smoothness_se']))
df['area_se']=np.log(np.log(np.log(df['area_se'])))
df['concave points_se']=np.sqrt(df['concave points_se'])
df['symmetry_se']=np.log(np.sqrt(df['symmetry_se']))
df['fractal_dimension_se']=np.log(np.sqrt(np.sqrt(df['fractal_dimension_se'])))

In [4]:
from sklearn.preprocessing import LabelEncoder
le_diagnosis=LabelEncoder()
df['diagnosis_code']=le_diagnosis.fit_transform(df['diagnosis'])
df[['diagnosis','diagnosis_code']].head()

Unnamed: 0,diagnosis,diagnosis_code
0,M,1
1,M,1
2,M,1
3,M,1
4,M,1


In [5]:
df.drop('diagnosis',axis=1,inplace=True)
df['diagnosis_code'].head()

0    1
1    1
2    1
3    1
4    1
Name: diagnosis_code, dtype: int32

In [6]:
f_drop=['radius_worst','texture_worst','perimeter_worst','area_worst','smoothness_worst','compactness_worst',\
        'concavity_worst','concave points_worst','symmetry_worst','fractal_dimension_worst','radius_mean',\
        'concavity_se','perimeter_se','concavity_mean','area_mean','compactness_se','radius_se','concave points_mean']
df=df.drop(f_drop,axis=1)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Train Data with 9 Input Featues

In [8]:
x=df.drop(['diagnosis_code','fractal_dimension_se','symmetry_se','texture_se'],axis=1)
y=df.diagnosis_code

x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=4,test_size=0.3)

model=GaussianNB()
model.fit(x_train,y_train)

predictions = model.predict(x_test)

r=confusion_matrix(y_test,predictions)
print(r)

[[105  12]
 [  2  52]]


In [9]:
print("accuracy:",accuracy_score(y_test,predictions)*100)
print("True Positive Rate (TPR) :",(r[0][0]/(r[0][0]+r[1][0]))*100)
print("True Negative Rate (TNR) :",(r[1][1]/(r[1][1]+r[0][1]))*100)
print("False Negative Rate (FNR) :",(r[1][0]/(r[1][0]+r[0][0]))*100)
print("False Positive Rate (FPR) :",(r[0][1]/(r[0][1]+r[1][1]))*100)
x1=(classification_report(y_test,predictions))
print(x1)

accuracy: 91.81286549707602
True Positive Rate (TPR) : 98.13084112149532
True Negative Rate (TNR) : 81.25
False Negative Rate (FNR) : 1.8691588785046727
False Positive Rate (FPR) : 18.75
              precision    recall  f1-score   support

           0       0.98      0.90      0.94       117
           1       0.81      0.96      0.88        54

   micro avg       0.92      0.92      0.92       171
   macro avg       0.90      0.93      0.91       171
weighted avg       0.93      0.92      0.92       171



# Train Data with 10 Input Features

In [10]:
x=df.drop(['diagnosis_code','fractal_dimension_se','texture_se',],axis=1)
y=df.diagnosis_code

x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=4,test_size=0.3)

model=GaussianNB()
model.fit(x_train,y_train)

predictions = model.predict(x_test)

r=confusion_matrix(y_test,predictions)
print(r)

[[106  11]
 [  2  52]]


In [11]:
print("accuracy:",accuracy_score(y_test,predictions)*100)
print("True Positive Rate (TPR) :",(r[0][0]/(r[0][0]+r[1][0]))*100)
print("True Negative Rate (TNR) :",(r[1][1]/(r[1][1]+r[0][1]))*100)
print("False Negative Rate (FNR) :",(r[1][0]/(r[1][0]+r[0][0]))*100)
print("False Positive Rate (FPR) :",(r[0][1]/(r[0][1]+r[1][1]))*100)
x1=(classification_report(y_test,predictions))
print(x1)

accuracy: 92.39766081871345
True Positive Rate (TPR) : 98.14814814814815
True Negative Rate (TNR) : 82.53968253968253
False Negative Rate (FNR) : 1.8518518518518516
False Positive Rate (FPR) : 17.46031746031746
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       117
           1       0.83      0.96      0.89        54

   micro avg       0.92      0.92      0.92       171
   macro avg       0.90      0.93      0.92       171
weighted avg       0.93      0.92      0.93       171



# Train Data with 11 Input features

In [12]:
x=df.drop(['diagnosis_code','fractal_dimension_se'],axis=1)
y=df.diagnosis_code

x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=4,test_size=0.3)

model=GaussianNB()
model.fit(x_train,y_train)

predictions = model.predict(x_test)

r=confusion_matrix(y_test,predictions)
print(r)

[[106  11]
 [  2  52]]


In [13]:
print("accuracy:",accuracy_score(y_test,predictions)*100)
print("True Positive Rate (TPR) :",(r[0][0]/(r[0][0]+r[1][0]))*100)
print("True Negative Rate (TNR) :",(r[1][1]/(r[1][1]+r[0][1]))*100)
print("False Negative Rate (FNR) :",(r[1][0]/(r[1][0]+r[0][0]))*100)
print("False Positive Rate (FPR) :",(r[0][1]/(r[0][1]+r[1][1]))*100)
x1=(classification_report(y_test,predictions))
print(x1)

accuracy: 92.39766081871345
True Positive Rate (TPR) : 98.14814814814815
True Negative Rate (TNR) : 82.53968253968253
False Negative Rate (FNR) : 1.8518518518518516
False Positive Rate (FPR) : 17.46031746031746
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       117
           1       0.83      0.96      0.89        54

   micro avg       0.92      0.92      0.92       171
   macro avg       0.90      0.93      0.92       171
weighted avg       0.93      0.92      0.93       171



# Train Data with 12 Input Features

In [14]:
x=df.drop(['diagnosis_code'],axis=1)
y=df.diagnosis_code

x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=4,test_size=0.3)

model=GaussianNB()
model.fit(x_train,y_train)

predictions = model.predict(x_test)

r=confusion_matrix(y_test,predictions)
print(r)

[[105  12]
 [  2  52]]


In [15]:
print("accuracy:",accuracy_score(y_test,predictions)*100)
print("True Positive Rate (TPR) :",(r[0][0]/(r[0][0]+r[1][0]))*100)
print("True Negative Rate (TNR) :",(r[1][1]/(r[1][1]+r[0][1]))*100)
print("False Negative Rate (FNR) :",(r[1][0]/(r[1][0]+r[0][0]))*100)
print("False Positive Rate (FPR) :",(r[0][1]/(r[0][1]+r[1][1]))*100)
x1=(classification_report(y_test,predictions))
print(x1)

accuracy: 91.81286549707602
True Positive Rate (TPR) : 98.13084112149532
True Negative Rate (TNR) : 81.25
False Negative Rate (FNR) : 1.8691588785046727
False Positive Rate (FPR) : 18.75
              precision    recall  f1-score   support

           0       0.98      0.90      0.94       117
           1       0.81      0.96      0.88        54

   micro avg       0.92      0.92      0.92       171
   macro avg       0.90      0.93      0.91       171
weighted avg       0.93      0.92      0.92       171

