In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

df=pd.read_csv("cancer.csv")
df=df.drop('id',axis=1).drop('Unnamed: 32',axis=1)

In [2]:
df['texture_mean']=np.sqrt(np.sqrt(np.sqrt(df['texture_mean'])))
df['perimeter_mean']=np.log(np.log(np.log(df['perimeter_mean'])))
df['smoothness_mean']=np.sqrt(np.sqrt(np.sqrt(df['smoothness_mean'])))
df['compactness_mean']=np.log(df['compactness_mean'])
df['symmetry_mean']=np.log(df['symmetry_mean'])
df['fractal_dimension_mean']=np.log(np.sqrt(df['fractal_dimension_mean']))
df['texture_se']=np.log(df['texture_se'])
df['smoothness_se']=np.log(np.sqrt(df['smoothness_se']))
df['area_se']=np.log(np.log(np.log(df['area_se'])))
df['concave points_se']=np.sqrt(df['concave points_se'])
df['symmetry_se']=np.log(np.sqrt(df['symmetry_se']))
df['fractal_dimension_se']=np.log(np.sqrt(np.sqrt(df['fractal_dimension_se'])))

In [3]:
f_drop=['radius_worst','texture_worst','perimeter_worst','area_worst','smoothness_worst','compactness_worst',\
        'concavity_worst','concave points_worst','symmetry_worst','fractal_dimension_worst','radius_mean',\
        'concavity_se','perimeter_se','concavity_mean','area_mean','compactness_se','radius_se','concave points_mean']
df=df.drop(f_drop,axis=1)
df.head()

Unnamed: 0,diagnosis,texture_mean,perimeter_mean,smoothness_mean,compactness_mean,symmetry_mean,fractal_dimension_mean,texture_se,area_se,smoothness_se,concave points_se,symmetry_se,fractal_dimension_se
0,M,1.339753,0.451593,0.765895,-1.281574,-1.419231,-1.270993,-0.099489,0.47997,-2.525807,0.125976,-1.752779,-1.271084
1,M,1.432884,0.461915,0.734533,-2.542875,-1.708154,-1.435255,-0.309382,0.378307,-2.62715,0.115758,-2.138293,-1.411473
2,M,1.465277,0.459061,0.758536,-1.833207,-1.57552,-1.406789,-0.239654,0.414572,-2.545652,0.143457,-1.89712,-1.347006
3,M,1.457641,0.385587,0.783839,-1.259133,-1.348228,-1.164259,0.144966,0.178339,-2.349191,0.136638,-1.409798,-1.171921
4,M,1.394982,0.464025,0.750175,-2.018911,-1.709811,-1.416552,-0.246796,0.415204,-2.233139,0.137295,-2.021066,-1.318894


In [4]:
df['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder
le_diagnosis=LabelEncoder()
df['diagnosis_code']=le_diagnosis.fit_transform(df['diagnosis'])
df[['diagnosis','diagnosis_code']].head()

Unnamed: 0,diagnosis,diagnosis_code
0,M,1
1,M,1
2,M,1
3,M,1
4,M,1


In [6]:
df.drop('diagnosis',axis=1,inplace=True)
df['diagnosis_code'].head()

0    1
1    1
2    1
3    1
4    1
Name: diagnosis_code, dtype: int32

In [7]:
df['diagnosis_code'].value_counts()

0    357
1    212
Name: diagnosis_code, dtype: int64

In [8]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif
x=df.drop("diagnosis_code",axis=1)
y=df['diagnosis_code']

In [9]:
bestfeatures=SelectKBest(score_func=f_classif,k=3)
fit=bestfeatures.fit(x,y)

In [10]:
dfscores=pd.DataFrame(fit.scores_)
dfcolumns=pd.DataFrame(x.columns)
featurescores=pd.concat([dfcolumns,dfscores],axis=1)
featurescores.columns=['Specs','Scores']

In [11]:
print(featurescores)

                     Specs      Scores
0             texture_mean  126.842106
1           perimeter_mean  658.322918
2          smoothness_mean   86.789422
3         compactness_mean  312.873240
4            symmetry_mean   70.965034
5   fractal_dimension_mean    0.252105
6               texture_se    0.511491
7                  area_se  532.617592
8            smoothness_se    1.855606
9        concave points_se  124.571606
10             symmetry_se    1.671978
11    fractal_dimension_se   16.078365


In [12]:
df1=featurescores.nlargest(9,'Scores')
print(df1)

                   Specs      Scores
1         perimeter_mean  658.322918
7                area_se  532.617592
3       compactness_mean  312.873240
0           texture_mean  126.842106
9      concave points_se  124.571606
2        smoothness_mean   86.789422
4          symmetry_mean   70.965034
11  fractal_dimension_se   16.078365
8          smoothness_se    1.855606


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Train Data with 9 Input Features

In [14]:
x=df.drop(['diagnosis_code','fractal_dimension_se','symmetry_se','texture_se'],axis=1)
y=df.diagnosis_code

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=4)

logmodel=LogisticRegression()
logmodel.fit(x_train,y_train)

predictions=logmodel.predict(x_test)

r=confusion_matrix(y_test,predictions)
print(r)

[[107  10]
 [  3  51]]


In [15]:
print("accuracy:",accuracy_score(y_test,predictions)*100)
print("True Positive Rate (TPR) :",(r[0][0]/(r[0][0]+r[1][0]))*100)
print("True Negative Rate (TNR) :",(r[1][1]/(r[1][1]+r[0][1]))*100)
print("False Negative Rate (FNR) :",(r[1][0]/(r[1][0]+r[0][0]))*100)
print("False Positive Rate (FPR) :",(r[0][1]/(r[0][1]+r[1][1]))*100)
x1=(classification_report(y_test,predictions))
print(x1)

accuracy: 92.39766081871345
True Positive Rate (TPR) : 97.27272727272728
True Negative Rate (TNR) : 83.60655737704919
False Negative Rate (FNR) : 2.727272727272727
False Positive Rate (FPR) : 16.39344262295082
              precision    recall  f1-score   support

           0       0.97      0.91      0.94       117
           1       0.84      0.94      0.89        54

   micro avg       0.92      0.92      0.92       171
   macro avg       0.90      0.93      0.91       171
weighted avg       0.93      0.92      0.93       171



# Train Data With 10 Input Features

In [16]:
x=df.drop(['diagnosis_code','fractal_dimension_se','texture_se',],axis=1)
y=df.diagnosis_code

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=4)

logmodel=LogisticRegression()
logmodel.fit(x_train,y_train)

predictions=logmodel.predict(x_test)

r=confusion_matrix(y_test,predictions)
print(r)

[[107  10]
 [  3  51]]


In [17]:
print("accuracy:",accuracy_score(y_test,predictions)*100)
print("True Positive Rate (TPR) :",(r[0][0]/(r[0][0]+r[1][0]))*100)
print("True Negative Rate (TNR) :",(r[1][1]/(r[1][1]+r[0][1]))*100)
print("False Negative Rate (FNR) :",(r[1][0]/(r[1][0]+r[0][0]))*100)
print("False Positive Rate (FPR) :",(r[0][1]/(r[0][1]+r[1][1]))*100)
x1=(classification_report(y_test,predictions))
print(x1)

accuracy: 92.39766081871345
True Positive Rate (TPR) : 97.27272727272728
True Negative Rate (TNR) : 83.60655737704919
False Negative Rate (FNR) : 2.727272727272727
False Positive Rate (FPR) : 16.39344262295082
              precision    recall  f1-score   support

           0       0.97      0.91      0.94       117
           1       0.84      0.94      0.89        54

   micro avg       0.92      0.92      0.92       171
   macro avg       0.90      0.93      0.91       171
weighted avg       0.93      0.92      0.93       171



# Train Data With 11 Input Features


In [18]:
x=df.drop(['diagnosis_code','fractal_dimension_se'],axis=1)
y=df.diagnosis_code

X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=4)

logmodel=LogisticRegression()
logmodel.fit(X_train,y_train)

predictions=logmodel.predict(X_test)

r=confusion_matrix(y_test,predictions)
print(r)

[[108   9]
 [  4  50]]


In [19]:
print("accuracy:",accuracy_score(y_test,predictions)*100)
print("True Positive Rate (TPR) :",(r[0][0]/(r[0][0]+r[1][0]))*100)
print("True Negative Rate (TNR) :",(r[1][1]/(r[1][1]+r[0][1]))*100)
print("False Negative Rate (FNR) :",(r[1][0]/(r[1][0]+r[0][0]))*100)
print("False Positive Rate (FPR) :",(r[0][1]/(r[0][1]+r[1][1]))*100)
x1=(classification_report(y_test,predictions))
print(x1)

accuracy: 92.39766081871345
True Positive Rate (TPR) : 96.42857142857143
True Negative Rate (TNR) : 84.7457627118644
False Negative Rate (FNR) : 3.571428571428571
False Positive Rate (FPR) : 15.254237288135593
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       117
           1       0.85      0.93      0.88        54

   micro avg       0.92      0.92      0.92       171
   macro avg       0.91      0.92      0.91       171
weighted avg       0.93      0.92      0.92       171



# Train Data With 12 Input Features

In [20]:
x=df.drop(['diagnosis_code'],axis=1)
y=df.diagnosis_code

X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=4)

logmodel=LogisticRegression()
logmodel.fit(X_train,y_train)

predictions=logmodel.predict(X_test)

r=confusion_matrix(y_test,predictions)
print(r)

[[107  10]
 [  4  50]]


In [21]:
print("accuracy:",accuracy_score(y_test,predictions)*100)
print("True Positive Rate (TPR) :",(r[0][0]/(r[0][0]+r[1][0]))*100)
print("True Negative Rate (TNR) :",(r[1][1]/(r[1][1]+r[0][1]))*100)
print("False Negative Rate (FNR) :",(r[1][0]/(r[1][0]+r[0][0]))*100)
print("False Positive Rate (FPR) :",(r[0][1]/(r[0][1]+r[1][1]))*100)
x1=(classification_report(y_test,predictions))
print(x1)

accuracy: 91.81286549707602
True Positive Rate (TPR) : 96.3963963963964
True Negative Rate (TNR) : 83.33333333333334
False Negative Rate (FNR) : 3.6036036036036037
False Positive Rate (FPR) : 16.666666666666664
              precision    recall  f1-score   support

           0       0.96      0.91      0.94       117
           1       0.83      0.93      0.88        54

   micro avg       0.92      0.92      0.92       171
   macro avg       0.90      0.92      0.91       171
weighted avg       0.92      0.92      0.92       171

