## Importing necessary modules

In [80]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [81]:
import warnings
warnings.filterwarnings('ignore')

## Importing dataset and converting into a pandas dataframe

In [82]:
df=pd.read_csv("breast-cancer.csv")
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Data analysis

In [83]:
df.shape #to print no of rows and columns in data

(569, 32)

In [84]:
df.isnull().sum()

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

Therefore there are no missing values in any column of the given dataset.

In [85]:
df=df.drop(['id'], axis=1) #to drop the id column since pandas provide default indexing

In [86]:
#to encode the diagnosis column into numerical data [M:1,B:0]
labelencoder_X=LabelEncoder() 
df.iloc[:,0]=labelencoder_X.fit_transform(df.iloc[:,0].values)

In [87]:
df.tail(10)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
559,0,11.51,23.93,74.52,403.5,0.09261,0.1021,0.1112,0.04105,0.1388,...,12.48,37.16,82.28,474.2,0.1298,0.2517,0.363,0.09653,0.2112,0.08732
560,0,14.05,27.15,91.38,600.4,0.09929,0.1126,0.04462,0.04304,0.1537,...,15.3,33.17,100.2,706.7,0.1241,0.2264,0.1326,0.1048,0.225,0.08321
561,0,11.2,29.37,70.67,386.0,0.07449,0.03558,0.0,0.0,0.106,...,11.92,38.3,75.19,439.6,0.09267,0.05494,0.0,0.0,0.1566,0.05905
562,1,15.22,30.62,103.4,716.9,0.1048,0.2087,0.255,0.09429,0.2128,...,17.52,42.79,128.7,915.0,0.1417,0.7917,1.17,0.2356,0.4089,0.1409
563,1,20.92,25.09,143.0,1347.0,0.1099,0.2236,0.3174,0.1474,0.2149,...,24.29,29.41,179.1,1819.0,0.1407,0.4186,0.6599,0.2542,0.2929,0.09873
564,1,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,...,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115
565,1,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
566,1,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,...,18.98,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782
567,1,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,...,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124
568,0,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039


## Splitting data for dependence

In [88]:
X=df.iloc[:,1:].values#Features of cancerous and non cancerous patients
Y=df.iloc[:,0].values#Whether patient has cancer or not

# Data Pre-processing

## Normalizing the data 

In [89]:
min_max=MinMaxScaler()
X=min_max.fit_transform(X)
t_df=pd.DataFrame(X,columns=["radius_mean","texture_mean","perimeter_mean","area_mean","smoothness_mean","compactness_mean","concavity_mean","concave points_mean","symmetry_mean","fractal_dimension_mean","radius_se","texture_se","perimeter_se","area_se","smoothness_se","compactness_se","concavity_se","concave points_se","symmetry_se","fractal_dimension_se","radius_worst","texture_worst","perimeter_worst","area_worst","smoothness_worst","compactness_worst","concavity_worst","concave points_worst","symmetry_worst","fractal_dimension_worst"])
t_df.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.620776,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.606901,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.556386,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.24831,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.519744,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595


## Train-Test Split

In [90]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=1)

## Upsampling 

In [91]:
print(pd.DataFrame(Y_train).value_counts())

0    285
1    170
dtype: int64


since the value of 1 is less than 0, so the data needs upsampling.

In [92]:
over_sampling = RandomOverSampler(random_state=42)
X_train_over, Y_train_over  = over_sampling.fit_resample(X_train, Y_train)

In [93]:
print("After oversampling: \n", pd.DataFrame(Y_train_over).value_counts())

After oversampling: 
 0    285
1    285
dtype: int64


## Data Scaling 

In [94]:
sc=StandardScaler()
X_train=sc.fit_transform(X_train)#Scaling X_train
X_test=sc.fit_transform(X_test)#Scaling X_test

# Feeding processed Dataset to the model


In [95]:
model_accuracies={}

## 1. Random Forest

In [96]:
forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
forest.fit(X_train, Y_train)

### Prediction and Their Evaluation

In [97]:
print("Random Forest:",forest.score(X_train,Y_train))

Random Forest: 0.9978021978021978


In [98]:
print("\nModel: Random forest")
print("Classification Report")
print(classification_report(Y_test,forest.predict(X_test)))
print("Accuracy Score:",accuracy_score(Y_test,forest.predict(X_test)))
model_accuracies['Random Forests Classifier'] = accuracy_score(Y_test,forest.predict(X_test))


Model: Random forest
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.99      0.97        72
           1       0.97      0.90      0.94        42

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114

Accuracy Score: 0.956140350877193


## 2. Logistic Regression

In [99]:
log=LogisticRegression(random_state=0)
log.fit(X_train,Y_train)

### Prediction and Evaluation

In [100]:
print("Logistic regression:",log.score(X_train,Y_train))

Logistic regression: 0.9912087912087912


In [101]:
print("\nModel: Logistic Regression")
print("Classification Report")
print(classification_report(Y_test,log.predict(X_test)))
print("Accuracy Score:",accuracy_score(Y_test,log.predict(X_test)))
model_accuracies['Logistic Regression'] = accuracy_score(Y_test,log.predict(X_test))


Model: Logistic Regression
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        72
           1       0.98      0.95      0.96        42

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

Accuracy Score: 0.9736842105263158


## 3. K-Nearest Neighbour

In [102]:
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn.fit(X_train, Y_train)


###  prediction and evaluation

In [103]:
print("K-Nearest Neighbour:",knn.score(X_train,Y_train))

K-Nearest Neighbour: 0.9824175824175824


In [104]:
print("\nModel: K-Nearest Neighbour")
print("Classification Report")
print(classification_report(Y_test,knn.predict(X_test)))
print("Accuracy Score:",accuracy_score(Y_test,knn.predict(X_test)))
model_accuracies['K-Nearest Neighbour'] = accuracy_score(Y_test,knn.predict(X_test))


Model: K-Nearest Neighbour
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.99      0.96        72
           1       0.97      0.88      0.93        42

    accuracy                           0.95       114
   macro avg       0.95      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114

Accuracy Score: 0.9473684210526315


## 4. Support Vector Machine

In [105]:
svm = SVC(kernel = 'linear', random_state = 0)
svm.fit(X_train, Y_train)

###  prediction and evaluation

In [106]:
print("Support Vector Machine",svm.score(X_train,Y_train))

Support Vector Machine 0.9868131868131869


In [107]:
print("\nModel: Support Vector Machine")
print("Classification Report")
print(classification_report(Y_test,svm.predict(X_test)))
print("Accuracy Score:",accuracy_score(Y_test,svm.predict(X_test)))
model_accuracies['Support Vector Machine'] = accuracy_score(Y_test,svm.predict(X_test))


Model: Support Vector Machine
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        72
           1       0.95      0.95      0.95        42

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114

Accuracy Score: 0.9649122807017544


## 5. Decision Tree

In [108]:
tree=DecisionTreeClassifier(criterion='entropy',random_state=0)
tree.fit(X_train,Y_train)

### Prediction and Evaluation

In [109]:
print("Decision Tree:",tree.score(X_train,Y_train))

Decision Tree: 1.0


In [110]:
print("\nModel: Decision tree")
print("Classification Report")
print(classification_report(Y_test,tree.predict(X_test)))
print("Accuracy Score:",accuracy_score(Y_test,tree.predict(X_test)))
model_accuracies['Decision Tree Classifier'] = accuracy_score(Y_test,tree.predict(X_test))


Model: Decision tree
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.93      0.94        72
           1       0.88      0.90      0.89        42

    accuracy                           0.92       114
   macro avg       0.91      0.92      0.92       114
weighted avg       0.92      0.92      0.92       114

Accuracy Score: 0.9210526315789473


# Evaluation

In [111]:
print("Performance of ML Algorithms:")
print('1. Random Forest Classifier:', str(np.round(model_accuracies['Random Forests Classifier']*100,2)) + ' %')
print('2. Logistic Regression:', str(np.round(model_accuracies['Logistic Regression']*100,2)) + ' %')
print('3. K-Nearest Neighbour:', str(np.round(model_accuracies['K-Nearest Neighbour']*100,2)) + ' %')
print('4. Support Vector Machine:', str(np.round(model_accuracies['Support Vector Machine']*100,2)) + ' %')
print('5. Decision Tree Classifier:', str(np.round(model_accuracies['Decision Tree Classifier']*100,2)) + ' %')

Performance of ML Algorithms:
1. Random Forest Classifier: 95.61 %
2. Logistic Regression: 97.37 %
3. K-Nearest Neighbour: 94.74 %
4. Support Vector Machine: 96.49 %
5. Decision Tree Classifier: 92.11 %
