In [2]:
# Importing modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
cancer_data = pd.read_csv('Dataset\cancer.csv')
cancer_data.head()

Unnamed: 0,Id,Diagnosis,Radius (mean),Texture (mean),Perimeter (mean),Area (mean),Smoothness (mean),Compactness (mean),Concavity (mean),Concave points (mean),...,Radius (worst),Texture (worst),Perimeter (worst),Area (worst),Smoothness (worst),Compactness (worst),Concavity (worst),Concave points (worst),Symmetry (worst),Fractal dimension (worst)
0,8510426,B,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,...,15.11,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259
1,8510653,B,13.08,15.71,85.63,520.0,0.1075,0.127,0.04568,0.0311,...,14.5,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183
2,8510824,B,9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,...,10.23,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773
3,854941,B,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,...,13.3,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169
4,85713702,B,8.196,16.84,51.71,201.9,0.086,0.05943,0.01588,0.005917,...,8.964,21.96,57.26,242.2,0.1297,0.1357,0.0688,0.02564,0.3105,0.07409


In [4]:
# Since Id is not useful for our analysis, we remove it.
cancer_data.drop(columns= 'Id', axis=1, inplace= True)

In [5]:
cancer_data.head()

Unnamed: 0,Diagnosis,Radius (mean),Texture (mean),Perimeter (mean),Area (mean),Smoothness (mean),Compactness (mean),Concavity (mean),Concave points (mean),Symmetry (mean),...,Radius (worst),Texture (worst),Perimeter (worst),Area (worst),Smoothness (worst),Compactness (worst),Concavity (worst),Concave points (worst),Symmetry (worst),Fractal dimension (worst)
0,B,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,...,15.11,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259
1,B,13.08,15.71,85.63,520.0,0.1075,0.127,0.04568,0.0311,0.1967,...,14.5,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183
2,B,9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,0.1815,...,10.23,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773
3,B,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,...,13.3,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169
4,B,8.196,16.84,51.71,201.9,0.086,0.05943,0.01588,0.005917,0.1769,...,8.964,21.96,57.26,242.2,0.1297,0.1357,0.0688,0.02564,0.3105,0.07409


In [6]:
# Getting Dataset Info
cancer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Diagnosis                  569 non-null    object 
 1   Radius (mean)              569 non-null    float64
 2   Texture (mean)             569 non-null    float64
 3   Perimeter (mean)           569 non-null    float64
 4   Area (mean)                569 non-null    float64
 5   Smoothness (mean)          569 non-null    float64
 6   Compactness (mean)         569 non-null    float64
 7   Concavity (mean)           569 non-null    float64
 8   Concave points (mean)      569 non-null    float64
 9   Symmetry (mean)            569 non-null    float64
 10  Fractal dimension (mean)   569 non-null    float64
 11  Radius (se)                569 non-null    float64
 12  Texture (se)               569 non-null    float64
 13  Perimeter (se)             569 non-null    float64

In [7]:
# We need to create a classification model to classify whether the tumor is benign or malignant.
# Step 1: Splitting independent and dependent data
x = cancer_data.drop(columns= 'Diagnosis', axis= 1)
y = cancer_data['Diagnosis']

In [8]:
x.shape, y.shape

((569, 30), (569,))

In [9]:
# Step 2: Feature Scaling and Label Encoding
from sklearn.preprocessing import StandardScaler, LabelEncoder
le = LabelEncoder()
sc = StandardScaler()
x = sc.fit_transform(x)
y = le.fit_transform(y)

In [10]:
x

array([[-0.16679919, -1.1471623 , -0.18572799, ...,  0.21612292,
         0.12334653, -0.62929189],
       [-0.29744572, -0.83300824, -0.26110605, ..., -0.63610973,
         0.45822712, -0.11724974],
       [-1.31308049, -1.59395919, -1.30280622, ..., -0.7969026 ,
        -0.72922385, -0.34445459],
       ...,
       [ 1.70485436,  2.08513394,  1.61593137, ...,  0.73382724,
        -0.53185462, -0.97397828],
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528]])

In [11]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
# Step 3: Splitting the data into training set and test set
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size= 0.3)

In [13]:
# Step 4: Building the model
from sklearn.ensemble import RandomForestClassifier as clsf
classifier = clsf()

In [14]:
classifier.fit(xtrain, ytrain)
ypred= classifier.predict(xtest)

In [15]:
# Building metrics
from sklearn.metrics import classification_report, accuracy_score


              precision    recall  f1-score   support

           0       0.96      0.98      0.97       109
           1       0.97      0.92      0.94        62

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171

95.90643274853801 % accuracy


In [20]:
print(classification_report(ytest, ypred))
print('Accuracy: %.2f%% ' %(accuracy_score(ytest, ypred)*100))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       109
           1       0.97      0.92      0.94        62

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171

Accuracy: 95.91% 


In [16]:
# Cross Validating the Model
from sklearn.model_selection import cross_val_score
cv= cross_val_score(classifier, xtrain, ytrain, cv= 10)
print('Average model accuracy: %.2f%%' %(cv.mean()*100))
print('Standard deviation among model accuracy: %.2f%%' %(cv.std()*100))

Average model accuracy: 96.47%
Standard deviation among model accuracy: 2.02%
