## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

## Load Dataset

In [2]:
# Load the breast cancer dataset from the CSV file into a DataFrame
df = pd.read_csv('BreastCancer.csv')

In [3]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

(699, 11)

In [4]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,Code,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
0,61634,5,4,3,1,2,2,2,3,1,Benign
1,63375,9,1,2,6,4,10,7,7,2,Malignant
2,76389,10,4,7,2,2,8,6,1,1,Malignant
3,95719,6,10,10,10,8,10,7,10,7,Malignant
4,128059,1,1,1,1,2,5,5,1,1,Benign


In [5]:
# Remove the "Code" column so it is not used as a model feature
# (presumably a sample identifier — confirm against the data dictionary).
# Rebinding `df` via drop(..., errors="ignore") keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError,
# which `del df["Code"]` would.
df = df.drop(columns=["Code"], errors="ignore")

In [6]:
# Preview the first rows again to confirm the "Code" column is gone
df.head()

Unnamed: 0,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
0,5,4,3,1,2,2,2,3,1,Benign
1,9,1,2,6,4,10,7,7,2,Malignant
2,10,4,7,2,2,8,6,1,1,Malignant
3,6,10,10,10,8,10,7,10,7,Malignant
4,1,1,1,1,2,5,5,1,1,Benign


In [7]:
# Count the missing entries in every column
df.isna().sum()

Clump            0
UniCell_Size     0
Uni_CellShape    0
MargAdh          0
SEpith           0
BareN            0
BChromatin       0
NoemN            0
Mitoses          0
Class            0
dtype: int64

In [8]:
# Summary statistics of the numeric columns, transposed to one row per feature
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Clump,699.0,4.41774,2.815741,1.0,2.0,4.0,6.0,10.0
UniCell_Size,699.0,3.134478,3.051459,1.0,1.0,1.0,5.0,10.0
Uni_CellShape,699.0,3.207439,2.971913,1.0,1.0,1.0,5.0,10.0
MargAdh,699.0,2.806867,2.855379,1.0,1.0,1.0,4.0,10.0
SEpith,699.0,3.216023,2.2143,1.0,2.0,2.0,4.0,10.0
BareN,699.0,3.533619,3.605543,1.0,1.0,1.0,5.0,10.0
BChromatin,699.0,3.437768,2.438364,1.0,2.0,3.0,5.0,10.0
NoemN,699.0,2.866953,3.053634,1.0,1.0,1.0,4.0,10.0
Mitoses,699.0,1.589413,1.715078,1.0,1.0,1.0,1.0,10.0


In [10]:
# Number of samples in each target class
df["Class"].value_counts()

Benign       458
Malignant    241
Name: Class, dtype: int64

In [11]:
# Share of each target class as a fraction of all samples
df["Class"].value_counts(normalize=True)

Benign       0.655222
Malignant    0.344778
Name: Class, dtype: float64

In [12]:
# Column dtypes, non-null counts and memory usage of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Clump          699 non-null    int64 
 1   UniCell_Size   699 non-null    int64 
 2   Uni_CellShape  699 non-null    int64 
 3   MargAdh        699 non-null    int64 
 4   SEpith         699 non-null    int64 
 5   BareN          699 non-null    int64 
 6   BChromatin     699 non-null    int64 
 7   NoemN          699 non-null    int64 
 8   Mitoses        699 non-null    int64 
 9   Class          699 non-null    object
dtypes: int64(9), object(1)
memory usage: 54.7+ KB


In [13]:
# Names of the columns currently in the frame
df.columns

Index(['Clump', 'UniCell_Size', 'Uni_CellShape', 'MargAdh', 'SEpith', 'BareN',
       'BChromatin', 'NoemN', 'Mitoses', 'Class'],
      dtype='object')

### Random Forest

In [18]:
# Feature matrix: every column except the last one (the last column is the target)
feature_columns = df.columns[:-1]
X = df.loc[:, feature_columns]

In [20]:
# Names of the feature columns kept in X
X.columns

Index(['Clump', 'UniCell_Size', 'Uni_CellShape', 'MargAdh', 'SEpith', 'BareN',
       'BChromatin', 'NoemN', 'Mitoses'],
      dtype='object')

In [19]:
from sklearn.preprocessing import LabelEncoder

# Target vector: the last column of the frame.
y = df.iloc[:, -1]

# Encode the string class labels as integers; in the displayed output
# "Benign" becomes 0 and "Malignant" becomes 1.
# The encoder handles the tumour class, not a gender field, so it is named
# accordingly. The original name is kept as an alias for backward compatibility.
class_encoder = LabelEncoder()
gender_encoder = class_encoder
y = class_encoder.fit_transform(y)
y

array([0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,

In [21]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training and the rest for testing; the fixed
# random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=123,
)


In [22]:
# Print the shape of each piece of the split, features first, then targets
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(559, 9)
(140, 9)
(559,)
(140,)


### With default parameters

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest with scikit-learn's default hyper-parameters.
# A random forest is stochastic, so the seed is fixed (the same value used for
# the train/test split) to make the reported metrics reproducible after
# Restart & Run All.
rf = RandomForestClassifier(random_state=123)
rf.fit(X_train, y_train)   # training

**prediction on test data**

In [24]:
# Evaluate the fitted forest on the test split
print('****** prediction on test data *******')
predict_test = rf.predict(X_test)

# Both metrics take the actual labels (y_test) first, then the predictions (predict_test)
print('Confusion Matrix', confusion_matrix(y_test, predict_test), sep='\n')

print('---------------------------------------------------')
print('Classification Report', classification_report(y_test, predict_test), sep='\n')

****** prediction on test data *******
Confusion Matrix
[[88  3]
 [ 2 47]]
---------------------------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.97      0.97        91
           1       0.94      0.96      0.95        49

    accuracy                           0.96       140
   macro avg       0.96      0.96      0.96       140
weighted avg       0.96      0.96      0.96       140



**prediction on train data**

In [25]:
# Evaluate the fitted forest on the training split (compare with the test
# metrics to see how much the model overfits)
print('****** prediction on train data *******')
predict_train = rf.predict(X_train)

# Both metrics take the actual labels (y_train) first, then the predictions (predict_train)
print('Confusion Matrix', confusion_matrix(y_train, predict_train), sep='\n')

print('---------------------------------------------------')
print('Classification Report', classification_report(y_train, predict_train), sep='\n')

****** prediction on train data *******
Confusion Matrix
[[367   0]
 [  0 192]]
---------------------------------------------------
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       367
           1       1.00      1.00      1.00       192

    accuracy                           1.00       559
   macro avg       1.00      1.00      1.00       559
weighted avg       1.00      1.00      1.00       559



### With different parameters

In [26]:
from sklearn.ensemble import RandomForestClassifier

# Forest with hand-picked hyper-parameters: 200 trees, entropy split criterion
# and trees limited to depth 6. Note that this rebinds `rf`, replacing the
# default-parameter model fitted earlier.
# The seed is fixed (the same value used for the train/test split) so the
# reported metrics are reproducible after Restart & Run All.
rf = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_depth=6,
    random_state=123,
)
rf.fit(X_train, y_train)   # training

In [27]:
# Evaluate the re-fitted forest on the test split
print('****** prediction on test data *******')
predict_test = rf.predict(X_test)

# Both metrics take the actual labels (y_test) first, then the predictions (predict_test)
print('Confusion Matrix', confusion_matrix(y_test, predict_test), sep='\n')

print('---------------------------------------------------')
print('Classification Report', classification_report(y_test, predict_test), sep='\n')

****** prediction on test data *******
Confusion Matrix
[[87  4]
 [ 1 48]]
---------------------------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.99      0.96      0.97        91
           1       0.92      0.98      0.95        49

    accuracy                           0.96       140
   macro avg       0.96      0.97      0.96       140
weighted avg       0.97      0.96      0.96       140



In [28]:
# Evaluate the re-fitted forest on the training split
print('****** prediction on train data *******')
predict_train = rf.predict(X_train)

# Both metrics take the actual labels (y_train) first, then the predictions (predict_train)
print('Confusion Matrix', confusion_matrix(y_train, predict_train), sep='\n')

print('---------------------------------------------------')
print('Classification Report', classification_report(y_train, predict_train), sep='\n')

****** prediction on train data *******
Confusion Matrix
[[360   7]
 [  0 192]]
---------------------------------------------------
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       367
           1       0.96      1.00      0.98       192

    accuracy                           0.99       559
   macro avg       0.98      0.99      0.99       559
weighted avg       0.99      0.99      0.99       559



In [29]:
# Pickling of model
import pickle

# Serialize the most recently fitted forest to disk. The context manager
# closes the file handle as soon as the dump finishes, instead of leaving an
# anonymous open() handle to be closed whenever it is garbage-collected.
with open("RF_Breast_Cancer.pkl", 'wb') as model_file:
    pickle.dump(rf, model_file)

In [31]:
# Reload the saved model and predict on every row of the dataset (features
# only, the target column is dropped) as a round-trip check of the pickle.
# NOTE: pickle.load can execute arbitrary code — only load files you created
# or otherwise trust.
# The context manager closes the file handle once the model is loaded.
with open("RF_Breast_Cancer.pkl", 'rb') as model_file:
    pickled_model = pickle.load(model_file)
pickled_model.predict(df.drop('Class',axis=1))

array([0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,

## End