# Random Forest (RF)

In [160]:
"""BreastCancerAI (c) by Roi Dvir
BreastCancerAI is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License.
You should have received a copy of the license along with this work. If not, see <http://creativecommons.org/licenses/by-nc-nd/4.0/>."""


# import libraries
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.cm import rainbow
%matplotlib inline
import statsmodels.api as sm
import scipy.stats as st
import warnings
warnings.filterwarnings('ignore')

In [161]:
# Sklearn library for implementing Machine Learning models and processing of data
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score

In [162]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [163]:
# Load the breast-cancer table and drop the 'ID number' column.
# The drop is done by assignment instead of inplace=True so the cell
# produces `dataset` in a single expression.
dataset = pd.read_csv('./data/Breast_Cancer_Data_CSV.csv').drop(columns=['ID number'])
print("Cancer data set dimensions : {}".format(dataset.shape))
# Last expression of the cell: rendered as the cell output.
dataset.head()

Cancer data set dimensions : (569, 31)


Unnamed: 0,radius.mean,texture.mean,perimeter.mean,area.mean,smoothness.mean,compactness.mean,concavity.mean,concave points.mean,symmetry.mean,fractal dimension,...,texture.w,perimeter.w,area.w,smoothness.w,compactness.w,concavity.w,concave points.w,symmetry.w,fractal dimension.w,Diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,M
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,M
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,M
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,M
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,M


In [164]:
# Preview the first rows again (same call that ends the previous cell).
dataset.head()

Unnamed: 0,radius.mean,texture.mean,perimeter.mean,area.mean,smoothness.mean,compactness.mean,concavity.mean,concave points.mean,symmetry.mean,fractal dimension,...,texture.w,perimeter.w,area.w,smoothness.w,compactness.w,concavity.w,concave points.w,symmetry.w,fractal dimension.w,Diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,M
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,M
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,M
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,M
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,M


In [165]:
#features list

In [166]:
# Select the ten predictor columns by name.
# These are the columns previously picked by position
# (23, 20, 22, 27, 7, 2, 0, 13, 3, 24). Using names keeps the cell safe to
# re-run: `dataset` is narrowed below, so those positions no longer exist
# once the cell has executed and positional indexing would raise.
features = [
    'area.w',
    'radius.w',
    'perimeter.w',
    'concave points.w',
    'concave points.mean',
    'perimeter.mean',
    'radius.mean',
    'area.std',
    'area.mean',
    'smoothness.w',
]

# The target ('Diagnosis') is the last column both before and after the
# narrowing, so it is appended last here as well.
features.append(dataset.columns[-1])
dataset = dataset[features]

In [167]:
# Display the narrowed frame: the ten selected features followed by the target column.
dataset

Unnamed: 0,area.w,radius.w,perimeter.w,concave points.w,concave points.mean,perimeter.mean,radius.mean,area.std,area.mean,smoothness.w,Diagnosis
0,2019.0,25.380,184.60,0.26540,0.147100,122.80,17.990,153.400,1001.0,0.16220,M
1,1956.0,24.990,158.80,0.18600,0.070170,132.90,20.570,74.080,1326.0,0.12380,M
2,1709.0,23.570,152.50,0.24300,0.127900,130.00,19.690,94.030,1203.0,0.14440,M
3,567.7,14.910,98.87,0.25750,0.105200,77.58,11.420,27.230,386.1,0.20980,M
4,1575.0,22.540,152.20,0.16250,0.104300,135.10,20.290,94.440,1297.0,0.13740,M
5,741.6,15.470,103.40,0.17410,0.080890,82.57,12.450,27.190,477.1,0.17910,M
6,1606.0,22.880,153.20,0.19320,0.074000,119.60,18.250,53.910,1040.0,0.14420,M
7,897.0,17.060,110.60,0.15560,0.059850,90.20,13.710,50.960,577.9,0.16540,M
8,739.3,15.490,106.20,0.20600,0.093530,87.50,13.000,24.320,519.8,0.17030,M
9,711.4,15.090,97.65,0.22100,0.085430,83.97,12.460,23.940,475.9,0.18530,M


In [168]:
def find_features_importance(n):
    """Return the positions of the ``n`` highest-ranked features.

    A 100-tree random forest (``max_features=10``) is fitted on the
    notebook-level ``X_train`` / ``y_train``. The features are then ordered
    by decreasing ``feature_importances_`` and the first ``n`` column
    positions are returned as an integer array.
    """
    forest = RandomForestClassifier(n_estimators=100, max_features=10)
    forest.fit(X_train, y_train)
    # Negating the scores turns the ascending argsort into a descending ranking.
    ranking = (-forest.feature_importances_).argsort()
    return ranking[:n]

In [169]:
# split dataframe into two based on diagnosis
# 'Diagnosis' still holds the raw string labels here (e.g. 'M'); it is only
# encoded to integers further down, and only in the separate array `Y`.
# Comparing the column against 1 / 0 therefore matched no rows at all.
dfM = dataset[dataset['Diagnosis'] == 'M']
dfB = dataset[dataset['Diagnosis'] != 'M']  # every row not labelled 'M'

# Shuffle the rows, then separate predictors (all but the last column)
# from the target (last column).
dataset1 = shuffle(dataset)
X = dataset1.iloc[:, :-1].values
Y = dataset1.iloc[:, -1].values

# Encoding categorical data values
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

# split our dataset into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Fit the scaler on the training part only and apply the same transform to
# both parts, so the test rows do not influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [170]:
# Rank all ten features, most important first. The integers returned are
# column positions within X (the ten selected features), not positions in
# the original CSV.
find_features_importance(10)

array([2, 3, 0, 1, 4, 9, 7, 8, 6, 5])

In [171]:
def values_confusion_matrix(y_test,y_pred):
    """Summarise a binary confusion matrix as four rounded metrics.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    numpy.ndarray
        ``[Accuracy, Sensitivity, Specifity, not_detected]`` where

        * Accuracy     = (TN + TP) / (TN + TP + FN + FP), rounded to 3 places
        * Sensitivity  = TP / (TP + FN), rounded to 3 places
        * Specifity    = TN / (TN + FP), rounded to 3 places
        * not_detected = FN / (FN + TN), rounded to 5 places

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, i.e. the labels do not form
        exactly two classes.
    """
    cm = confusion_matrix(y_test, y_pred)
    # The cell indexing below is only meaningful for a two-class problem;
    # fail with a clear message instead of an indexing error or a silently
    # wrong result.
    if cm.shape != (2, 2):
        raise ValueError(
            "values_confusion_matrix expects a binary problem; "
            "got a confusion matrix of shape {}".format(cm.shape)
        )

    # Rows are actual classes, columns are predicted classes.
    TN, FP = cm[0, 0], cm[0, 1]
    FN, TP = cm[1, 0], cm[1, 1]

    Accuracy = round((TN + TP) / float(TN + TP + FN + FP), 3)
    Sensitivity = round(TP / float(TP + FN), 3)
    Specifity = round(TN / float(TN + FP), 3)
    not_detected = round(FN / (FN + TN), 5)
    return np.array([Accuracy, Sensitivity, Specifity, not_detected])

In [172]:
def draw_confusion_matrix(y_test,y_pred):
    """Print the summary metrics and draw the confusion matrix as a heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    The metrics come from ``values_confusion_matrix`` so that the printed
    numbers and the ones collected elsewhere in the notebook share one
    definition. Nothing is returned; the function prints one line and
    creates an 8x5 matplotlib figure.
    """
    Accuracy, Sensitivity, Specifity, not_detected = values_confusion_matrix(y_test, y_pred)
    print ('Accuracy=',Accuracy,'Sensitivity=', Sensitivity, 'Specifity=' ,Specifity,' notdetected=' , not_detected)

    # Label the raw counts so the heatmap axes read as actual vs. predicted class.
    cm = confusion_matrix(y_test, y_pred)
    conf_matrix = pd.DataFrame(
        data=cm,
        columns=['Predicted:0', 'Predicted:1'],
        index=['Actual:0', 'Actual:1'],
    )
    plt.figure(figsize = (8,5))
    sn.heatmap(conf_matrix, annot=True,fmt='d',cmap="YlGnBu")

In [173]:
def rf_model():
    """Fit a random forest, print its test score and plot the confusion matrix.

    Uses the notebook-level ``X_train`` / ``y_train`` for fitting and
    ``X_test`` / ``y_test`` for evaluation. Returns nothing.
    """
    forest = RandomForestClassifier(max_features=10, n_estimators=100)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)
    test_score = forest.score(X_test, y_test)
    print("score=", test_score)
    draw_confusion_matrix(y_test, predictions)

In [174]:
def rf_model_1():
    """Fit a random forest on the training split and return its test predictions.

    Reads the notebook-level ``X_train``, ``y_train`` and ``X_test``; the
    return value is the result of ``predict(X_test)``.
    """
    forest = RandomForestClassifier(max_features=10, n_estimators=100)
    forest.fit(X_train, y_train)
    return forest.predict(X_test)

In [175]:
def rf_model_2():
    """Fit a random forest on the training split and return test probabilities.

    Reads the notebook-level ``X_train``, ``y_train`` and ``X_test``; the
    return value is the result of ``predict_proba(X_test)``.
    """
    forest = RandomForestClassifier(max_features=10, n_estimators=100)
    forest.fit(X_train, y_train)
    return forest.predict_proba(X_test)

In [176]:
# Show the predicted probabilities for every row of X_test
# (one row per test sample, one column per encoded class).
rf_model_2()

array([[1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.98, 0.02],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.1 , 0.9 ],
       [0.  , 1.  ],
       [0.94, 0.06],
       [0.42, 0.58],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.93, 0.07],
       [0.  , 1.  ],
       [0.99, 0.01],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.78, 0.22],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.06, 0.94],
       [0.  , 1.  ],
       [0.83, 0.17],
       [1.  , 0.  ],
       [0.91, 0.09],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.9 , 0.1 ],
       [0.02, 0.98],
       [1.  ,

In [177]:
def cross_valid_score(n):
    """Cross-validate a random forest on ``X`` / ``Y`` and print the summary.

    ``n`` is forwarded to ``cross_val_score`` as ``cv``. The mean and the
    standard deviation of the resulting scores are printed; nothing is
    returned.
    """
    forest = RandomForestClassifier(max_features=10, n_estimators=100)
    fold_scores = cross_val_score(forest, X, Y, cv=n)
    mean_score = np.average(fold_scores)
    spread = np.std(fold_scores)
    print('average=', mean_score, 'std=', spread)

In [178]:
# Cross-validate with cv=5 and print the mean and standard deviation of the scores.
cross_valid_score(5)

average= 0.9437322046941132 std= 0.017422629244133223


In [179]:
# Repeat the shuffle / split / scale / fit / evaluate cycle many times and
# collect the four metrics of each run, to see how much they vary.
N_RUNS = 1000  # number of repetitions; also the number of rows in `para`

# One row per run; columns follow values_confusion_matrix:
# [Accuracy, Sensitivity, Specifity, not_detected]
para = np.zeros([N_RUNS, 4])

for i in range(N_RUNS):
    # train_test_split uses a fixed random_state, so the run-to-run variation
    # comes from this unseeded shuffle (and from the forest itself).
    dataset1 = shuffle(dataset)
    X = dataset1.iloc[:, :-1].values
    Y = dataset1.iloc[:, -1].values

    # Encoding categorical data values
    labelencoder_Y = LabelEncoder()
    Y = labelencoder_Y.fit_transform(Y)

    # split our dataset into training and testing datasets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # rf_model_1 reads X_train / y_train / X_test from the notebook namespace,
    # which is why they are reassigned at top level above.
    y_pred = rf_model_1()
    a = values_confusion_matrix(y_test, y_pred)
    para[i, :] = a

# Per-metric mean and standard deviation over all runs.
print(np.average(para, axis=0))
print(np.std(para, axis=0))

[0.94058  0.913552 0.956707 0.050902]
[0.01570241 0.03517626 0.02044185 0.02074324]
