In [25]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RF
import sklearn.model_selection as skm
from sklearn.model_selection import GridSearchCV
from ISLP import confusion_table

In [26]:
#read the raw data from disk
df = pd.read_csv('dataQTM.csv')

#response: the diagnosis label
y = df['diagnosis']

#predictors: everything except the label, the id and the empty column, cast to float
X = df.drop(columns=['diagnosis', 'id', 'Unnamed: 32']).astype('float64')

#split: training size 70% and testing size 30% (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = skm.train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=123,
)

In [35]:
#create a parameters grid
#max_features is searched from 1 up to the number of predictors in X_train,
#so the grid follows the data instead of a hard-coded column count
n_features = X_train.shape[1]
param_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'max_features': range(1, n_features + 1),
    'min_samples_split': [2, 5, 10, 15]
}

In [36]:
#create a Random Forest classifier (fixed seed so every fit is repeatable)
RF_model = RF(random_state=123)

#create the GridSearchCV object: 5-fold cross-validation scored by accuracy
#n_jobs=-1 spreads the candidate fits over all available cores; the selected
#parameters should be unchanged because the estimator seed is fixed
grid_search = GridSearchCV(
    estimator=RF_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

#fit the model (one forest per parameter combination per fold)
grid_search.fit(X_train, y_train)

#get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'max_features': 4, 'min_samples_split': 2, 'n_estimators': 200}


In [40]:
#change B to 0 and M to 1 for y_train and y_test
#labels that are already 0/1 pass through unchanged, so the cell can be re-run;
#the explicit astype fixes the dtype instead of relying on replace() to infer it
label_codes = {'B': 0, 'M': 1}
y_train = y_train.map(lambda label: label_codes.get(label, label)).astype('int64')
y_test = y_test.map(lambda label: label_codes.get(label, label)).astype('int64')

#refit the Random Forest classifier with the best parameters and predict y
#the parameters are taken from best_params (set by the grid-search cell) rather
#than copied by hand, so they cannot drift from what the search selected
RF_model = RF(**best_params, random_state=123).fit(X_train, y_train)
RF_y_pred = RF_model.predict(X_test)

#count the number of misclassified results
misclassified_RF = (RF_y_pred != y_test).sum()

#calculate the misclassification rate
total = len(y_test)
RF_misclassification_rate = misclassified_RF / total
RF_misclassification_rate

0.017543859649122806

In [38]:
#create a confusion table
#the predictions are attached to y_test's index (positionally, via np.asarray)
#so predicted and true labels stay row-aligned instead of getting a fresh 0..n-1 index
RF_y_pred = pd.Series(np.asarray(RF_y_pred), index=y_test.index)

#show both predicted and true labels as B/M again for a readable table
RF_y_pred = RF_y_pred.replace({0:'B', 1:'M'})
y_test= y_test.replace({0:'B', 1:'M'})
confusion_table(RF_y_pred, y_test)

Truth,B,M
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
B,102,2
M,1,66


In [21]:
#rank every predictor by its importance in the fitted forest, largest first
feature_names = X_train.columns.tolist()
feature_imp = pd.DataFrame(
    RF_model.feature_importances_,
    index=feature_names,
    columns=['importance'],
)
feature_imp.sort_values('importance', ascending=False)

Unnamed: 0,importance
perimeter_worst,0.149067
area_worst,0.138984
radius_worst,0.122468
concave points_worst,0.098459
concave points_mean,0.091233
area_mean,0.055384
perimeter_mean,0.042927
radius_mean,0.042788
concavity_mean,0.039098
area_se,0.034434
