In [213]:
# Standard library
import os
from statistics import mean

# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, brier_score_loss
from sklearn.model_selection import cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [41]:
# Location of the raw CSV. The default is the original author's local path;
# set the BREAST_CANCER_DATA_PATH environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    'BREAST_CANCER_DATA_PATH',
    'C:/Users/lismo/OneDrive/MLprojects/Breast_Cancer_Detection/data/data_raw.csv',
)
df = pd.read_csv(DATA_PATH)

# Columns removed to build the reduced feature set X2 (presumably the highly
# correlated ones, going by the name -- confirm against the EDA notebook).
# NOTE(review): the original list repeated 'texture_mean', 'area_se' and
# 'radius_se'; the repeats are removed here, which leaves X2 unchanged.
# Check whether other columns were intended in their place.
corr_features = [
    'area_mean', 'radius_mean', 'area_worst', 'radius_worst',
    'area_se', 'radius_se', 'texture_mean', 'perimeter_mean',
    'concave points_worst', 'concavity_mean',
]

# X: every column except the target, the id and the 'Unnamed: 32' column.
X = df.drop(columns=['diagnosis', 'id', 'Unnamed: 32'])
# X2: X without the columns listed in corr_features.
X2 = X.drop(columns=corr_features)
# y: the target column.
y = df['diagnosis']

In [45]:
# Preprocessing for the full feature set X: standardise every non-object
# column. The one-hot branch for categorical columns was commented out in the
# original cell and is omitted here. StandardScaler / ColumnTransformer are
# imported in the notebook's top import cell.
num_features = X.select_dtypes(exclude="object").columns

numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("StandardScaler", numeric_transformer, num_features),
    ]
)


In [47]:
# Preprocessing for the reduced feature set X2: standardise its non-object
# columns, reusing the StandardScaler instance defined for X.
num_features_2 = X2.select_dtypes(exclude="object").columns

scale_numeric_2 = ("StandardScaler", numeric_transformer, num_features_2)
preprocessor_2 = ColumnTransformer([scale_numeric_2])

In [48]:
# Fit the scalers on the training rows only, so that no statistics from the
# held-out test rows leak into preprocessing. The row selection below uses the
# same test_size / random_state as the train_test_split call on X and y later
# in the notebook, which (for an unstratified, shuffled split over the same
# number of rows) selects the same rows. Keep the two calls in sync.
train_rows, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)
preprocessor.fit(X.iloc[train_rows])
preprocessor_2.fit(X2.iloc[train_rows])

# Transform *all* rows with the train-fitted scalers; the later split then
# separates them into train and test.
# NOTE: X and X2 are rebound from DataFrames to the transformers' output, so
# this cell cannot be re-run without first re-running the data-loading cell
# (`.iloc` above requires a DataFrame).
X = preprocessor.transform(X)
X2 = preprocessor_2.transform(X2)

In [222]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible. train_test_split is imported in the notebook's top
# import cell.
# NOTE(review): the split is not stratified on y -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Last expression: display the shapes of the two feature matrices.
X_train.shape, X_test.shape

((455, 30), (114, 30))

In [246]:
# Hyper-parameter grid for the decision tree:
#   max_depth          1..49 plus None (unlimited)  -> 50 values
#   criterion          gini / entropy               ->  2 values
#   min_samples_leaf   2..14                        -> 13 values
#   min_samples_split  2..14                        -> 13 values
# That is 50 * 2 * 13 * 13 = 16,900 combinations, each evaluated with 10-fold
# CV, so this cell is expensive to run.
rs_space = {
    'max_depth': list(np.arange(1, 50, step=1)) + [None],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': np.arange(2, 15, step=1),
    'min_samples_split': np.arange(2, 15, step=1),
}

# A fixed random_state makes the fitted trees -- and therefore the CV scores
# and the selected parameters -- reproducible across runs.
# Naming note: `rf` is a single DecisionTreeClassifier (not a random forest)
# and `rf_random` / `model_random` is an exhaustive GridSearchCV (not a
# randomized search). The names are kept because later cells refer to them.
rf = DecisionTreeClassifier(random_state=42)
rf_random = GridSearchCV(
    rf,
    rs_space,
    scoring='accuracy',
    return_train_score=True,  # needed for the train-accuracy column reported later
    n_jobs=-1,                # use all available cores
    cv=10,
)
# fit() returns the fitted search object itself, so model_random is rf_random.
model_random = rf_random.fit(X_train, y_train)

In [247]:
# Tabulate the grid-search results: one row per parameter combination, with
# its mean training accuracy and mean validation (CV test-fold) accuracy.
cv_results = model_random.cv_results_
model_df = pd.DataFrame(cv_results["params"])
model_df["Train Accuracy"] = cv_results["mean_train_score"]
model_df["Val Accuracy"] = cv_results["mean_test_score"]

In [248]:
# Display the grid-search results table (one row per parameter combination).
model_df

Unnamed: 0,criterion,max_depth,min_samples_leaf,min_samples_split,Train Accuracy,Val Accuracy
0,gini,1.0,2,2,0.922590,0.883575
1,gini,1.0,2,3,0.922590,0.883575
2,gini,1.0,2,4,0.922590,0.883575
3,gini,1.0,2,5,0.922590,0.883575
4,gini,1.0,2,6,0.922590,0.883575
...,...,...,...,...,...,...
16895,entropy,,14,10,0.959221,0.934203
16896,entropy,,14,11,0.959221,0.936425
16897,entropy,,14,12,0.959221,0.931981
16898,entropy,,14,13,0.959221,0.934203


In [249]:
# Show the row of the best-scoring parameter combination. The list inside
# .loc keeps the result a one-row DataFrame rather than a Series.
model_df.loc[[model_random.best_index_]]

Unnamed: 0,criterion,max_depth,min_samples_leaf,min_samples_split,Train Accuracy,Val Accuracy
3888,gini,24.0,2,3,0.989988,0.951787


In [266]:
# Summarise the winning grid-search candidate: its parameters, its mean
# train / validation accuracy across the CV folds, and the accuracy of the
# search object on the held-out test set.
best_idx = model_random.best_index_
test_accuracy = model_random.score(X_test, y_test)

report_lines = [
    ("Best Model Parameters:", model_random.best_params_),
    ("Train Mean Accuracy:", model_random.cv_results_['mean_train_score'][best_idx]),
    ("Validation Mean Accuracy:", model_random.best_score_),
    ("Test Accuracy:", test_accuracy),
]
for label, value in report_lines:
    print(label, value)

Best Model Parameters: {'criterion': 'gini', 'max_depth': 24, 'min_samples_leaf': 2, 'min_samples_split': 3}
Train Mean Accuracy: 0.9899880732303655
Validation Mean Accuracy: 0.9517874396135266
Test Accuracy: 0.956140350877193
