Load the data and import the required libraries

In [11]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import matplotlib.pylab as plt
from dmba import plotDecisionTree, classificationSummary, regressionSummary
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split


from ucimlrepo import fetch_ucirepo 
  
# Download UCI repository dataset id 15 (Breast Cancer Wisconsin, Original).
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)

# Feature table and target table (pandas DataFrames).
uci_data = breast_cancer_wisconsin_original.data
x, y = uci_data.features, uci_data.targets

# Dataset-level metadata.
print(breast_cancer_wisconsin_original.metadata)

# Per-variable description.
print(breast_cancer_wisconsin_original.variables)


{'uci_id': 15, 'name': 'Breast Cancer Wisconsin (Original)', 'repository_url': 'https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original', 'data_url': 'https://archive.ics.uci.edu/static/public/15/data.csv', 'abstract': 'Original Wisconsin Breast Cancer Database', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 699, 'num_features': 9, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['Class'], 'index_col': ['Sample_code_number'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1990, 'last_updated': 'Sun Mar 10 2024', 'dataset_doi': '10.24432/C5HP4Z', 'creators': ['WIlliam Wolberg'], 'intro_paper': None, 'additional_info': {'summary': "Samples arrive periodically as Dr. Wolberg reports his clinical cases. The database therefore reflects this chronological grouping of the data. This grouping information appears immediately below, having been removed fro

In [3]:
from sklearn.impute import KNNImputer

# Impute missing feature values with a k-nearest-neighbours imputer (k = 25).
# NOTE(review): the imputer is fitted on the full feature table, i.e. before
# the train/test split, so held-out rows influence the imputed values.
# Consider fitting it on the training rows only.
data_c = x.copy()
imputer = KNNImputer(n_neighbors=25)

# fit_transform returns a bare ndarray, so row and column labels are lost here.
imputed_values = imputer.fit_transform(data_c)

# Rebuild the frame with the original columns AND the original index so the
# rows stay label-aligned with `y` (the index was previously reset silently).
data_imputed = pd.DataFrame(
    imputed_values,
    columns=data_c.columns,
    index=data_c.index,
)

# Sanity check: every column should report zero remaining NaNs.
data_imputed.isna().sum()

Clump_thickness                0
Uniformity_of_cell_size        0
Uniformity_of_cell_shape       0
Marginal_adhesion              0
Single_epithelial_cell_size    0
Bare_nuclei                    0
Bland_chromatin                0
Normal_nucleoli                0
Mitoses                        0
dtype: int64

In [4]:
# From this point on, `x` refers to the imputed feature table.
x = data_imputed

Split data

In [5]:
# Hold out 25% of the rows as the test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

Build and train the decision tree model

In [7]:
# Decision tree with library-default hyperparameters; only the seed is pinned
# so that repeated runs build the same tree.
treeClassifier = DecisionTreeClassifier(random_state=42)

# Fit on the training split (last expression, so the cell displays the estimator).
treeClassifier.fit(X=x_train, y=y_train)

Evaluate the model

In [14]:
# Predictions on the rows the tree was fitted on and on the held-out rows.
train_pred = treeClassifier.predict(x_train)
valid_pred = treeClassifier.predict(x_test)

# One row per printed line: (heading, metric function, true labels, predictions).
# Rows are listed in the order they are printed: accuracies first, then the
# classification reports, then the confusion matrices.
_evaluation_rows = (
    ("Training Accuracy:", accuracy_score, y_train, train_pred),
    ("Validation Accuracy:", accuracy_score, y_test, valid_pred),
    ("Training Classification Report:\n", classification_report, y_train, train_pred),
    ("Validation Classification Report:\n", classification_report, y_test, valid_pred),
    ("Training Confusion Matrix:\n", confusion_matrix, y_train, train_pred),
    ("Validation Confusion Matrix:\n", confusion_matrix, y_test, valid_pred),
)

for _heading, _metric, _actual, _predicted in _evaluation_rows:
    print(_heading, _metric(_actual, _predicted))

Training Accuracy: 1.0
Validation Accuracy: 0.9542857142857143
Training Classification Report:
               precision    recall  f1-score   support

           2       1.00      1.00      1.00       340
           4       1.00      1.00      1.00       184

    accuracy                           1.00       524
   macro avg       1.00      1.00      1.00       524
weighted avg       1.00      1.00      1.00       524

Validation Classification Report:
               precision    recall  f1-score   support

           2       0.97      0.97      0.97       118
           4       0.93      0.93      0.93        57

    accuracy                           0.95       175
   macro avg       0.95      0.95      0.95       175
weighted avg       0.95      0.95      0.95       175

Training Confusion Matrix:
 [[340   0]
 [  0 184]]
Validation Confusion Matrix:
 [[114   4]
 [  4  53]]
