In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Decision tree for classification

In [None]:
# Read the dataset CSV into a DataFrame and display summary statistics
# of its numeric columns as the cell output.
csv_path = '../input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(csv_path)
df.describe()

In [None]:
# Feature matrix: every column except the row id, the target label and
# 'Unnamed: 32' (presumably an empty artifact column from the CSV -- verify).
X = df.drop(['id', 'diagnosis', 'Unnamed: 32'], axis='columns')
# Target vector: the diagnosis label of each row.
y = df['diagnosis']

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 

In [None]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    shuffle=True,
    stratify=y,
    random_state=1,
)

In [None]:
# Baseline: a shallow tree of depth 2. Gini impurity (the measure of node
# impurity used to choose splits) is scikit-learn's default criterion.
dt = DecisionTreeClassifier(criterion='gini', max_depth=2, random_state=1).fit(X_train, y_train)

# Score the fitted tree on the held-out rows.
y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"{accuracy} accuracy with max_depth={dt.max_depth} and criterion={dt.criterion}")

In [None]:
# Same setup as the shallow tree, but allow the tree to grow up to depth 10.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    random_state=1,
)
dt = dt.fit(X_train, y_train)  # fit() returns the estimator itself

y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"{accuracy} accuracy with max_depth={dt.max_depth} and criterion={dt.criterion}")

In [None]:
# Keep depth 10 but measure node impurity with entropy instead of Gini.
tree_params = {'max_depth': 10, 'criterion': 'entropy', 'random_state': 1}
dt = DecisionTreeClassifier(**tree_params)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"{accuracy} accuracy with max_depth={dt.max_depth} and criterion={dt.criterion}")

In [None]:
# Decision trees split on per-feature thresholds, so standardising the features
# is not needed: this cell repeats the depth-10 Gini tree on scaled data to show
# that the accuracy stays the same.
from sklearn.preprocessing import StandardScaler

# Split the raw features first so that the scaler never sees the held-out rows.
# Same test_size / random_state / stratify as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1, stratify=y)

# Fit the scaler on the training rows only, then apply that same transform to
# the test rows. Fitting on all of X would leak test-set statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

dt = DecisionTreeClassifier(max_depth=10, criterion='gini', random_state=1)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"{accuracy} accuracy with max_depth={dt.max_depth} and criterion={dt.criterion}")


# Ensemble learning

In [None]:
# Logistic regression and KNN perform better on standardised features;
# decision trees do not need scaling.
from sklearn.preprocessing import StandardScaler

# Note: the scaler is fitted on every row of X, i.e. before any train/test split.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

## Voting classifier
- same training set
- different algorithms

In [None]:
# Import functions to compute accuracy and split data
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Import models, including VotingClassifier meta-model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier

In [None]:
# set seed for reproducibility
SEED = 1

# Split the raw features first, then standardise. The scaler is fitted on the
# training rows only, so statistics of the held-out rows never reach the models
# (logistic regression and KNN are both sensitive to feature scale).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=SEED, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# instantiate the individual classifiers
lr = LogisticRegression(random_state=SEED)
knn = KNN()
dt = DecisionTreeClassifier(random_state=SEED)

# (name, estimator) pairs; the same list is later passed to VotingClassifier
classifiers = [('Logistic Regression', lr),
               ('K Nearest Neighbors', knn),
               ('Decision Tree', dt)]

# fit each classifier on the training set and report its test-set accuracy
for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"{clf_name} : {accuracy_score(y_test, y_pred)}")

In [None]:
# Combine the (name, estimator) pairs from `classifiers` into one voting
# ensemble and fit it on the training set.
vc = VotingClassifier(estimators=classifiers).fit(X_train, y_train)

# Predict the test-set labels with the ensemble and report its accuracy.
y_pred = vc.predict(X_test)
print(f'Voting Classifier: {accuracy_score(y_test, y_pred)}')

## Bagging (bootstrap aggregation)
- one algorithm
- different subsets of the training set
- BaggingClassifier (aggregates predictions by majority voting)
- BaggingRegressor (aggregates predictions through averaging)

In [None]:
from sklearn.ensemble import BaggingClassifier

# Base learner: a shallow classification tree. A float min_samples_leaf is
# interpreted by scikit-learn as a fraction of the training samples.
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=SEED)

# Instantiate a BaggingClassifier 'bc' with 300 trees, trained in parallel.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4);
# the positional form works on both. random_state makes the bootstrap samples
# reproducible.
bc = BaggingClassifier(dt, n_estimators=300, n_jobs=-1, random_state=SEED)

# fit 'bc' to the training set
bc.fit(X_train, y_train)

# predict test set labels
y_pred = bc.predict(X_test)

print(f'Accuracy of Bagging Classifier: {accuracy_score(y_test, y_pred)}')


## Out of Bag evaluation (OOB)
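- On average, each model's bootstrap sample contains about 63% of the distinct training instances.
- The remaining ~37% are that model's out-of-bag (OOB) instances; since the model never saw them during training, they can be used to evaluate it without a separate validation set.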

In [None]:
# Base learner: a shallow classification tree. A float min_samples_leaf is
# interpreted by scikit-learn as a fraction of the training samples.
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=SEED)

# Instantiate a BaggingClassifier 'bc' that also computes the out-of-bag score.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4);
# the positional form works on both. random_state makes the bootstrap samples,
# and therefore the OOB score, reproducible.
bc = BaggingClassifier(dt, n_estimators=300, oob_score=True, n_jobs=-1, random_state=SEED)

# fit 'bc' to the training set
bc.fit(X_train, y_train)

# predict test set labels
y_pred = bc.predict(X_test)

# Evaluate test set accuracy
test_accuracy = accuracy_score(y_test, y_pred)

# OOB accuracy: estimated from the training instances each tree did not sample
oob_accuracy = bc.oob_score_

print(f"Test set accuracy: {test_accuracy}")
print(f"OOB accuracy: {oob_accuracy}")
