# Examples

In [32]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset and pull out the arrays and name lists used below.
data = load_breast_cancer()
features, labels = data["data"], data["target"]
feature_names, label_names = data["feature_names"], data["target_names"]

# Hold out 33% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.33,
    random_state=42,
)

## Chapter 1

### 1.1 Breast Cancer Wisconsin (Diagnostic) data set

In [64]:
# Tabular view of the feature matrix, one named column per feature.
BreastCancerFeatures = pd.DataFrame(features, columns=feature_names)

# Translate the numeric target codes into readable diagnosis names.
# Using the dict's lookup method keeps the KeyError on any unexpected code.
BreastCancerLabels = pd.Series(labels).map({0: "malignant", 1: "benign"}.__getitem__)

In [None]:
# Print a concise summary of the feature table via pandas' DataFrame.info().
BreastCancerFeatures.info()

In [None]:
# Show per-column summary statistics of the features via DataFrame.describe().
BreastCancerFeatures.describe()

In [65]:
# Count how many samples fall under each diagnosis label.
BreastCancerLabels.value_counts()

benign       357
malignant    212
dtype: int64

In [68]:
# Echo the raw object returned by load_breast_cancer() to inspect its keys and arrays.
# NOTE(review): this prints the whole dataset; a narrower view such as data.keys()
# may be easier to read and keeps the saved notebook smaller.
data

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

### 1.2 Logistic Regression

In [26]:
from sklearn.linear_model  import LogisticRegression

# Fit a logistic regression baseline on the training split.
clf = LogisticRegression(random_state=0, max_iter=300, n_jobs=-1)
clf.fit(X_train, y_train)

# Report held-out accuracy as a percentage rounded to two decimals.
print(
    "Accuracy Logistic Regression: {} %".format(
        round(100 * clf.score(X_test, y_test), 2)
    )
)

Accuracy Logistic Regression: 95.74 %


## Chapter 2

### 2.1 Decision Tree Classifier

In [27]:
from sklearn import tree

# Fit a decision tree (no pruning parameter set) on the training split.
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

# Report held-out accuracy as a percentage rounded to two decimals.
print(
    "Accuracy Decision Tree: {} %".format(
        round(100 * clf.score(X_test, y_test), 2)
    )
)

Accuracy Decision Tree: 91.49 %


##### 2.1.1 Visualize Decision Tree

In [None]:
# Draw the tree currently held in `clf` on a large, high-resolution canvas
# so that node text stays legible, then write the figure to disk.
fig = plt.figure(dpi=500, figsize=(16, 10), edgecolor="black")
tree.plot_tree(
    clf,
    max_depth=7,  # only the top levels are drawn
    filled=True,
    fontsize=5,
    feature_names=feature_names,
    class_names=label_names,
)
plt.savefig("Decision Tree - Breast Cancer - Unpruned.png")

### 2.2 Cost Complexity Pruning

In [35]:
# Compute the cost-complexity pruning path of `clf` on the training data and
# keep its effective alphas and the matching impurities under short names.
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

##### 2.2.1 Alpha vs. Impurity

In [None]:
# Plot total leaf impurity against effective alpha along the pruning path.
#
# The arrays are read from `path` rather than from the notebook-level
# `ccp_alphas` / `impurities`: another cell trims `ccp_alphas` in place, so
# re-running this cell afterwards would hand ax.plot x and y arrays of
# different lengths. `path` itself is never modified, which keeps this cell
# repeatable. The final path entry (the alpha that prunes the tree down to
# its root) is left out of the plot.
fig, ax = plt.subplots(dpi = 100)
ax.plot(path.ccp_alphas[:-1], path.impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
plt.savefig("Alpha versus Impurity.png")

##### 2.2.2 Number of Nodes vs. Alpha

In [44]:
# Fit one decision tree per effective alpha on the pruning path.
#
# A comprehension is used on purpose: its loop variable is local, whereas a
# plain `for clf in ...` loop would rebind the notebook-level `clf` to the
# last (most heavily pruned) tree. Cells that plot or analyse `clf` would then
# silently operate on a different model when re-run.
clfs = [
    tree.DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]

In [None]:
# The last tree only contains the root and is therefore trivial, so it is
# excluded together with its alpha.
#
# The trimmed alphas are derived from `path` (which is never modified) and
# `clfs` is cut to the same length, instead of slicing `[:-1]` off the current
# values. That makes this cell idempotent: re-running it no longer drops one
# more tree/alpha each time.
ccp_alphas = path.ccp_alphas[:-1]
clfs = clfs[:len(ccp_alphas)]

# Tree size (total node count) for every remaining pruned tree.
node_counts = [pruned.tree_.node_count for pruned in clfs]
fig, ax = plt.subplots(dpi = 100)
ax.plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
ax.set_xlabel("alpha")
ax.set_ylabel("number of nodes")
ax.set_title("Number of nodes vs alpha")
plt.savefig("Number of Nodes vs Alpha.png")

##### 2.2.3 Accuracy vs alpha for training and testing sets

In [None]:
# Accuracy of every pruned tree on the training and on the held-out split.
train_scores = [model.score(X_train, y_train) for model in clfs]
test_scores = [model.score(X_test, y_test) for model in clfs]

# Overlay both accuracy curves as step plots over alpha and save the figure.
fig, ax = plt.subplots(dpi=100)
ax.plot(ccp_alphas, train_scores, label="train", marker='o', drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, label="test", marker='o', drawstyle="steps-post")
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
ax.legend()
plt.savefig("Accuracy vs Alpha - Training and Testing set.png")

### 2.3 Random Forest