# Fundamentals of Machine Learning (CSCI-UA.473)

## Lab 3: Margin Classifiers and Decision Trees

In [None]:
# Load basic packages
import numpy as np
import pandas as pd
# Install autograd:
#!conda install -c conda-forge autograd

import autograd.numpy as numpy
import autograd.numpy.random as npr

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn import metrics
from sklearn import model_selection
from sklearn import tree

# Import and load dataset for this exercise - pip install palmerpenguins
from palmerpenguins import load_penguins
# load_penguins() returns a pandas dataframe by default (use return_X_y to get it in two numpy arrays)
feature_columns = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
penguins = load_penguins().dropna()  # discard rows that have missing values

# Features are the four numeric measurements; the target is the species label.
X = penguins[feature_columns]
y = penguins['species']
print(X.shape, y.shape)
X.head()



### Split the data into train and test

We'll use an 80/20 split for our training/test sets. We will not touch the test set while fitting the models.

In [None]:
# Split the data.  DO NOT TOUCH THE TEST DATA FROM HERE ON!!
# A fixed random_state makes the split (and every accuracy reported from it) reproducible
# across kernel restarts; 981 matches the seed used by the other splits in this notebook.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size = 0.2, random_state = 981) # 0.2 is 20% test data.


## Part I : Support Vector Machines
We will now play around with the support vector machine. We will first compare them to a standard logistic regression model. Then we will see how they work on datasets which are not linearly separable.

In [None]:
# Start by importing the packages we'll need.
from sklearn.svm import LinearSVC

%matplotlib inline

In [None]:
# Fit a linear SVM on the penguin training split.
# LinearSVC uses the squared-hinge loss function when fitting the model.
svm = LinearSVC(dual = False, C = 1e10).fit(X_train, y_train)
svm

In [None]:
# Now evaluate the fitted SVM on the held-out test points.
y_pred = svm.predict(X_test)

acc = metrics.accuracy_score(y_test, y_pred)
# The score is computed on the test split, so the printed label says "test" rather than "validation".
print('Linear SVM test accuracy = {:0.1f}%'.format(100*acc))

### Case of non-linearly separable dataset

If the data is linearly separable, then a linear SVM should be able to achieve nearly 100% accuracy, as we saw with the penguins dataset. We'll use a synthetic dataset to illustrate when this does not happen and mention some techniques to handle it. This data is drawn from a bi-modal Gaussian mixture model.

In [None]:
"""
Input:
    N : the number of data points

Output:
    X, y : the features and targets of shapes (N,2) and (N, )
"""
def sample_bimodal_data(N, var1=1, var2=1, mu1=(2, -1), mu2=(-2, 1)):
    """Draw a two-class dataset from two isotropic 2-D Gaussians.

    Parameters
    ----------
    N : int
        Total number of points. The first ``N // 2`` points belong to class 0
        and the remaining ``N - N // 2`` points to class 1.
    var1, var2 : float
        Variance of class 0 / class 1; each covariance is ``var * I``.
    mu1, mu2 : sequence of two floats
        Mean of class 0 / class 1. The defaults reproduce the original
        hard-coded centres, so existing calls behave exactly as before.
        Moving the means closer together makes the classes overlap more.

    Returns
    -------
    X : ndarray of shape (N, 2)
        Features, class-0 rows first.
    Y : ndarray of shape (N,)
        Labels (0.0 for the first mode, 1.0 for the second).
    """
    # The two modes and covariances.
    mu1 = np.asarray(mu1, dtype=float)
    mu2 = np.asarray(mu2, dtype=float)

    cov1 = var1 * np.identity(2)
    cov2 = var2 * np.identity(2)

    N1 = N//2   # Number of points in first class.
    N2 = N - N1 # Number of points in second class.

    # Sample the random points (class 0 is drawn first, then class 1).
    X1 = np.random.multivariate_normal(mu1, cov1, N1)
    X2 = np.random.multivariate_normal(mu2, cov2, N2)
    Y1 = np.zeros(N1)
    Y2 = np.ones(N2)

    # Combine the data.
    X = np.vstack((X1, X2))
    Y = np.concatenate((Y1, Y2), axis = None)

    return X,Y

In [None]:
# Draw a sample (class 0 gets the larger variance) and scatter the two classes.
N = 500
X, Y = sample_bimodal_data(N, var1=2, var2=1)

# Rows are ordered by class: the leading rows are class 0, the trailing rows class 1.
half = N // 2
plt.figure(1)
for name, rows in (('Class 0', slice(None, half)), ('Class 1', slice(N - half, None))):
    plt.scatter(X[rows, 0], X[rows, 1], label = name)
plt.legend()
plt.xlabel(r'$x_1$')
plt.ylabel(r'$x_2$')
plt.title('Sample Data');

Increasing the factor in front of the covariances, or shifting the centers of the two distributions closer to each other, will cause the data to overlap more, making it harder to classify. Let's try that!

#### Using a slack variable C

Since the data is not perfectly linearly separable, you'll want a soft-margin SVM: slack variables let individual points violate the margin, and the hyperparameter $C$ controls how heavily those violations are penalized.  Let's train some models with different values of $C$ and compare them using cross-validation.

In [None]:
# First get the data and split it into training and testing.
# Use an 80/20 split (test_size = 0.20). Normally we would want 3 splits (including the validation), but in this example we will use sklearn's cross_val_score function to tune our hyperparameters
# This means that we are doing k-fold cross validation: each fold of the training set is held out once and used as the validation set.
Xs_train, Xs_test, Ys_train, Ys_test = model_selection.train_test_split(X, Y, test_size = 0.20, random_state = 981)

In [None]:
# Soft-margin linear SVM. C controls the "inverse" regularization strength: by decreasing
# C we allow more points to lie beyond the correct margin.
svm = LinearSVC(C = 1e10, dual = False)
svm.fit(Xs_train, Ys_train)
svmpred = svm.predict(Xs_test)
acc = metrics.accuracy_score(Ys_test, svmpred)
print('SVM accuracy = {:0.1f}%'.format(100*acc))
fig, axs = plt.subplots(2, figsize=(10,10))

# Top panel: test points coloured by true label; bottom panel: coloured by SVM prediction.
# Boolean masks pick out the rows of each class.
for ax, labels, prefix in ((axs[0], Ys_test, 'Actual'), (axs[1], svmpred, 'Predicted')):
    for cls in (0, 1):
        I = labels == cls
        ax.scatter(Xs_test[I, 0], Xs_test[I, 1], label = '{} class {}'.format(prefix, cls))
    ax.legend()

plt.show()

**Experiment with various different mu1 values and demonstrate SVM accuracy gets worse as mu1 and mu2 get closer**


Let's train some models with different $C$ and compare them using cross-validation.

In [None]:
# One LinearSVC per candidate value of C.
svm_1, svm_2, svm_3, svm_4 = (LinearSVC(C = c, dual = False) for c in (10, 1, 1e-3, 1e-7))

# Score every model with the same 5-fold splitter.
split = model_selection.KFold(5)
cv_1, cv_2, cv_3, cv_4 = (
    model_selection.cross_val_score(model, Xs_train, Ys_train, cv = split)
    for model in (svm_1, svm_2, svm_3, svm_4)
)

# Print the average scores (the tags are padded so the columns line up).
for tag, scores in (('C = 10   ', cv_1), ('C = 1    ', cv_2), ('C = 1e-3 ', cv_3), ('C = 1e-7 ', cv_4)):
    print('{} CV average score = {:0.1f}%'.format(tag, np.mean(scores) * 100))

We see that the model performs slightly differently for different values of the penalty parameter $C$, which weights the slack variables $\zeta_i$ in the soft-margin objective:

$$
\min_{w,b,\zeta} \frac{1}{2}w^Tw + C\sum_{i=1}^n \zeta_i,\quad \text{ such that }\quad y_i(w^Tx_i + b) \ge 1 - \zeta_i,\quad \zeta_i \ge 0
$$

See the scikit-learn [documentation](https://scikit-learn.org/stable/modules/svm.html) for more details.  We can also plot a curve of the validation score for many different $C$ values, which can be helpful for determining the optimal hyperparameter.

In [None]:
# Candidate C values: successive halvings 1, 1/2, ..., 2**-19 (roughly 2e-6).
C = 1/(2**np.arange(0, 20))

k = 10 # Number of folds for the K-fold CV.
split = model_selection.KFold(k)
cv_scores = np.zeros(len(C))
for i, c_value in enumerate(C):
    svm = LinearSVC(C = c_value, dual = False)
    fold_scores = model_selection.cross_val_score(svm, Xs_train, Ys_train, cv = split)
    cv_scores[i] = fold_scores.mean()  # average over the k folds

# Mean CV score against C on a logarithmic x axis.
plt.figure(2)
plt.semilogx(C, cv_scores, 'b-x')
plt.xlabel(r'$C$')
plt.ylabel(r'Score')
plt.title(r'{:d}-Fold CV Score for Linear SVM'.format(k))
plt.grid();

We can use this plot to find the optimal value of $C$ based on the cross-validation score. Now let's see how our 4 models from earlier actually do on the held-out test set.

In [None]:
# Rebuild the four candidate models.
svm_1, svm_2, svm_3, svm_4 = (LinearSVC(C = c, dual = False) for c in (10, 1, 1e-3, 1e-7))
candidates = (svm_1, svm_2, svm_3, svm_4)

# Fit every model on the full training split.
for model in candidates:
    model.fit(Xs_train, Ys_train)

# Predict on the test split and score each prediction.
pred1, pred2, pred3, pred4 = (model.predict(Xs_test) for model in candidates)
acc1, acc2, acc3, acc4 = (
    metrics.accuracy_score(Ys_test, pred) for pred in (pred1, pred2, pred3, pred4)
)

# Report the accuracies (the tags are padded so the columns line up).
for tag, score in (('(C = 10)  ', acc1), ('(C = 1)   ', acc2), ('(C = 1e-3)', acc3), ('(C = 1e-7)', acc4)):
    print('Linear SVM {} accuracy = {:0.1f}%'.format(tag, 100*score))

### Another non-linearly separable dataset

In [None]:
"""
Input:
    N : the number of data points

Output:
    X, y : the features and targets of shapes (N,2) and (N, )
"""
def gen_data1(N):
    """Generate a noisy ring (class 0) surrounding a Gaussian blob (class 1).

    Input:
        N : the number of data points

    Output:
        X, y : the features and targets of shapes (N,2) and (N, ).
               The first N//2 rows are the ring, the remaining rows the blob.
    """
    n_ring = N // 2
    n_blob = N - n_ring

    # Ring of radius 4 with small Gaussian noise; the noise for the first
    # coordinate is drawn before the noise for the second coordinate.
    angles = np.linspace(0, 2*np.pi, n_ring)
    ring_x = 4*np.cos(angles) + 0.1*np.random.randn(n_ring)
    ring_y = 4*np.sin(angles) + 0.1*np.random.randn(n_ring)
    ring = np.column_stack((ring_x, ring_y))

    # Standard-normal blob centred at the origin.
    blob = np.random.randn(2*n_blob).reshape((n_blob, 2))

    # Combine the data.
    X = np.vstack((ring, blob))
    # axis = None means that the arrays are flattened before use
    y = np.concatenate((np.zeros(n_ring), np.ones(n_blob)), axis = None)

    return X, y

In [None]:
# Generate the ring/blob data and scatter the two classes.
N = 1000
Xs, Ys = gen_data1(N)

# Rows are ordered by class: the leading rows are class 0, the trailing rows class 1.
half = N // 2
plt.figure(3)
for name, rows in (('Class 0', slice(None, half)), ('Class 1', slice(N - half, None))):
    plt.scatter(Xs[rows, 0], Xs[rows, 1], label = name)
plt.legend()
plt.xlabel(r'$x_1$')
plt.ylabel(r'$x_2$')
plt.title('Sample Data');

In [None]:
# Fit a single linear SVM on the ring/blob data in its original (Cartesian) coordinates.
# Use a 70/30 split, we are not performing any validation steps here
Xs_train, Xs_test, Ys_train, Ys_test = model_selection.train_test_split(Xs, Ys, test_size = 0.3, random_state = 981)
svm = LinearSVC(C = 1e10, dual = False)
svm.fit(Xs_train, Ys_train)
svmpred = svm.predict(Xs_test)
acc = metrics.accuracy_score(Ys_test, svmpred)
print('SVM accuracy = {:0.1f}%'.format(100*acc))

fig, axs = plt.subplots(2, figsize=(10,10))

# Top panel: test points coloured by true label; bottom panel: coloured by SVM prediction.
# Boolean masks pick out the rows of each class. Legend labels are generated from one
# template so the two panels stay consistently spelled and capitalised.
for ax, labels, prefix in ((axs[0], Ys_test, 'Actual'), (axs[1], svmpred, 'Predicted')):
    for cls in (0, 1):
        I = labels == cls
        ax.scatter(Xs_test[I, 0], Xs_test[I, 1], label = '{} class {}'.format(prefix, cls))
    ax.legend()
plt.show()

In [None]:
def cart2pol(x, y):
    """Convert Cartesian coordinates to polar coordinates.

    Input:
        x, y : scalars or numpy arrays holding the Cartesian coordinates

    Output:
        (rho, phi) : tuple of the radius sqrt(x^2 + y^2) and the angle arctan2(y, x)
    """
    radius = np.sqrt(np.square(x) + np.square(y))
    angle = np.arctan2(y, x)
    return radius, angle

# Re-express every point as (radius, angle); column 0 is the radius, column 1 the angle.
pX = np.column_stack(cart2pol(Xs[:, 0], Xs[:, 1]))
print(pX.shape)

# Scatter the two classes in polar coordinates (rows are still ordered by class).
half = N // 2
plt.figure(4)
for name, rows in (('Class 0', slice(None, half)), ('Class 1', slice(N - half, None))):
    plt.scatter(pX[rows, 0], pX[rows, 1], label = name)
plt.legend()
plt.xlabel(r'$p_1$ (radius)')
plt.ylabel(r'$p_2$ (angle)')
plt.title('Sample Data')
plt.show()

In [None]:
# Train the same linear SVM on the polar features.
# Use a 70/30 split
Xs_train, Xs_val, Ys_train, Ys_val = model_selection.train_test_split(
    pX, Ys, test_size = 0.3, random_state = 981)
svm = LinearSVC(C = 1e+10, dual = False).fit(Xs_train, Ys_train)

# Score the predictions on the held-out 30%.
svmpred = svm.predict(Xs_val)
acc = metrics.accuracy_score(Ys_val, svmpred)
print('SVM accuracy = {:0.1f}%'.format(100*acc))

## Part II : Trees - Decision Trees, Random Forests and AdaBoost

In [None]:
from sklearn import tree

# Import and load dataset for this exercise - pip install palmerpenguins
from palmerpenguins import load_penguins
# load_penguins() returns a pandas dataframe by default (use return_X_y to get it in two numpy arrays)
# X and y were overwritten by the synthetic datasets above, so rebuild them from the penguins data.
feature_columns = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
penguins = load_penguins().dropna()  # discard rows that have missing values
X = penguins[feature_columns]
y = penguins['species']
print(X.shape, y.shape)
X.head()

In [None]:
# Build a simple decision tree classifier that splits on the gini criterion and fit it
# on the penguins training data. Remember, we already split this dataset at the beginning
# and should use the same splits across methods if we want to compare the performance.
clf = tree.DecisionTreeClassifier(criterion='gini').fit(X_train, y_train)


In [None]:
# Predict with the fitted decision tree and print the fraction of correct test predictions.
preds = clf.predict(X_test)
print(metrics.accuracy_score(y_test, preds))

In [None]:
# Build a simple decision tree classifier that splits on the entropy criterion and fit it
# on the penguins training data. Remember, we already split this dataset at the beginning
# and should use the same splits across methods if we want to compare the performance.
clf = tree.DecisionTreeClassifier(criterion='entropy').fit(X_train, y_train)


In [None]:
# Predict with the fitted decision tree and print the fraction of correct test predictions.
preds = clf.predict(X_test)
print(metrics.accuracy_score(y_test, preds))

In [None]:
# This was your vanilla Decision trees, now lets look at Bagging.
from sklearn.ensemble import BaggingClassifier

# Remember, Bagging trains an ensemble of decision trees on bootstrap resamples and aggregates
# their votes, which reduces the variance of the single-tree model.
# So we simply wrap our original DecisionTreeClassifier with a BaggingClassifier module.
base_tree = tree.DecisionTreeClassifier(criterion='gini')
clf = BaggingClassifier(
    estimator=base_tree, n_estimators=100, max_samples=1.0, max_features=0.5, bootstrap=True
).fit(X_train, y_train)
# What would setting n_estimators as 1 mean?
# Changing the value for n_estimators changes our accuracy


In [None]:
# Fraction of penguin test points the bagging ensemble classifies correctly.
preds = clf.predict(X_test)
print(metrics.accuracy_score(y_test, preds))

In [None]:
# Random forests
from sklearn.ensemble import RandomForestClassifier

# Now we try RandomForests on the same data. Again, RandomForests is just a method to ensemble
# your base models and is used in the same way bagging was. The difference is in the number of
# features being selected to make a node split.
rf_params = dict(n_estimators=100, criterion='gini', bootstrap=True, max_samples=0.1, max_features=0.5)
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)

In [None]:
# Fraction of penguin test points the random forest classifies correctly.
preds = clf.predict(X_test)
print(metrics.accuracy_score(y_test, preds))

In [None]:
# Next we use boosting, in particular the AdaBoost algorithm.
# Remember, boosting algorithms fit weak learners sequentially, each one focusing on the
# examples the previous ones got wrong, and the learning rate scales each learner's contribution.
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Create and fit an AdaBoosted decision tree: the weak learner is a depth-1 tree.
stump = tree.DecisionTreeClassifier(max_depth=1)
bdt = AdaBoostClassifier(stump, algorithm="SAMME", n_estimators=100, learning_rate=1)
bdt.fit(X_train, y_train)

In [None]:
# Fraction of penguin test points the AdaBoost ensemble classifies correctly.
preds = bdt.predict(X_test)
print(metrics.accuracy_score(y_test, preds))

### Palmer penguins is a rather simple toy dataset, let's try the decision tree on a more meaningful one - images of handwritten digits

In [None]:
from sklearn.datasets import load_digits
import graphviz # Use pip install graphviz

# We use the digits dataset provided by sklearn, this is similar to MNIST but much coarser (8x8) and
# thus a lot lighter than MNIST (28x28)
dataset = load_digits()
X, Y = dataset.data, dataset.target

# Shuffle the sample indices, then keep the first 1500 for training and the rest for testing.
idxs = np.random.permutation(len(X))
train_idxs, test_idxs = idxs[:1500], idxs[1500:]
train_X, test_X = X[train_idxs], X[test_idxs]
train_Y, test_Y = Y[train_idxs], Y[test_idxs]

# Show one example image in grayscale together with its label.
plt.gray()
plt.matshow(dataset.images[101])
print(dataset.target[101])

In [None]:
clf = tree.DecisionTreeClassifier(criterion='gini')
clf = clf.fit(train_X, train_Y)
labels_str = [str(item) for item in dataset.target_names.tolist()]
dot_data = tree.export_graphviz(clf, out_file='graph.dot', 
                      feature_names=dataset.feature_names,  
                      class_names=labels_str,  
                      filled=True, rounded=True,  
                      special_characters=True) 
graph = graphviz.Source(dot_data)
!dot -Tpng graph.dot -o graph.png

In [None]:
# Predict the digit labels of the held-out test images using the fitted decision tree
preds = clf.predict(test_X)

In [None]:
# Fraction of digit test images classified correctly.
print(metrics.accuracy_score(test_Y, preds))

### Bagging Classifiers
The BaggingClassifier class in sklearn takes as arguments a base estimator (we use DecisionTrees) and 'n_estimators' - the number of such base estimators. This is a crucial hyperparameter that controls the strength of the ensembling effect. Additionally, it takes in 'max_samples', which denotes the number of samples subsampled with replacement (if bootstrap is True) for each of the base estimators.

In [None]:
# Now using Bagging.
from sklearn.ensemble import BaggingClassifier

# Remember, Bagging trains an ensemble of decision trees on bootstrap resamples and aggregates
# their votes, which reduces the variance of the single-tree model.
# So we simply wrap our original DecisionTreeClassifier with a BaggingClassifier module.
base_tree = tree.DecisionTreeClassifier(criterion='gini')
clf = BaggingClassifier(estimator=base_tree, n_estimators=100, max_samples=1.0, bootstrap=True)
clf = clf.fit(train_X, train_Y)
# What would setting n_estimators as 1 mean?
# Changing the value for n_estimators changes our accuracy


In [None]:
# Predict the digit labels of the held-out test images with the fitted bagging ensemble
preds = clf.predict(test_X)

In [None]:
# Fraction of digit test images classified correctly.
print(metrics.accuracy_score(test_Y, preds))

### Random Forests
Similar to the BaggingClassifier, the RandomForestClassifier uses the hyperparameter 'n_estimators' for the number of base estimators, however for Random Forests the base estimator is constrained to be a decision tree. It uses two more crucial hyperparameters, 'max_samples' denoting the number of samples randomly drawn with replacement (again if bootstrap is True) for each tree and 'max_features' which denotes the number of features to be drawn randomly when performing each node split.

In [None]:
# Random forests
from sklearn.ensemble import RandomForestClassifier

# Now we try RandomForests on the same data. Again, RandomForests is just a method to ensemble
# your base models and is used in the same way bagging was.
rf_params = dict(n_estimators=100, criterion='gini', bootstrap=True, max_samples=0.5, max_features=0.5)
clf = RandomForestClassifier(**rf_params)
clf.fit(train_X, train_Y)


In [None]:
# Predict the digit labels of the held-out test images with the fitted random forest
preds = clf.predict(test_X)

In [None]:
# Fraction of digit test images classified correctly.
print(metrics.accuracy_score(test_Y, preds))

### Boosting - Adaboost
An AdaBoost classifier is an estimator that begins by fitting a classifier on the original dataset and then fits additional copies of the classifier on the same dataset, but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases. There are three hyperparameters to note: the base estimator used (a DecisionTree below), the number of estimators (n_estimators), and 'learning_rate', which controls the weight applied to each boosting iteration.

In [None]:
# Create and fit an AdaBoosted decision tree
bdt = AdaBoostClassifier(
    tree.DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=2000, learning_rate=1
)
bdt.fit(X_train, y_train)

In [None]:
preds = bdt.predict(X_test)
print(np.sum(preds == y_test)/len(preds))

### Now we take a closer look at how the hyperparameters of each tree algorithm affect its decision boundaries

In [None]:
# Remember, boosting algorithms iteratively improve an ensemble of decision trees
# and hence involve a learning rate that scales the contribution of each new tree.
import matplotlib.pyplot as plt

from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_gaussian_quantiles

# Construct dataset: two Gaussian-quantile samples, the second one centred at (3, 3).
X1, y1 = make_gaussian_quantiles(
    n_samples=200, n_features=2, n_classes=2, cov=2.0, random_state=1
)
X2, y2 = make_gaussian_quantiles(
    n_samples=300, n_features=2, n_classes=2, mean=(3, 3), cov=1.5, random_state=1
)

# Stack the two samples; the labels of the second sample are flipped (0 <-> 1).
X = np.vstack((X1, X2))
y = np.hstack((y1, 1 - y2))


In [None]:
# Plot the training points, one scatter call per class.
plot_colors = "br"   # one colour character per class
plot_step = 0.02     # grid resolution used by the decision-boundary plot
class_names = "AB"   # one display name per class
for i in range(2):
    n, c = class_names[i], plot_colors[i]
    idx = np.where(y == i)  # row indices of the points in class i
    plt.scatter(
        X[idx, 0],
        X[idx, 1],
        c=c,
        cmap=plt.cm.Paired,
        s=20,
        edgecolor="k",
        label="Class %s" % n,
    )
# The axis labels do not depend on the class, so they are set once after the loop.
plt.xlabel(r'$X_0$')
plt.ylabel(r'$X_1$');

In [None]:
# Create and fit an AdaBoosted decision tree (the weak learner is a depth-1 tree)
bdt = AdaBoostClassifier(
    tree.DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=100, learning_rate=1
).fit(X, y)

# Create and fit a Random forest
clf_rf = RandomForestClassifier(
    n_estimators=100, criterion='gini', bootstrap=True, max_samples=0.5, max_features=0.5
).fit(X, y)

# Create and fit a Bagging ensemble of gini decision trees
clf_bg = BaggingClassifier(
    estimator=tree.DecisionTreeClassifier(criterion='gini'),
    n_estimators=100, max_samples=1.0, bootstrap=True
).fit(X, y)

# Create and fit a single (vanilla) decision tree
clf = tree.DecisionTreeClassifier(criterion='gini').fit(X, y)

In [None]:
# Initialize the plots
plt.figure(figsize=(15, 10))
classifiers = [clf, clf_bg, clf_rf, bdt]
names = ["Vanilla Decision Tree", "Bagging", "Random Forest", "Adaboost"]

# The evaluation grid depends only on the data, so build it once for all four panels.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)
)
grid_points = np.c_[xx.ravel(), yy.ravel()]

# The loop variable is deliberately NOT called `clf`: rebinding the global `clf` would leave it
# pointing at the AdaBoost model, so re-running this cell would draw AdaBoost in the
# "Vanilla Decision Tree" panel.
for i, model in enumerate(classifiers):
    plt.subplot(2,2,i+1)
    plt.title(names[i])

    # Make predictions on every grid point using the fitted model
    Z = model.predict(grid_points)
    Z = Z.reshape(xx.shape)

    # Plot the decision boundary
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.axis("tight")

    # Plot the training points (a fixed colour per class, so no colormap is needed)
    for j, n, c in zip(range(2), class_names, plot_colors):
        idx = np.where(y == j)
        plt.scatter(
            X[idx, 0],
            X[idx, 1],
            c=c,
            s=20,
            edgecolor="k",
            label="Class %s" % n,
        )
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.legend(loc="upper right")
    plt.xlabel("x")
    plt.ylabel("y")
    
