## Support Vector Machines
### Simple Classification on the Iris Dataset

In [39]:
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [40]:
iris = datasets.load_iris()
X = iris["data"][:, (2, 3)] # petal length, petal width
y = (iris["target"] == 2).astype(np.float64) # Iris-Virginica

In [41]:
std_scaler = StandardScaler()
X_scaled = std_scaler.fit_transform(X)

In [42]:
svm_clf = LinearSVC(C=1, loss="hinge", random_state=42)
svm_clf.fit(X_scaled, y)

LinearSVC(C=1, loss='hinge', random_state=42)

In [43]:
preds = svm_clf.predict(X_scaled)

In [45]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, preds)

array([[98,  2],
       [ 4, 46]], dtype=int64)

### Looking at the full iris dataset

In [48]:
X = iris["data"]
X_scaled = std_scaler.fit_transform(X)
svm_clf.fit(X_scaled, y)
preds = svm_clf.predict(X_scaled)
confusion_matrix(y, preds)

array([[97,  3],
       [ 1, 49]], dtype=int64)

### Classifying non-linear data using polynomial features

### Lecture slides reference
```
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt

# Generate some data
X, y = make_moons(n_samples=100, noise=0.15, random_state=42)

# Add in a 3rd degree polynomial set of features
poly_features = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly_features.fit_transform(X)

# Scale the data
X_poly_scaled = std_scaler.fit_transform(X_poly)

# Create a svm classifier and fit the data
poly_svm_clf = LinearSVC(C=10, loss="hinge", random_state=42)
poly_svm_clf.fit(X_poly_scaled, y)

```

In [52]:
from sklearn.preprocessing import PolynomialFeatures

poly_features = PolynomialFeatures(degree=4, include_bias=False) # 4th seems to give best results...

X_poly = poly_features.fit_transform(X)
X_poly_scaled = std_scaler.fit_transform(X_poly)

poly_svm_clf = LinearSVC(C=10, loss="hinge", random_state=42) # Can also investigate values of C
poly_svm_clf.fit(X_poly_scaled, y)

polypreds = poly_svm_clf.predict(X_poly_scaled)
confusion_matrix(y, polypreds)



array([[99,  1],
       [ 1, 49]], dtype=int64)

### Using a polynomial kernel

In [53]:
X = iris["data"]
X_scaled = std_scaler.fit_transform(X)
# Create a classifier with a polynomial kernel and fit this to the data
from sklearn.svm import SVC
poly_k_svm_clf = SVC(kernel="poly", degree=7, coef0=1, C=5) # degree 7 gives a perfect classifier!
poly_k_svm_clf.fit(X_scaled, y)
poly_k_preds = poly_k_svm_clf.predict(X_scaled)
confusion_matrix(y, poly_k_preds)

array([[100,   0],
       [  0,  50]], dtype=int64)

### Using a Radial Basis Function (RBF) kernel

In [59]:
rbf_svm_clf = SVC(kernel="rbf", gamma=5, C=1) # Can also explore values of gamma and C - C in particular - ones from lecture work terribly!
rbf_svm_clf.fit(X_scaled, y)
rbf_preds = rbf_svm_clf.predict(X_scaled)
confusion_matrix(y, rbf_preds)

array([[100,   0],
       [  0,  50]], dtype=int64)

### Multi-Class Classification using SVMs

In [64]:
from sklearn.model_selection import train_test_split

wX, wy = datasets.load_wine(return_X_y=True)
wX_train_set, wX_test_set, wy_train_set, wy_test_set = train_test_split(wX, wy, test_size=0.25, random_state=42)

from sklearn.multiclass import OneVsRestClassifier
# ovr_svc_clf = OneVsRestClassifier(SVC(kernel="rbf", gamma=5, C=1)) # e.g. does badly
ovr_svc_clf = OneVsRestClassifier(SVC(kernel="poly", degree=7, coef0=1, C=5)) # e.g. does better
ovr_svc_clf.fit(wX_train_set, wy_train_set)

wine_preds = ovr_svc_clf.predict(wX_test_set)
confusion_matrix(wy_test_set, wine_preds)

array([[15,  0,  0],
       [ 1, 15,  2],
       [ 0,  4,  8]], dtype=int64)

### SVM Regression

In [65]:
bX, by = datasets.load_boston(return_X_y=True)
bXtrain, bXtest, bytrain, bytest = train_test_split(bX, by, test_size=0.2, random_state=42)

from sklearn.svm import LinearSVR

svm_reg = LinearSVR(epsilon=8, random_state=42) # Can explore impact of epsilon
svm_reg.fit(bXtrain, bytrain)
boston_train_preds = svm_reg.predict(bXtrain) # Check fit

from sklearn.metrics import mean_squared_error

lin_mse = mean_squared_error(bytrain, boston_train_preds)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

9.769916435215908

In [66]:
# Check on test set
boston_test_preds = svm_reg.predict(bXtest)
lin_mse = mean_squared_error(bytest, boston_test_preds)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

10.620721705196468

In [67]:
# Also explore use of kernels...
from sklearn.svm import SVR
svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1, gamma="scale")
svm_poly_reg.fit(bXtrain, bytrain)
boston_poly_train_preds = svm_poly_reg.predict(bXtrain) # Check fit
lin_mse = mean_squared_error(bytrain, boston_poly_train_preds)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

6.8655034790994325

In [68]:
# Check on test set
boston_poly_test_preds = svm_poly_reg.predict(bXtest)
lin_mse = mean_squared_error(bytest, boston_poly_test_preds)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

5.675206928797199

## Decision Trees

### Decision Tree Classification

In [75]:
wX, wy = datasets.load_wine(return_X_y=True)
wX_train_set, wX_test_set, wy_train_set, wy_test_set = train_test_split(wX, wy, test_size=0.25, random_state=42)

from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier(max_depth=7) # can explore depth and other parameters
tree_clf.fit(wX_train_set, wy_train_set)

wine_preds = tree_clf.predict(wX_test_set)
confusion_matrix(wy_test_set, wine_preds)

array([[14,  1,  0],
       [ 0, 18,  0],
       [ 0,  1, 11]], dtype=int64)

### Decision Tree Regression

In [79]:
from sklearn.tree import DecisionTreeRegressor

bX, by = datasets.load_boston(return_X_y=True)
bXtrain, bXtest, bytrain, bytest = train_test_split(bX, by, test_size=0.2, random_state=42)

tree_reg = DecisionTreeRegressor(min_samples_leaf=3) # explore min_samples = e.g. put to 1 to see what happens
tree_reg.fit(bXtrain,bytrain)

boston_tree_train_preds = tree_reg.predict(bXtrain) # Check fit

lin_mse = mean_squared_error(bytrain, boston_tree_train_preds)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

1.9590746788623798

In [80]:
# Check on test set
boston_tree_test_preds = tree_reg.predict(bXtest)
lin_mse = mean_squared_error(bytest, boston_tree_test_preds)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

3.1922849833673492