# Train a LinearSVC on a linearly separable dataset. Then train an SVC and a SGDClassifier on the same dataset. See if you can get them to produce roughly the same model.

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split


iris = datasets.load_iris()
X = iris['data'][:, (2, 3)]  # petal length, petal width
y = (iris['target'] == 2).astype(np.float64)  # iris virginica

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.25,  # 0.25 x 0.8 = 0.2
                                                  random_state=42)

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


linear_svc_pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('predict', LinearSVC())
])

linear_svc_pipeline.fit(X_train, y_train)
y_pred = linear_svc_pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))

1.0


In [16]:
from sklearn.svm import SVC


svc_pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('predict', SVC())
])
param_grid = {
    'predict__C': [1, 2, 3],
    'predict__gamma': [1, 2, 3],
    'predict__kernel': ['linear', 'poly', 'rbf'],
    'predict__degree': [1, 2, 3],
}
svc_search = GridSearchCV(svc_pipeline, param_grid, cv=3, verbose=0,
                                 scoring='accuracy', return_train_score=True)
svc_search.fit(X_train, y_train)
# {'predict__C': 1, 'predict__degree': 1, 'predict__gamma': 1, 'predict__kernel': 'linear'}
print(svc_search.best_params_)

y_pred = svc_search.best_estimator_.predict(X_test)
print(accuracy_score(y_test, y_pred))

{'predict__C': 1, 'predict__degree': 1, 'predict__gamma': 1, 'predict__kernel': 'linear'}
1.0


In [23]:
from sklearn.linear_model import SGDClassifier


sgd_pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('predict', SGDClassifier())
])
param_grid = {
    'predict__loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'predict__penalty': ['l2', 'l1', 'elasticnet'],
    'predict__alpha': [0.001, 0.0001, 0.01]
}
sgd_search = GridSearchCV(sgd_pipeline, param_grid, cv=3, verbose=0,
                          scoring='accuracy', return_train_score=True)
sgd_search.fit(X_train, y_train)
print(sgd_search.best_params_)

y_pred = sgd_search.best_estimator_.predict(X_test)
print(accuracy_score(y_test, y_pred))



{'predict__alpha': 0.001, 'predict__loss': 'squared_hinge', 'predict__penalty': 'elasticnet'}
1.0


In [26]:
print(accuracy_score(y_val, linear_svc_search.best_estimator_.predict(X_val)))
print(accuracy_score(y_val, svc_search.best_estimator_.predict(X_val)))
print(accuracy_score(y_val, sgd_search.best_estimator_.predict(X_val)))

0.9666666666666667
0.9666666666666667
0.9666666666666667


# Train an SVM classifier on the MNIST dataset. Since SVM classifiers are binary classifiers, you will need to use one-versus-the-rest to classify all 10 digits. You may want to tune the hyperparameters using small validation sets to speed up the process. What accuracy can you reach?

In [37]:
import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split


digits = datasets.load_digits()
X = digits['data']
y = digits['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('predict', SVC())
])
param_grid = {
    'predict__C': [1],
    'predict__gamma': [1],
    'predict__kernel': ['linear'],
    'predict__degree': [2],
    'predict__decision_function_shape': ['ovr']
}
grid_search = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=5,
                           verbose=0, return_train_score=True)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)
print(accuracy_score(y_test, y_pred))

{'predict__C': 1, 'predict__decision_function_shape': 'ovr', 'predict__degree': 2, 'predict__gamma': 1, 'predict__kernel': 'linear'}
0.975


# Train an SVM regressor on the California housing dataset.

In [3]:
from google.colab import drive
drive.mount('/content/drive')
dirpath = 'drive/MyDrive/MachineLearning/HandsOnMachineLearning/california-housing/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
housing = pd.read_csv(dirpath + 'housing.csv')

In [27]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split


X = housing.drop(['median_house_value'], axis=1)
y = housing['median_house_value'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    random_state=42, stratify=housing[['ocean_proximity']])

In [28]:
X_train = X_train[:250]
y_train = y_train[:250]

In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVR


num_attributes = list(X.drop(['ocean_proximity'], axis=1))
cat_attributes = ['ocean_proximity']

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])
transformation = ColumnTransformer([
    ('num', num_pipeline, num_attributes),
    ('cat', OneHotEncoder(), cat_attributes)
])

pipeline = Pipeline([
    ('transform', transformation),
    ('predict', SVR())
])

param_grid = {
    'predict__C': [5, 8, 10],
    'predict__gamma': [23, 25, 27, 'scale'],
    'predict__kernel': ['poly'],
    'predict__degree': [2],
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           verbose=1, return_train_score=True)
# {'predict__C': 8, 'predict__degree': 2, 'predict__gamma': 23, 'predict__kernel': 'poly'}
# 78474.94631726065
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'predict__C': 8, 'predict__degree': 2, 'predict__gamma': 23, 'predict__kernel': 'poly'}
78474.94631726065
