In [2]:
# Testing Iris Dataset

In [3]:
# Import statements
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd


In [4]:
# Load two of scikit-learn's bundled datasets for the experiments below.
digits = datasets.load_digits()
iris = datasets.load_iris()

In [5]:
# Check the type of each loaded dataset.
# Only the last expression of a cell is displayed, so the output shown is type(iris).
type(digits)
type(iris)

# Bunch objects are special containers whose fields are read as attributes (e.g. digits.data)

sklearn.utils.Bunch

In [6]:
# Example: print the digits feature array (NumPy abbreviates large arrays with "...")
print(digits.data)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]


In [7]:
# Target: the array of labels that the classifier is later trained to predict
digits.target

array([0, 1, 2, ..., 8, 9, 8])

In [8]:
# Inspect the first image: it is displayed as an 8x8 array
# (use digits.images[0].shape to see only the size)
digits.images[0]

array([[ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.],
       [ 0.,  0., 13., 15., 10., 15.,  5.,  0.],
       [ 0.,  3., 15.,  2.,  0., 11.,  8.,  0.],
       [ 0.,  4., 12.,  0.,  0.,  8.,  8.,  0.],
       [ 0.,  5.,  8.,  0.,  0.,  9.,  8.,  0.],
       [ 0.,  4., 11.,  0.,  1., 12.,  7.,  0.],
       [ 0.,  2., 14.,  5., 10., 12.,  0.,  0.],
       [ 0.,  0.,  6., 13., 10.,  0.,  0.,  0.]])

In [9]:
# Build a Support Vector Classifier (SVC); SVMs are used here for classification.
from sklearn import svm

clf = svm.SVC(C=100.0, gamma=0.001)

In [10]:
# Before it can predict anything, the estimator must be fitted: the model learns from the data
clf.fit(digits.data[:-1], digits.target[:-1])

# [:-1] selects every sample except the last one; that slice is the training set.
# The same slice is applied to both the features (data) and the labels (target),
# which leaves the last sample out of training.

# What the output means:
# SVC - the classifier object
# C = 100: regularization parameter; it controls the trade-off between achieving a low
# training error and a low testing error
# (the testing error is the generalization error)
# A larger C is not automatically better: it has to be tuned (see the trade-off below)

# gamma = 0.001
# how far the influence of a single training example reaches
# low gamma means "far"
# high gamma means "close"

# In plain English:
# High C - the model tries hard to classify every training example correctly
# A high C may even lead to overfitting
# The model will have low bias but high variance - that is the trade-off

# Low C - the model tolerates some misclassified training examples in exchange for a simpler,
# more general decision boundary
# Helps prevent overfitting

# Gamma
# high gamma - each training example has a very narrow area of influence, resulting in highly
# complex decision boundaries and more overfitting
# low gamma - each training example influences a wide area, giving simpler decision boundaries

SVC(C=100.0, gamma=0.001)

In [11]:
# Feature vector of the second sample (index 1); the output lists its 64 values
digits.data[1]

array([ 0.,  0.,  0., 12., 13.,  5.,  0.,  0.,  0.,  0.,  0., 11., 16.,
        9.,  0.,  0.,  0.,  0.,  3., 15., 16.,  6.,  0.,  0.,  0.,  7.,
       15., 16., 16.,  2.,  0.,  0.,  0.,  0.,  1., 16., 16.,  3.,  0.,
        0.,  0.,  0.,  1., 16., 16.,  6.,  0.,  0.,  0.,  0.,  1., 16.,
       16.,  6.,  0.,  0.,  0.,  0.,  0., 11., 16., 10.,  0.,  0.])

In [12]:
# Predict the label of the last sample, which the [:-1] slices excluded from training.
# The [-1:] slice (rather than [-1]) keeps the input two-dimensional: one row of features.
clf.predict(digits.data[-1:])

array([8])

In [13]:
# Type casting:
# where possible, float32 input keeps its data type;
# otherwise the input is cast to float64

# Using float32 is often more efficient than float64, with one caveat:

# it reduces memory usage and can reduce processing time by leveraging the CPU's vector instructions,
# but it might lead to numerical stability issues

In [14]:
# Import statements
import numpy as np
from sklearn import kernel_approximation

# Example data: a 10 x 2000 array of random floats from a seeded generator
rng = np.random.RandomState(0)
X = rng.rand(10, 2000)

# Cast the input to float32 so that dtype preservation can be observed.
# (This line used to read `X = np.a`, a truncated call that raised
# "AttributeError: module 'numpy' has no attribute 'a'" and aborted the cell.)
X = np.array(X, dtype='float32')

# Transformer: fit it on the float32 input and transform that same input
transformer = kernel_approximation.RBFSampler()
X_new = transformer.fit_transform(X)

# Display both dtypes together: a bare expression in the middle of a cell is evaluated
# but never shown, so only a final expression produces output.
# NOTE: re-run this cell to record its output; the scikit-learn documentation shows
# float32 input staying float32 after this transformer.
X.dtype, X_new.dtype

In [15]:
from sklearn import datasets
from sklearn.svm import SVC

In [16]:
# Datasets
iris = datasets.load_iris()

clf = SVC()

# Fit on the integer class labels stored in iris.target
clf.fit(iris.data, iris.target)

# Predict: a model trained on integer targets returns integer class ids
predicted_ids = list(clf.predict(iris.data[:3]))  # -> [0, 0, 0]

# Refit on string labels: indexing target_names with the integer targets maps every
# class id to its species name (class 0 is 'setosa', as the displayed output confirms)
clf.fit(iris.data, iris.target_names[iris.target])

# The same three samples are now predicted as species names instead of integer ids
list(clf.predict(iris.data[:3]))

['setosa', 'setosa', 'setosa']

In [None]:
# Refitting and updating hyper-parameters
#
# Pick the right estimator - what technique or model should you use?
# https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
clf = SVC()

# set_params changes a hyper-parameter on the existing estimator; calling fit again
# overwrites whatever the previous fit had learned.
predictions_by_kernel = {}
for kernel in ('linear', 'rbf'):
    clf.set_params(kernel=kernel).fit(X, y)
    predictions_by_kernel[kernel] = clf.predict(X[:5])

# Show both results. A bare expression in the middle of a cell is evaluated but never
# displayed, so the 'linear' predictions used to be silently discarded.
predictions_by_kernel