# Loading the data

In [17]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')    # To suppress all the warnings in the notebook.
pd.set_option('mode.chained_assignment', None) # To suppress pandas warnings.
np.set_printoptions(precision=4) # To display values only upto four decimal places.

In [1]:
# Bring in the loader for the iris dataset that ships with scikit-learn
from sklearn.datasets import load_iris

# load_iris() returns a "bunch" object holding the dataset and its attributes
iris = load_iris()

# Unpack the feature matrix into "X" and the response vector into "y"
X, y = iris.data, iris.target

In [2]:
# Show the dimensions of X and y, one per line, to confirm that both
# describe the same number of observations
print(X.shape, y.shape, sep="\n")

(150, 4)
(150,)


# scikit-learn 4-step modeling pattern
Step 1: Import the class you plan to use

In [3]:
from sklearn.neighbors import KNeighborsClassifier


Step 2: "Instantiate" the "estimator" (i.e., create the model object)

"Estimator" is scikit-learn's term for model
"Instantiate" means "make an instance of"

In [6]:
# Create the KNN estimator with K=3; every parameter not passed here keeps
# its default value.
knn = KNeighborsClassifier(n_neighbors=3)

* Name of the object does not matter
* Can specify tuning parameters (aka "hyperparameters") during this step
* All parameters not specified are set to their defaults

In [7]:
# Display the estimator's parameters, including the defaults that were not
# set explicitly when it was created.
print(knn)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')


Step 3: Fit the model with data (aka "model training")

Model is learning the relationship between X and y
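Occurs in-place: `fit` updates the `knn` object itself, so the result does not need to be assigned to another variable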

In [8]:
# Train on the complete dataset (no held-out test set at this point). The
# fitted estimator is returned, so its repr is shown as the cell output.
knn.fit(X, y) # whole data without split

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

Step 4: Predict the response for a new observation

New observations are called "out-of-sample" data
Uses the information it learned during the model training process

In [11]:
# Two hand-written observations, one row each, with four feature values per
# row to match the four columns of X
X_new = [
    [3, 5, 4, 2],
    [5, 4, 3, 2],
]
# Ask the fitted model for a class label for each row
knn.predict(X_new)

array([1, 1])

In [14]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Split the data into training and testing sets. random_state=4 fixes the
# shuffle so the split (and the accuracy below) is reproducible; the split
# proportions are left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=4
)

# Train KNN with K=5 on the training portion only, then measure classification
# accuracy on the held-out test portion
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

0.9736842105263158


# Using a different classification model

In [12]:
# Same four-step pattern as before, with a different estimator class
from sklearn.linear_model import LogisticRegression

# Instantiate with the default parameters and train on the full dataset in
# one chained step (fit returns the estimator itself)
logreg = LogisticRegression().fit(X, y)

# Predict the response for the same new observations that were given to KNN
logreg.predict(X_new)



array([2, 0])

In [15]:
# Recreate the train/test split with the same random_state=4 used for the KNN
# evaluation, so both models are scored on the same held-out observations
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=4
)

# Check classification accuracy of logistic regression (default parameters)
# on the held-out test portion
logreg = LogisticRegression().fit(X_train, y_train)
pred = logreg.predict(X_test)
print(metrics.accuracy_score(y_true=y_test, y_pred=pred))

0.9210526315789473


