# Finding an Observation's Nearest Neighbors

You need to find an observation's k nearest observations (neighbors).
Use scikit-learn's NearestNeighbors.

In [1]:
# Load Libraries
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the iris dataset and keep its feature matrix
iris = datasets.load_iris()
features = iris.data

In [3]:
# Create the standardizer used to rescale the features
standardizer = StandardScaler()

In [4]:
# Standardize features: fit the standardizer and transform in one step
features_standardized = standardizer.fit_transform(features)

In [5]:
# Configure a two-nearest-neighbors search, then fit it on the
# standardized features (trailing ';' keeps the cell output empty)
nearest_neighbors = NearestNeighbors(n_neighbors=2)
nearest_neighbors.fit(features_standardized);

In [6]:
# Create an observation (one value per feature)
new_observation = [1, 1, 1, 1]

In [7]:
# Find the distances to, and the indices of, the observation's nearest neighbors.
# The observation is wrapped in a list so that a single query row is passed.
distances, indices = nearest_neighbors.kneighbors([new_observation])

In [8]:
# View the nearest neighbors: the standardized feature rows at the returned indices
features_standardized[indices]

array([[[1.03800476, 0.55861082, 1.10378283, 1.18556721],
        [0.79566902, 0.32841405, 0.76275827, 1.05393502]]])

We can set the distance metric using the `metric` parameter, for example `NearestNeighbors(n_neighbors=2, metric='euclidean')`.

In [9]:
# View the distances from the new observation to each of its two neighbors
distances

array([[0.49140089, 0.74294782]])

# Creating a K-Nearest Neighbor Classifier

Given an observation of unknown class, you need to predict its class based on the class of its neighbors.
If the dataset is not very large, use KNeighborsClassifier.

In [1]:
# Load Libraries 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import datasets


In [2]:
# Load the iris dataset and unpack its features and class labels
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [3]:
# Create the standardizer used to rescale the features
standardizer = StandardScaler()

In [4]:
# Standardize features: fit the standardizer and transform in one step
X_std = standardizer.fit_transform(X)

In [6]:
# Train a KNN classifier with 5 neighbors
# (trailing ';' keeps the cell output empty)
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_std, y);

In [7]:
# Create two new observations
new_observation = [[0.75, 0.75, 0.75, 0.75], [1, 1, 1, 1]]

In [8]:
# Predict the class of each of the two observations
knn.predict(new_observation)

array([1, 2])

In [9]:
# View the full target vector of class labels
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [10]:
# View the probability that each observation belongs to each of the three classes
knn.predict_proba(new_observation)

array([[0. , 0.6, 0.4],
       [0. , 0. , 1. ]])

In [11]:
# Predict the classes of the two observations again, for comparison with the probabilities
knn.predict(new_observation)

array([1, 2])

# Identifying the Best Neighborhood Size

You want to select the best value of k for a k-nearest neighbors classifier.
Use model selection techniques like GridSearchCV.

In [12]:
# Load Libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

In [13]:
# Load the iris dataset and unpack its features and class labels
iris = datasets.load_iris()
features, target = iris.data, iris.target

In [14]:
# Create the standardizer that becomes the first step of the pipeline
standardizer = StandardScaler()

In [15]:
# Standardize features: fit the standardizer on all rows and transform them
features_standardized = standardizer.fit_transform(features)

In [16]:
# Create a pipeline. The KNN step is a fresh, unfitted estimator built here
# rather than a `knn` object left over from an earlier recipe, so this cell
# also runs on a clean kernel. n_neighbors=5 is only a starting value; the
# grid search overrides it with each candidate k.
pipe = Pipeline([
    ('standardizer', standardizer),
    ('knn', KNeighborsClassifier(n_neighbors=5, n_jobs=-1)),
])

In [17]:
# Display the pipeline and its steps
pipe

In [18]:
# Candidate neighborhood sizes to search over: k = 1 through 10
search_space = [{'knn__n_neighbors': list(range(1, 11))}]

In [19]:
# Create the grid search and fit it on the raw features. The pipeline's
# 'standardizer' step rescales the data itself inside each cross-validation
# split, so feeding it pre-standardized data would only standardize twice and
# leave the fitted pipeline expecting already-scaled inputs.
classifier = GridSearchCV(pipe, search_space, cv=5, verbose=0).fit(features, target)

When that is completed, we can see the k that produces the best model.

In [20]:
# Best neighborhood size (k), read from the winning parameter combination
classifier.best_params_['knn__n_neighbors']

6

# Creating a Radius-Based Nearest Neighbor Classifier

Given an observation of unknown class, you need to predict its class based on the class of all observations within a certain distance. Use RadiusNeighborsClassifier.

In [21]:
# Load Libraries
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

In [23]:
# Load the iris dataset and unpack its features and class labels
iris = datasets.load_iris()
features, target = iris.data, iris.target

In [24]:
# Create the standardizer used to rescale the features
standardizer = StandardScaler()

In [25]:
# Standardize features: fit the standardizer and transform in one step
features_standardized = standardizer.fit_transform(features)

In [26]:
# Train a radius-neighbors classifier with a radius of 0.5
# (trailing ';' keeps the cell output empty)
rnn = RadiusNeighborsClassifier(radius=.5, n_jobs=-1)
rnn.fit(features_standardized, target);

In [28]:
# Create a single new observation (a list containing one row)
new_observation = [[1,1,1,1]]

In [29]:
# Predict the class of the observation
rnn.predict(new_observation)

array([2])

In [31]:
# Display the raw (unstandardized) feature matrix
features

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3