# KNN

## 1.Load the dataset (iris.csv).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

In [None]:
from sklearn.datasets import load_iris

data = load_iris()

In [None]:
df = pd.DataFrame(data.data, 
                  columns=data.feature_names)

In [None]:
df.head()

In [None]:
df['target'] = data.target

In [None]:
df.head()

> The target column has 3 classes; their counts and names are listed below.

In [None]:
df['target'].value_counts(), data.target_names

In [None]:
df.info()

In [None]:
df.describe()

## 2.Split dataset into test and train (20:80).

In [None]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 3.Build KNN classifier with k value as 2 for identifying the flower Species.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn2 = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)

## 4.Build KNN classifier with k value as 4 for identifying the flower Species.

In [None]:
knn4 = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)

## 5.Evaluate the step-3 and step-4.

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

- For step 3 (k = 2)

In [None]:
accuracy_score(y_test, knn2.predict(X_test)) * 100

In [None]:
sns.heatmap(confusion_matrix(y_test, knn2.predict(X_test)),
            annot=True)

- For step 4 (k = 4)

In [None]:
accuracy_score(y_test, knn4.predict(X_test)) * 100

In [None]:
sns.heatmap(confusion_matrix(y_test, knn4.predict(X_test)),
            annot=True)

## 6.Design a method for calculating the distance between data points for the given dataset

In [None]:
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between the feature parts of two rows.

    When both rows have the same length, the final element of each is treated
    as a class label and left out of the computation. When the lengths
    differ, the shorter row is taken to be an unlabeled query point and every
    one of its values is compared, so no feature is silently dropped.

    Args:
        row1: Sequence of numeric values, optionally ending in a label.
        row2: Sequence of numeric values, optionally ending in a label.

    Returns:
        The square root (via ``np.sqrt``) of the summed squared differences
        over the compared positions; ``0.0`` when nothing is compared.
    """
    len1, len2 = len(row1), len(row2)
    # Equal lengths: both rows end in a label, so skip the final position.
    # Unequal lengths: the shorter row carries no label; use all its entries.
    n_features = len1 - 1 if len1 == len2 else min(len1, len2)
    squared = 0.0
    for a, b in zip(row1[:n_features], row2[:n_features]):
        squared += (a - b) ** 2
    return np.sqrt(squared)

## 7.Design a method for finding the nearest neighbours of a given data point using the above method.

In [None]:
def get_neighbors(train, test_row, num_neighbors):
    """Return the ``num_neighbors`` rows of ``train`` closest to ``test_row``.

    Every training row is scored with ``euclidean_distance(test_row, row)``.
    The rows are then ranked by ascending distance (rows at equal distance
    keep their order from ``train``) and the leading ``num_neighbors`` rows
    are returned as a list.
    """
    scored = [(candidate, euclidean_distance(test_row, candidate))
              for candidate in train]
    ranked = sorted(scored, key=lambda pair: pair[1])
    # Index explicitly so asking for more rows than exist still raises.
    return [ranked[rank][0] for rank in range(num_neighbors)]

## 8.Design a method for predicting the class of a data point using the above two methods.

In [None]:
def predict_classification(train, test_row, num_neighbors):
    """Predict the class of ``test_row`` by majority vote of its neighbours.

    The neighbours come from ``get_neighbors``; the last element of each
    neighbouring row is taken as its class label, and the most frequent label
    is returned.

    Args:
        train: Iterable of rows whose final element is the class label.
        test_row: The row to classify.
        num_neighbors: How many nearest rows take part in the vote.

    Returns:
        The label with the most votes. If several labels tie, the one that
        appears first among the neighbours is returned.

    Raises:
        ValueError: If there are no neighbours to vote (for example when
            ``num_neighbors`` is 0).
    """
    from collections import Counter

    neighbors = get_neighbors(train, test_row, num_neighbors)
    votes = Counter(row[-1] for row in neighbors)
    if not votes:
        raise ValueError("cannot predict a class from zero neighbors")
    # most_common orders equal counts by first appearance, and get_neighbors
    # returns rows sorted by ascending distance, so a tied vote goes to the
    # class of the closest neighbour instead of depending on set ordering.
    return votes.most_common(1)[0][0]

In [None]:
# Classify one hand-written measurement against the full labelled table.
sample = [5.7, 2.9, 4.2, 1.3]
predicted = int(predict_classification(df.values, sample, num_neighbors=5))
print(f"Original - 1, Predicted - {predicted}")

## 9.Choose any dataset from Kaggle or UCI repository suitable for regression and apply KNN algorithm for regression.

In [None]:
df = pd.read_csv('insurance.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isnull().sum()

- no null values

In [None]:
df.describe()

In [None]:
# Print the name of every column stored with the object dtype.
object_columns = [name for name in df.columns if df[name].dtype == 'O']
for name in object_columns:
    print(name)

- we have 3 categorical columns

In [None]:
df.head()

In [None]:
# Print the value frequencies of every object-dtype column.
object_columns = (name for name in df.columns if df[name].dtype == 'O')
for name in object_columns:
    print(df[name].value_counts())

- converting nominal categorical into numerical

In [None]:
# Integer code tables for the three categorical columns, applied in order.
# NOTE(review): the KNN regressor fitted later treats these codes as ordered
# magnitudes; consider one-hot encoding for `region` — confirm intent.
category_codes = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'no': 0, 'yes': 1},
    'region': {
        'southeast': 0,
        'southwest': 1,
        'northwest': 2,
        'northeast': 3,
    },
}

for column, codes in category_codes.items():
    df[column] = df[column].map(codes)

In [None]:
df.head()

In [None]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# without feature scaling

from sklearn.neighbors import KNeighborsRegressor

reg = KNeighborsRegressor(n_neighbors=9).fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
print(f"R2 score {r2_score(y_test, reg.predict(X_test))}")

In [None]:
print(f"mean absolute error {mean_absolute_error(y_test, reg.predict(X_test))}")

In [None]:
print(f"mean squared error {mean_squared_error(y_test, reg.predict(X_test))}")

## 10.Evaluate the designed regression model with appropriate metric.

In [None]:
# feature scaling

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

In [None]:
X_train = sc.fit_transform(X_train)

X_test = sc.transform(X_test)

In [None]:
reg1 = KNeighborsRegressor(n_neighbors=6).fit(X_train, y_train)

In [None]:
print(f"R2 score {r2_score(y_test, reg1.predict(X_test))*100}%")

In [None]:
print(f"mean absolute error {mean_absolute_error(y_test, reg1.predict(X_test))}")

In [None]:
print(f"mean squared error {mean_squared_error(y_test, reg1.predict(X_test))}")