# Other Classification - KNN 

In [None]:
# import everything
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

We are going to use the iris dataset for this example, loaded from the local file `iris-data-clean.csv`.


In [None]:
from sklearn import datasets

# Load the iris measurements from the CSV file (path is relative to the
# notebook's working directory) and preview the last few rows.
iris_data = pd.read_csv('iris-data-clean.csv')
iris_data.tail()

In [None]:
# Work on an independent copy of the loaded data so that later in-place edits
# to `df` (such as re-encoding the 'class' column) leave `iris_data` untouched.
df = iris_data.copy()

# Preview the last few rows (note the call parentheses: a bare `df.tail`
# would only display the bound method, not the data).
df.tail()

In [None]:
def myfunction(x):
    """Encode an iris class label as an integer code.

    Parameters
    ----------
    x : str or int
        A class label. The strings ``"Setosa"`` and ``"Virginica"`` are
        matched exactly (case-sensitive).

    Returns
    -------
    int
        ``0`` for ``"Setosa"``, ``1`` for ``"Virginica"`` and ``2`` for any
        other label. A non-string value that already equals one of the codes
        0, 1 or 2 is returned unchanged.
    """
    # Pass already-encoded codes through so that applying the encoding a
    # second time (e.g. re-running the cell) does not turn every label into 2.
    if not isinstance(x, str) and x in (0, 1, 2):
        return int(x)
    if x == "Setosa":
        return 0
    if x == "Virginica":
        return 1
    # NOTE(review): every unrecognised label (including differently spelled
    # or differently cased names) falls through to 2 -- confirm that the data
    # file spells the class names exactly as matched above.
    return 2
    
# Replace each label in the 'class' column with the integer code returned by
# myfunction. The column is overwritten in place, so the result depends on
# what the column currently holds.
df["class"] = df["class"].apply(myfunction)
df.head()

In [None]:
# Check the last rows as well, now that 'class' holds integer codes.
df.tail()

In [None]:
# Features: the first four columns of the frame (sepal length, sepal width,
# petal length and petal width).
X = df.iloc[:, :4]

# Target: 'class' is the column the model will learn to predict.
y = df['class']
X.head()

# Visualize the Data

In [None]:

# Pairwise scatter plots of every column, with points coloured by 'class'.
sb.pairplot(data=df, hue='class')
plt.show()

### Train our Model

Now we can do a train/test split, then use the training set to train our model.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Split features and target into training and test parts. No test_size is
# given, so scikit-learn's default proportion applies; the fixed random_state
# makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:

# k = 3 is used here because the data set is small.
# No single k suits every problem; a small odd number is the usual starting
# point, and a larger k generally makes prediction more costly.
knn_model = KNeighborsClassifier(n_neighbors=3)

# Fit on the training split only; the test split is kept aside for evaluation.
knn_model.fit(X_train, y_train)

In [None]:
# Predict a class for every row of the test features.
y_pred = knn_model.predict(X_test)

In [None]:
# Show the predicted class codes for the test rows.
print(y_pred)

### Model Validation

Like Linear Regression, we want to know how well our model predicts.
Since we are doing classification with K-Nearest Neighbors, we want to use 
`accuracy_score()` from `sklearn.metrics`.
A result closer to 1 means better prediction.

In [None]:
from sklearn.metrics import accuracy_score

# Score the predictions against the true test labels.
print(accuracy_score(y_test, y_pred))

In [None]:
# We can use this model to predict the class of any new measurement.
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# with the same column names; passing a bare nested list makes scikit-learn
# warn that X does not have valid feature names.
new_sample = pd.DataFrame([[4.9, 3.5, 1.6, 0.25]], columns=X_train.columns)
print(knn_model.predict(new_sample))

### Experimentation - choosing only two features

In [None]:
# Keep only the columns at positions 2 and 3 as features for this experiment;
# the target is the same 'class' column as before.
X2 = df.iloc[:, 2:4]
y2 = df['class']
X2.head()

In [None]:
# One sub-frame per class code (0, 1 and 2), used for plotting each class
# separately.
df_zero, df_one, df_two = (
    df.loc[df['class'] == code] for code in (0, 1, 2)
)
df_zero.head()

In [None]:
# Scatter the two features at column positions 2 and 3, one colour per class.
fig, ax = plt.subplots(figsize=(10, 10))

# One scatter call per class subset; the label feeds the legend so the
# colours can be told apart. The position in the tuple equals the class code.
for class_code, class_df in enumerate((df_zero, df_one, df_two)):
    ax.scatter(class_df.iloc[:, 2], class_df.iloc[:, 3], label=str(class_code))

ax.set(
    xlabel='petal length /cm',
    ylabel='petal width /cm',
    title='Petal width vs. petal length by class',
)
ax.legend(title='class')
plt.show()

In [None]:
# Split the two-feature data together with its own target `y2`, using the
# same random_state as the four-feature split so both experiments are
# evaluated on the same rows.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=42)

In [None]:
# Preview the first rows of the two-feature training set.
X2_train.head()

In [None]:
# Second classifier with the same k = 3, trained on the two-feature split.
knn_model2 = KNeighborsClassifier(n_neighbors=3)

knn_model2.fit(X2_train, y2_train)

In [None]:
# Predict a class for every row of the two-feature test set and show the codes.
y2_pred = knn_model2.predict(X2_test)
print(y2_pred)

In [None]:
# True labels of the test rows, for comparison with the predictions.
print(y2_test)

In [None]:
# Score the two-feature model's predictions against the true test labels.
print(accuracy_score(y2_test, y2_pred))