# K-Nearest Neighbors using sklearn (for penguins' dataset from the book - 01_Machine-Learning-for-Absolute-Beginners.pdf by Oliver Theobald

In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Import dataset
df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv')

In [3]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [4]:
# Delete sex variable
del df['sex']

In [6]:
# Drop missing rows
df.dropna(axis = 0, how = 'any', subset = None, inplace = True)

In [7]:
# Convert non-numeric data using one-hot encoding
df = pd.get_dummies(df, columns=['island'])

In [8]:
# Scale independent variables by dropping the dependent variable (sex)
scaler = StandardScaler()
scaler.fit(df.drop('species',axis=1))
scaled_df = scaler.transform(df.drop('species',axis=1))

In [9]:
# Assign X and y variables
X = scaled_df
y = df['species']

In [10]:
# Split data into test/train set (70/30 split) and shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [11]:
# Assign algorithm 
model = KNeighborsClassifier(n_neighbors=5) # the default value is 5

In [12]:
# Fit algorithm to data
model.fit(X_train, y_train)

In [13]:
# Run algorithm on test data to make predictions
model_test = model.predict(X_test)

In [14]:
# Evaluate predictions
print(confusion_matrix(y_test, model_test))
print(classification_report(y_test, model_test))

[[42  1  0]
 [ 1 20  0]
 [ 0  0 39]]
              precision    recall  f1-score   support

      Adelie       0.98      0.98      0.98        43
   Chinstrap       0.95      0.95      0.95        21
      Gentoo       1.00      1.00      1.00        39

    accuracy                           0.98       103
   macro avg       0.98      0.98      0.98       103
weighted avg       0.98      0.98      0.98       103



In [15]:
# Data point to predict
penguin = [
	39, #bill_length_mm
	18.5, #bill_depth_mm
	180, #flipper_length_mm 
	3750, #body_mass_g
	0, #island_Biscoe    
	0, #island_Dream
	1, #island_Torgersen    
]

# Make prediction
new_penguin = model.predict([penguin])
new_penguin

array(['Gentoo'], dtype=object)

In [None]:
# that Gentoo answer is wrong actually need to get it as Adelie