### This is a simple notebook to build and visualize the kNN algorithm.

It accompanies Chapter 2 of the book.

Author: Viviana Acquaviva

In [None]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

import sklearn
from sklearn import metrics # this will give us access to evaluation metrics
from sklearn import neighbors # here comes the method of the day
from sklearn import preprocessing # scalers (e.g., RobustScaler) used later in the notebook
from sklearn.model_selection import train_test_split # we don't use it here, but it's a useful function!
from sklearn.tree import DecisionTreeClassifier # how methods are imported

In [None]:
# Global plot styling: large fonts for text and tick labels, high-resolution figures.
font = {'size': 20}
matplotlib.rcParams.update({
    'font.size': font['size'],
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
    'figure.dpi': 300,
})

### Read in data from file

In [None]:
# Read the learning set from disk, then discard the first column of the file.
raw_table = pd.read_csv('../data/HPLearningSet.csv')

LearningSet = raw_table.drop(columns=raw_table.columns[0]) #We want to drop the first column of the file

In [None]:
#By now we know data frames

LearningSet.head() #Visualizes the first 5 rows

### Let's pick the same train/test set we had in the exercise

In [None]:
# Positional split: the first 13 rows form the train set, the remaining rows the test set.
n_train = 13

TrainSet = LearningSet.iloc[:n_train] #.iloc slices data frames by integer position

TestSet = LearningSet.iloc[n_train:]

### We split the train and test sets in features and label

In [None]:
# Everything except the planet name and the label is a feature
# (stellar mass, period, and distance).
non_feature_columns = ['P_NAME', 'P_HABITABLE']

Xtrain = TrainSet.drop(columns=non_feature_columns)

Xtest = TestSet.drop(columns=non_feature_columns)

In [None]:
# Ground truth labels (the output we want to predict) for each split.
ytrain = TrainSet['P_HABITABLE']

ytest = TestSet['P_HABITABLE']

### We are now ready to deploy the kNN (k Nearest Neighbor) algorithm.

It's a simple algorithm based on the idea of distance: we look for the k (an integer) objects in the training set that are closest to the one we would like to classify, and assign it the class chosen by majority vote among those k neighbors.

If you are wondering: what is even there to fit?

I had the same question, and found some solace in [this post](https://stats.stackexchange.com/questions/349842/why-do-we-need-to-fit-a-k-nearest-neighbors-classifier).

In [None]:
# Create (but do not yet fit) a kNN classifier that uses the 3 nearest neighbors.
model = neighbors.KNeighborsClassifier(n_neighbors = 3)

In [None]:
# Display the estimator to inspect how it was configured.
model

# Learning Check-In: 

How would you change the code to increase the number of neighbors to 5? Test your code in the cell below.

In [None]:
# Enter code in this cell



<details><summary><b>Click here for the answer!</b></summary>
<p>

```python
model = neighbors.KNeighborsClassifier(n_neighbors = 5)
```

</p>
</details>

### For visualization purposes, let's use only the first two features to build the model.

#### Build model by fitting training set; predict labels for test set

In [None]:
# Fit on the training set, then predict on the test set, as two separate calls.
# (fit returns the estimator, so the two steps can also be chained: model.fit(...).predict(...))

model.fit(Xtrain.iloc[:,:2],ytrain) #this fits the model, which can then be used to predict labels

ytestpred = model.predict(Xtest.iloc[:,:2]) #this uses the fitted model to predict the labels of the 5 objects in the test set

In [None]:
# Predicted labels for the test set, as returned by model.predict.
ytestpred

### Learning Check-in
   
Can you predict the labels for the training set? What is the correct code? Test your code in the cell below.


In [None]:
# Enter code in this cell



<details><summary><b>Click here for the answer!</b></summary>
<p>

```python
ytrainpred = model.predict(Xtrain.iloc[:,:2])
```

</p>
</details>

In [None]:
ytestpred, ytest.values #compare: predicted test labels (first) vs. true test labels (second)

#### Calculate accuracy on the train set and on the test set (train score and test score)

# Learning Check-In:

Calculate the accuracy on the train set and on the test set (train score and test score)

<details><summary><b>Click here for the answer!</b></summary>
<p>
   
```markdown
~ 0.692
    
0.8
```
   
</p>
</details>

In [None]:
# Train score: true train labels vs. the labels predicted for the train set.
train_accuracy = metrics.accuracy_score(ytrain, model.predict(Xtrain.iloc[:,:2]))

# Test score: true test labels vs. the labels predicted for the test set
# (same comparison that we did above).
test_accuracy = metrics.accuracy_score(ytest, model.predict(Xtest.iloc[:,:2]))

print(train_accuracy)

print(test_accuracy)

# Learning Check-in

Great! Now what would the train and test accuracy be if we increased the number of neighbors to 5?

<details><summary><b>Click here for the answer!</b></summary>
<p>
   
```
~ 0.615
0.8
```
   
</p>
</details>



#### After fitting and predicting, we can access the k neighbors for each element in the test set like this:

In [None]:
# Returns a (distances, indices) pair with one row per test object and one column per neighbor.
model.kneighbors(Xtest.iloc[:,:2]) #the first element gives the distances, the second the index of each neighbor in the data used for fitting

### Let's now visualize our results, similarly to what we did for the DT.

We can use the largest distances as the radius of the circles - every point inside the circle is a neighbor!


In [None]:
# Query the neighbors once for the whole test set instead of once per iteration.
neighbor_distances = model.kneighbors(Xtest.iloc[:,:2])[0]

# kneighbors returns each row sorted by increasing distance, so the last column is the
# distance to the farthest of the k neighbors (the third one when k = 3), whatever k is.
for farthest_distance in neighbor_distances[:,-1]: # cycle through elements of the test set

    print(farthest_distance)

In [None]:
# Scatter plot of the first two (unscaled) features. Each test object gets a circle whose
# radius is the distance to its farthest neighbor, so every point inside it is a neighbor.
fig, ax = plt.subplots(figsize=(10,6))

cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ['#20B2AA','#FF00FF'])

ax.scatter(TrainSet['S_MASS'], TrainSet['P_PERIOD'], marker="$\u2606$", facecolor = 'none',
           c = TrainSet['P_HABITABLE'], s = 100, label = 'Train', cmap=cmap)

ax.scatter(TestSet['S_MASS'], TestSet['P_PERIOD'], marker="$\u25EF$", facecolors = 'none',
           c = TestSet['P_HABITABLE'], s = 100, label = 'Test', cmap=cmap)

# Query the neighbors once (not once per circle). Rows are sorted by increasing distance,
# so the last column is the distance to the farthest of the k neighbors for any k.
radii = model.kneighbors(Xtest.iloc[:,:2])[0][:,-1]

for mass, period, radius in zip(TestSet['S_MASS'], TestSet['P_PERIOD'], radii): #plot neighbors

    ax.add_artist(plt.Circle((mass, period), radius, lw = 0.7, edgecolor='k', facecolor='none'))

# Equal aspect ratio so the neighbor circles are drawn as circles.
ax.set_aspect(1)

bluepatch = mpatches.Patch(color='#20B2AA', label='Not Habitable')
magentapatch = mpatches.Patch(color='#FF00FF', label='Habitable')

# Draw a first legend only to obtain handles for the two scatter series, then make
# those handles black and hollow so they describe the marker shape, not the class.
leg = ax.legend()

# Matplotlib 3.7 renamed `legendHandles` to `legend_handles`; the old name was later removed.
marker_handles = leg.legend_handles if hasattr(leg, 'legend_handles') else leg.legendHandles

for handle in marker_handles[:2]:
    handle.set_color('k')
    handle.set_facecolor('none')

# Final legend: marker shapes (train/test) plus color patches (class).
ax.legend(handles=[marker_handles[0], marker_handles[1], magentapatch, bluepatch],
          loc = 'upper left', fontsize = 14)

ax.set_xlim(-130,70)
ax.set_ylim(0,140)
ax.set_xlabel('Mass of Parent Star (Solar Mass Units)')
ax.set_ylabel('Period of Orbit (days)');

#plt.savefig('HabPlanetsKNN2features.png', dpi = 300)

### Do you notice any issue here?

### If one dimension has a much bigger range than others, it will dominate the decision process. This issue can be solved by <b>scaling</b>. Scaling is a very important pre-processing step for most ML algorithms.

See some examples of different scaling algorithms [here](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html).

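We will go with RobustScaler, which is more resistant to outliers than StandardScaler because it centers on the median and scales by the interquartile range, rather than using the mean and standard deviation.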


In [None]:
# Use the explicitly imported `preprocessing` submodule (same pattern as `neighbors` and
# `metrics`) rather than relying on `sklearn.preprocessing` being loaded as a side effect.
scaler = preprocessing.RobustScaler()

In [None]:
scaler.fit(Xtrain) # important: the scaler is fit on the train set only; the same fitted scaler is then applied to both train and test sets

In [None]:
# Apply the fitted scaler to the training features.
scaledXTrain = scaler.transform(Xtrain)

In [None]:
# Inspect the scaled training features.
scaledXTrain

In [None]:
# Apply the scaler that was fit on the train set to the test features.
scaledXtest = scaler.transform(Xtest) # note that these are now numpy arrays, not data frames

In [None]:
# Call the method so the cell shows data rather than a bound-method object.
scaler.inverse_transform(scaledXTrain) #This unscales: maps the scaled train features back to their original values

In [None]:
# Chained fit/predict on the first two scaled features. Note that this refits the same
# `model` object, so later calls on `model` use the scaled training data.
model.fit(scaledXTrain[:,:2],ytrain).predict(scaledXtest[:,:2])

In [None]:
# Same (distances, indices) query as before, now in the scaled feature space.
model.kneighbors(scaledXtest[:,:2]) #The distances of neighbors for test set objects look more balanced

In [None]:
# Same visualization as for the unscaled data, now in the scaled feature space: each test
# object gets a circle whose radius is the distance to its farthest neighbor.
fig, ax = plt.subplots(figsize=(10,6))

cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ['#20B2AA','#FF00FF'])

ax.scatter(scaledXTrain[:,0], scaledXTrain[:,1], marker = '*',
           c = ytrain, s = 100, label = 'Train', cmap=cmap)

ax.scatter(scaledXtest[:,0], scaledXtest[:,1], marker = 'o',
           c = ytest, s = 100, label = 'Test', cmap=cmap)

# Query the neighbors once (not once per circle). Rows are sorted by increasing distance,
# so the last column is the distance to the farthest of the k neighbors for any k.
radii = model.kneighbors(scaledXtest[:,:2])[0][:,-1]

for (scaled_mass, scaled_period), radius in zip(scaledXtest[:,:2], radii):

    ax.add_artist(plt.Circle((scaled_mass, scaled_period), radius,
                             edgecolor='k', facecolor='none', lw = 0.7))

# Equal aspect ratio so the neighbor circles are drawn as circles.
ax.set_aspect(1)

# Draw a first legend only to obtain handles for the two scatter series, then make
# them black so they describe the marker shape rather than the class color.
leg = ax.legend()

# Matplotlib 3.7 renamed `legendHandles` to `legend_handles`; the old name was later removed.
marker_handles = leg.legend_handles if hasattr(leg, 'legend_handles') else leg.legendHandles

for handle in marker_handles[:2]:
    handle.set_color('k')

ax.legend(handles=[marker_handles[0], marker_handles[1]], loc = 'upper right', fontsize = 14)

# The plotted values are RobustScaler outputs, so the axes are no longer in physical units.
ax.set_xlabel('Mass of Parent Star (scaled)')
ax.set_ylabel('Period of Orbit (scaled)')

ax.set_xlim(-2.5,2.5)
ax.set_ylim(-1.,2.5);

#plt.savefig('HabPlanetsKNNscaled.png', dpi = 300)

### Note: for the purpose of application (not visualization), we should use all three features.

### Final remarks:
    
kNN needs scaling! Does DT have the same issue?

Any thoughts on strengths/weaknesses?