# KNN in TensorFlow

## Import Packages

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
tf.__version__

'2.0.0-beta1'

## Loading Data & Preprocessing

In [3]:
%%bigquery flights_df --verbose
SELECT date,
       airline,
       departure_airport,
       arrival_airport,
       departure_lat,
       departure_lon,
       arrival_lat,
       arrival_lon,
       departure_delay,
       arrival_delay,
       CASE WHEN (arrival_delay >= 15) THEN 1 ELSE 0 END AS delayed
    
FROM `bigquery-samples.airline_ontime_data.flights`
Where departure_airport = 'SFO' 
      AND date >= '2009-01-01' 
      AND date <= '2009-01-31'
      AND departure_delay <= 120
      AND departure_delay >= 1

Executing query with job ID: 6a6395ff-29b4-4b4e-b3a6-e7496b760a0c
Query executing: 0.61s
Query complete after 1.11s


In [4]:
%%bigquery arrival_airport_codes --verbose
SELECT DISTINCT(arrival_airport)
    
FROM `bigquery-samples.airline_ontime_data.flights`
Where departure_airport = 'SFO' 
      AND date >= '2009-01-01' 
      AND date <= '2009-12-31'

Executing query with job ID: 9d4ecbdb-c013-4b0e-8d94-48fc64b3f99f
Query executing: 0.53s
Query complete after 1.02s


In [5]:
flights_processed_df = (flights_df
                        .filter(['departure_delay', 'arrival_airport','delayed'])
                       )

flights_processed_df['arrival_airport'] = pd.Categorical(flights_processed_df['arrival_airport'], 
                                                         arrival_airport_codes['arrival_airport'])


In [6]:
flights_processed_df.head()

Unnamed: 0,departure_delay,arrival_airport,delayed
0,2.0,AUS,0
1,5.0,AUS,0
2,9.0,AUS,0
3,30.0,AUS,1
4,12.0,AUS,1


In [7]:
flights_processed_df.dtypes

departure_delay     float64
arrival_airport    category
delayed               int64
dtype: object

## KNN

In [8]:
features = flights_processed_df 
labels = flights_processed_df.pop('delayed')

### Feature preprocessing

In [9]:
features = pd.get_dummies(features)

In [10]:
features.head(3)

Unnamed: 0,departure_delay,arrival_airport_BUR,arrival_airport_EUG,arrival_airport_LGB,arrival_airport_MCO,arrival_airport_KOA,arrival_airport_EWR,arrival_airport_BZN,arrival_airport_PHX,arrival_airport_DTW,...,arrival_airport_SLC,arrival_airport_RDD,arrival_airport_PIT,arrival_airport_MCI,arrival_airport_CVG,arrival_airport_PHL,arrival_airport_OTH,arrival_airport_OGG,arrival_airport_MEM,arrival_airport_SAT
0,2.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.preprocessing import normalize
features = normalize(features, axis=0)

In [12]:
labels = pd.get_dummies(labels.astype(str)).to_numpy()

### Training-Testing-Split

In [13]:
np.random.seed(12)
split = 0.8
train_indices = np.random.choice(len(features), round(len(features) * split), replace = False)
test_indices = np.array(list(set(range(len(features))) - set(train_indices)))

train_features = features[train_indices]
test_features = features[test_indices]
train_labels = labels[train_indices]
test_labels = labels[test_indices]

### KNN Prediction Function

In [14]:
def knn_prediction(train_features, train_labels, test_features, k):
    
    # Calculate the absolute differences for each feature of the training and test data
    differences = tf.abs(tf.subtract(train_features, tf.expand_dims(test_features, axis = 1))) # Manhattan distance
    
    # Create the distance matrix by summing up the differences over all features
    difference_matrices = tf.reduce_sum(differences, axis = 2)
    
    # Find the K neighbors in the training data with the least distance
    _, k_neighbors_indices = tf.nn.top_k(tf.negative(difference_matrices), k = k) # there is no last k so we have to invert
    
    # Retrieve the labels of the neighbors
    k_neighbors_labels = tf.gather(train_labels, k_neighbors_indices)
    
    # Get the majority vote of the K nearest labels
    majority_vote = tf.argmax(tf.reduce_sum(k_neighbors_labels, axis = 1), axis = 1)
    
    # Return prediction
    return majority_vote

In [15]:
k = 3

In [16]:
predicted_labels = knn_prediction(train_features, train_labels, test_features, k)

In [17]:
predicted_labels.numpy()[:5]

array([1, 0, 1, 1, 1])

### Performance Evaluation

In [18]:
test_labels = tf.argmax(test_labels, axis = 1) # reconstruct test labels

In [19]:
results_df = pd.DataFrame({'Predicted': predicted_labels, 'Actual': test_labels})
results_df.head(5)

Unnamed: 0,Actual,Predicted
0,1,1
1,0,0
2,1,1
3,1,1
4,1,1


In [20]:
results_df = pd.DataFrame({'Predicted': predicted_labels, 'Actual': test_labels})

In [30]:
TP = len(results_df.query('Predicted == 1 & Actual == 1'))
TN = len(results_df.query('Predicted == 0 & Actual == 0'))

In [33]:
accuracy = (TP + TN) / len(results_df)
accuracy

0.8400556328233658