In [1]:
import numpy as np

# Let's create feature arrays for training and testing to simulate the assignment
train_feats = np.random.randint(10, size=(5, 10))
test_feats = np.random.randint(10, size=(2, 10))

print(f'Training features: \n{train_feats}')
print(f'Test features: \n{test_feats}')

# Semantics of this features matrix:
# We have 5 tweets in training data with 10 unique word and 2 tweets in test data.
# Number are the counts of these unique words in the tweets!

Training features: 
[[5 1 9 7 3 8 1 1 9 0]
 [0 8 2 4 9 1 8 9 3 2]
 [0 1 9 8 0 0 9 6 9 4]
 [9 3 5 9 8 4 3 0 3 6]
 [8 3 2 5 2 5 0 8 2 0]]
Test features: 
[[7 7 6 1 3 6 5 9 0 5]
 [2 6 9 4 8 8 1 1 3 5]]


In [2]:
# Let's create label vectors for both training and testing data
train_labels = np.array([0,2,2,0,2])
test_labels = np.array([2])

print(f'Training labels: {train_labels}')
print(f'Test labels: {test_labels}')

# Semantics of this label vectors:
# We have 4 unique labels [0,1,2,3]
# These two vectors consists of a label for each sentence in training (5) and test (2)

Training labels: [0 2 2 0 2]
Test labels: [2]


In [4]:
# For the KNN implementation:
# First we need to calculate distance between each test features and train features

# For the efficiency, you can use a scipy function:
from scipy.spatial.distance import cdist 

# 2D array of distances between all test and all training tweets
# Shape (Test X Train)
distance_values = cdist(test_feats, train_feats, 'cosine')
print(f'Cosine distances between test and train features: \n{distance_values}')

# In this matrix, each row index represent a test tweet and each colomn index represent a training tweet
# Therefore, if you want to obtain cosine distance between 1st test tweet and 3rd training tweet:
print(f'Cosine distance: {distance_values[0,2]}')

# Note that these values are cosine distances, if you want to check cosine similarity:
# cosine_simimilarity = 1 - cosine_distance
print(f'Cosine similarity: {1 - distance_values[0,2]}')

K = 3

# After that you need to sort this values to obtain K closest training tweets over each test tweet:
# You can use np.argsort() for sorting


# Later, you need to obtain labels of N neighbours and do voting to find max voted prediction for each tweet
# You can use np.bincount again to sum votes for each class and select max voted class with np.argmax

# You can also do with for loop if you find it easier !!!

Cosine distances between test and train features: 
[[0.4414118  0.2565371  0.43814221 0.32575678 0.1880213 ]
 [0.2103133  0.36597054 0.45015019 0.19090417 0.41979889]]
Cosine distance: 0.4381422129053477
Cosine similarity: 0.5618577870946523
Closest K neighbours per each test tweet: [[4 1 3]
 [3 0 1]]
