In [14]:
from collections import Counter
import math

def knn(data, query, k, distance_fn, choice_fn):
    neighbor_distances_and_indices = []
    
    # 3. For each example in the data
    for index, example in enumerate(data):
        # 3.1 Calculate the distance between the query example and the current
        # example from the data.
        distance = distance_fn(example[:-1], query)
        
        # 3.2 Add the distance and the index of the example to an ordered collection
        neighbor_distances_and_indices.append((distance, index))
    
    # 4. Sort the ordered collection of distances and indices from
    # smallest to largest (in ascending order) by the distances
    sorted_neighbor_distances_and_indices = sorted(neighbor_distances_and_indices)
    
    # 5. Pick the first K entries from the sorted collection
    k_nearest_distances_and_indices = sorted_neighbor_distances_and_indices[:k]
    
    # 6. Get the labels of the selected K entries
    k_nearest_labels = [data[i][1] for distance, i in k_nearest_distances_and_indices]

    # 7. If regression (choice_fn = mean), return the average of the K labels
    # 8. If classification (choice_fn = mode), return the mode of the K labels
    return k_nearest_distances_and_indices , choice_fn(k_nearest_labels)

def mean(labels):
    return sum(labels) / len(labels)

def mode(labels):
    return Counter(labels).most_common(1)[0][0]

def euclidean_distance(point1, point2):
    sum_squared_distance = 0
    for i in range(len(point1)):
        sum_squared_distance += math.pow(point1[i] - point2[i], 2)
    return math.sqrt(sum_squared_distance)

def main():
    '''
    # Regression Data
    # 
    # Column 0: height (inches)
    # Column 1: weight (pounds)
    '''
    reg_data = [
       [65.75, 112.99],
       [71.52, 136.49],
       [69.40, 153.03],
       [68.22, 142.34],
       [67.79, 144.30],
       [68.70, 123.30],
       [69.80, 141.49],
       [70.01, 136.46],
       [67.90, 112.37],
       [66.49, 127.45],
    ]
    
    # Question:
    # Given the data we have, what's the best-guess at someone's weight if they are 60 inches tall?
    reg_query = [70]
    reg_k_nearest_neighbors, reg_prediction = knn(
        reg_data, reg_query, k=3, distance_fn=euclidean_distance, choice_fn=mean
    )
    
    '''
    # Classification Data
    # 
    # Column 0: age
    # Column 1: likes pineapple
    '''
    clf_data = [
       [22, 1],
       [23, 1],
       [21, 1],
       [18, 1],
       [19, 1],
       [25, 0],
       [27, 0],
       [29, 0],
       [31, 0],
       [45, 0],
    ]
    # Question:
    # Given the data we have, does a 33 year old like pineapples on their pizza?
    clf_query = [20]
    clf_k_nearest_neighbors, clf_prediction = knn(
        clf_data, clf_query, k=3, distance_fn=euclidean_distance, choice_fn=mode
    )
    print(clf_prediction)

if __name__ == '__main__':
    main()


1


In [15]:
def recommend_movies(movie_query, k_recommendations):
    raw_movies_data = []
    with open(r'C:\Users\PIYUSH\Desktop\movies\movies_recommendation_data.csv', 'r') as md:
        # Discard the first line (headings)
        next(md)

        # Read the data into memory
        for line in md.readlines():
            data_row = line.strip().split(',')
            raw_movies_data.append(data_row)

    # Prepare the data for use in the knn algorithm by picking
    # the relevant columns and converting the numeric columns
    # to numbers since they were read in as strings
    movies_recommendation_data = []
    for row in raw_movies_data:
        data_row = list(map(float, row[2:]))
        movies_recommendation_data.append(data_row)

    # Use the KNN algorithm to get the 5 movies that are most
    # similar to The Post.
    recommendation_indices, _ = knn(
        movies_recommendation_data, movie_query, k=k_recommendations,
        distance_fn=euclidean_distance, choice_fn=lambda x: None
    )

    movie_recommendations = []
    for _, index in recommendation_indices:
        movie_recommendations.append(raw_movies_data[index])

    return movie_recommendations

if __name__ == '__main__':
    the_post = [7.2, 1, 1, 0, 0, 0, 0, 1, 0] # feature vector for The Post
    recommended_movies = recommend_movies(movie_query=the_post, k_recommendations=5)

    # Print recommended movie titles
    for recommendation in recommended_movies:
        print(recommendation[1])

12 Years a Slave
Hacksaw Ridge
Queen of Katwe
The Wind Rises
A Beautiful Mind


In [18]:
from sklearn.datasets import fetch_mldata
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# The data that is imported is copied into the below variable

custom_data_home = "./"
mnist = fetch_mldata('MNIST original', data_home=custom_data_home)

# Variables used to define the number of traning and testing examples from the sample
traning_examples = 6000
testing_examples = 1000
total_examples = 60000
class_labels = 10

# Uses array from numpy for the mnist dat to convert to 32bit integer
mnist_data = np.array(mnist.data.shape, dtype=np.int32)
mnist_data = np.copy(mnist.data.astype(np.int32))


def knn_classifier():
    k = [1, 9, 19, 29, 39, 49, 59, 69, 79, 89, 99]
    # the imported data has labes that are orders
    # shuffle is used to shuffle the input image data labels
    shuffle = np.arange(total_examples)
    np.random.shuffle(shuffle)
    # Used to intilize zeros in the traning data variable and take first 6000 samples
    traning_data = np.zeros((traning_examples, mnist.data.shape[1]), dtype=np.int32)
    traning_data[:] = mnist_data[shuffle[:traning_examples]]
    traning_target = mnist.target[shuffle[:traning_examples]]
    # Used to intilize zeros in the testing data variables and the last 1000 samples
    testing_data = np.zeros((testing_examples, mnist.data.shape[1]), dtype=np.int32)
    testing_data[:] = mnist_data[shuffle[(total_examples - testing_examples): total_examples]]
    testing_target = mnist.target[shuffle[(total_examples - testing_examples): total_examples]]
    # Used to initilize zeros in the distance variable and convert it into 32 bit integer data type
    euclidean_distance = np.zeros((traning_examples, traning_examples), dtype=np.int32)
    euclidean_distance_copy = np.zeros((testing_examples, traning_examples), dtype=np.int32)
    # temporary variables to store ecudilian distance
    temporary = np.empty_like(euclidean_distance)
    temporary_copy = np.empty_like(euclidean_distance_copy)
    # Clasification varaible with traning and testing examples and their corrosponding labels
    classification = np.zeros((traning_examples, class_labels))
    classification_copy = np.zeros((testing_examples, class_labels))
    # Calcualtes ecudilian distance for traning data
    for i in range(0, traning_examples):
        euclidean_distance[i:i + 1, :] = \
            np.sqrt(np.sum(np.square(traning_data[:traning_examples, :] - traning_data[i, :]), axis=1))
    # Calculates eculidian data for testing data
    for i in range(0, testing_examples):
        euclidean_distance_copy[i:i + 1, :] = \
            np.sqrt(np.sum(np.square(traning_data[:, :] - testing_data[i, :]), axis=1))
    # Sorting the calcualted distance by size
    sorted_indices = np.argsort(euclidean_distance)
    sorted_indices_copy = np.argsort(euclidean_distance_copy)

    labels = traning_target[:][sorted_indices]
    labels_copy = traning_target[:][sorted_indices_copy]

    for i in range(0, traning_examples):
        temporary[i, :] = euclidean_distance[i, :][sorted_indices[i]]

    for i in range(0, testing_examples):
        temporary_copy[i, :] = euclidean_distance_copy[i, :][sorted_indices_copy[i]]

    euclidean_distance[:, :] = temporary[:, :]
    euclidean_distance_copy[:, :] = temporary_copy[:, :]
    # Used to delete the temporary variables created
    np.delete(temporary, np.s_[:], 1)
    np.delete(temporary_copy, np.s_[:], 1)

    traning_error = np.zeros((1, len(k)), dtype=np.float)
    testing_error = np.zeros((1, len(k)), dtype=np.float)

    for loop in k:
        for i in range(0, traning_examples):
            for j in range(0, loop):
                index = int(labels[i, j])
                classification[i, index] += 1

        for i in range(0, testing_examples):
            for j in range(0, loop):
                index = int(labels_copy[i, j])
                classification_copy[i, index] += 1

        temporary = np.argsort(classification)
        temporary_copy = np.argsort(classification_copy)

        for i in range(0, traning_examples):
            if temporary[i, class_labels - 1] != traning_target[i]:
                traning_error[0, k.index(loop)] += 1

        for i in range(0, testing_examples):
            if temporary_copy[i, class_labels - 1] != testing_target[i]:
                testing_error[0, k.index(loop)] += 1

        traning_error[0, k.index(loop)] /= traning_examples
        testing_error[0, k.index(loop)] /= testing_examples

    # plotting the graphs on same plane for errors vs value of K
    plt.xlabel('Value of K')
    plt.ylabel('Error')
    # red represents the test error
    # green represents the traning error
    red_box = mpatches.Patch(color='red', label='Test Error')
    green_box = mpatches.Patch(color='green', label='Traning Error')

    plt.legend(handles=[red_box, green_box])

    plt.plot(k, testing_error[0], color='red', marker='*')
    plt.plot(k, traning_error[0], color='g', marker='*')
    plt.title('knn Classifier')
    plt.show()


# Calling knn Classifier() module
knn_classifier()

TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond