In [1]:
!pip install --quiet mrjob==0.7.4

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import time

In [4]:
%%file MRKNNimplementation.py
from mrjob.job import MRJob
import numpy as np
from heapq import heappush, heappop, _heapify_max
import re

TEST_SET_PATH = "/content/gdrive/MyDrive/BigData/Dataset_Project/testing_data_1.csv"

class KNNJob(MRJob):
    def mapper_init(self):
        self.k_neighbors = 5
        with open(TEST_SET_PATH, 'r') as file:
            test_set_lines = file.readlines()

        test_data = []
        for idx, line in enumerate(test_set_lines):
            if idx > 0:  # Skipping the header row
                features = list(map(int, line.strip().split(',')))
                test_data.append(features)

        self.test_data = test_data

    def mapper(self, _, training_line):
        training_record = training_line.strip().split(',')
        if training_record[10] != "hand":  # Skipping the header row
            training_features = np.array(list(map(int, training_record[:10])))
            training_label = int(training_record[10])

            for test_record in self.test_data:
                test_id = test_record[0]
                test_features = np.array(test_record[1:])
                distance = np.linalg.norm(training_features - test_features)
                yield test_id, [distance, training_label]

    def reducer_init(self):
        self.k_neighbors = 5
        self.vote_counts = [0] * 10

    def reducer(self, test_id, neighbors_data):
        heap = []
        _heapify_max(heap)

        # Pushing all the neighbors distances and their corresponding labels to the heap
        for distance, label in neighbors_data:
            heappush(heap, [distance, label])
            if len(heap) > self.k_neighbors:
                heappop(heap)

        self.vote_counts = [0] * 10

        while heap:
            _, label = heappop(heap)
            self.vote_counts[label] += 1

        # Finding the class with the maximum count
        predicted_label = max(range(len(self.vote_counts)), key=lambda x: self.vote_counts[x])
        yield test_id, predicted_label

if __name__ == '__main__':
    KNNJob.run()

Overwriting MRKNNimplementation.py


In [5]:
knn_start_time = time.time()
!python MRKNNimplementation.py "/content/gdrive/MyDrive/BigData/Dataset_Project/training_data_1.csv"

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/MRKNNimplementation.root.20241118.130640.383063
Running step 1 of 1...
job output is in /tmp/MRKNNimplementation.root.20241118.130640.383063/output
Streaming final output from /tmp/MRKNNimplementation.root.20241118.130640.383063/output...
540	1
541	1
542	1
543	1
544	1
545	1
546	3
547	1
548	1
549	1
55	1
550	1
551	1
552	1
553	1
554	1
555	1
556	1
557	0
558	1
559	1
56	2
560	1
561	1
562	1
563	1
564	1
565	1
566	1
567	1
568	0
569	1
57	1
570	0
571	1
572	0
573	1
574	1
575	0
576	1
577	1
578	1
579	1
58	1
580	1
581	1
582	1
583	1
584	1
585	1
586	1
587	1
588	1
589	1
59	1
590	1
591	1
592	1
593	1
594	1
595	1
596	1
597	1
598	1
599	0
6	1
60	1
600	0
601	1
602	1
603	1
604	1
605	3
606	1
607	1
608	1
609	1
61	1
610	1
611	1
612	1
613	1
614	1
615	1
616	1
617	1
618	1
619	1
62	1
620	1
621	1
622	3
623	1
624	1
625	1
626	0
627	1
628	0
629	1
63	3
630	1
631	1
632	1
633	1
634	1
635	1
636	1
637	2
638

In [6]:
print(f"kNN computation completed in {time.time() - knn_start_time:.2f} seconds.")

kNN computation completed in 151.79 seconds.
