# KNN Classification

In [1]:
import math
import os
from collections import Counter

import numpy as np
import pandas as pd
from nltk.corpus import stopwords

In [2]:
# English stop words from NLTK, held in a set so the vocabulary can be
# filtered with a set difference.
stopWords = {*stopwords.words('english')}

In [3]:
class KNN:
    """k-nearest-neighbour text classifier over a TF-IDF vector space.

    ``buildVS`` loads up to ``DOCS_PER_CLASS`` documents from each class
    directory and builds a log-scaled TF-IDF matrix; ``queryVector`` ranks
    the documents by cosine similarity to a query and lets the ``k`` most
    similar ones vote for their class.
    """

    # (label, directory) pairs, in the order their documents are loaded.
    # NOTE: the label 'charted accountant' is kept verbatim so that the
    # printed output does not change.
    CLASSES = (
        ('engineering', 'classifications/engineering'),
        ('medicine', 'classifications/medicine'),
        ('charted accountant', 'classifications/ca'),
    )
    # Maximum number of documents loaded per class.
    DOCS_PER_CLASS = 8
    # Punctuation removed from both corpus documents and queries.
    _STRIP_PUNCTUATION = str.maketrans('', '', ',.?')

    def __init__(self):
        self.tf_idf = pd.DataFrame()
        self.terms = []
        self.docs = []
        self.idf = []
        # Class index (into CLASSES) of every loaded document, row-aligned
        # with ``tf_idf``.
        self.docClasses = []

    def _normalize(self, text):
        """Lower-case *text* and drop the punctuation ignored by the model."""
        return text.lower().translate(self._STRIP_PUNCTUATION)

    def _classOf(self, docIdx):
        """Return the class index of document *docIdx*, or None if unknown.

        Uses the labels recorded by ``buildVS`` when they are row-aligned
        with ``tf_idf``; otherwise falls back to the fixed layout of
        ``DOCS_PER_CLASS`` consecutive documents per class.
        """
        if len(self.docClasses) == len(self.tf_idf):
            return self.docClasses[docIdx]
        classIdx = docIdx // self.DOCS_PER_CLASS
        return classIdx if classIdx < len(self.CLASSES) else None

    def buildVS(self):
        """Load the training documents and build the TF-IDF vector space.

        Populates ``docs``, ``docClasses``, ``terms`` (sorted vocabulary
        without stop words), ``idf`` and ``tf_idf`` (rows ``Doc0..DocN``,
        one column per term). State is rebuilt from scratch, so calling
        this method again does not duplicate documents.

        Term frequency is counted on whole tokens, weighted as
        ``1 + log10(count)`` and scaled by ``log10(N / document_frequency)``.
        """
        docs = []
        docClasses = []
        for classIdx, (_, folder) in enumerate(self.CLASSES):
            # Sort so the same files are chosen on every run; os.listdir
            # returns entries in arbitrary order.
            for name in sorted(os.listdir(folder))[:self.DOCS_PER_CLASS]:
                with open(os.path.join(folder, name), "r") as f:
                    docs.append(self._normalize(f.read()))
                docClasses.append(classIdx)

        # Count whole tokens; substring counting would inflate frequencies
        # (e.g. "art" would also match inside "part").
        tokenCounts = [Counter(text.split()) for text in docs]
        vocabulary = set().union(*tokenCounts)
        terms = sorted(vocabulary.difference(stopWords))
        column = {term: j for j, term in enumerate(terms)}

        weights = np.zeros((len(docs), len(terms)), dtype=float)
        for i, counts in enumerate(tokenCounts):
            for term, count in counts.items():
                j = column.get(term)
                if j is not None:  # stop words have no column
                    weights[i, j] = 1 + math.log10(count)

        # Every term comes from at least one document, so docFreq >= 1.
        docFreq = np.count_nonzero(weights, axis=0)
        idf = np.log10(len(docs) / docFreq)

        self.docs = docs
        self.docClasses = docClasses
        self.terms = terms
        self.idf = idf.tolist()
        self.tf_idf = pd.DataFrame(
            weights * idf,
            index=[f'Doc{i}' for i in range(len(docs))],
            columns=terms,
        )

    def cosineSimilarity(self, vec1, vec2):
        """Return the cosine similarity of two vectors, or 0 if either is all-zero."""
        normVec1 = np.linalg.norm(vec1)
        normVec2 = np.linalg.norm(vec2)
        if normVec1 == 0 or normVec2 == 0:
            return 0
        return vec1.dot(vec2) / (normVec1 * normVec2)

    def queryVector(self, query, k):
        """Classify *query* by majority vote of its *k* most similar documents.

        The query is normalised like the corpus and turned into a binary
        term-presence vector. ``k`` is capped at the number of documents.

        Prints ``It is classified into: <label>`` and returns the label.

        Raises:
            ValueError: if ``k`` is smaller than 1.
            RuntimeError: if the vector space holds no documents.
        """
        if k < 1:
            raise ValueError("k must be a positive integer")

        docMatrix = self.tf_idf.to_numpy(dtype=float)
        if len(docMatrix) == 0:
            raise RuntimeError("vector space is empty; call buildVS() first")

        classes = [label for label, _ in self.CLASSES]
        column = {term: j for j, term in enumerate(self.terms)}

        queryVec = np.zeros(len(self.terms))
        for token in self._normalize(query).split():
            j = column.get(token)
            if j is not None:
                queryVec[j] = 1

        cosineSim = [self.cosineSimilarity(queryVec, row) for row in docMatrix]

        # Document indices ordered from most to least similar.
        ranks = np.array(cosineSim).argsort()[::-1]
        votes = np.zeros(len(classes), dtype=int)
        for docIdx in ranks[:min(k, len(ranks))]:
            classIdx = self._classOf(int(docIdx))
            if classIdx is not None:
                votes[classIdx] += 1

        label = classes[votes.argsort()[::-1][0]]
        print(f"It is classified into: {label}")
        return label

In [4]:
# Create the classifier; its vector space is empty until buildVS() is called.
knn = KNN()

In [6]:
# Read the training documents under classifications/ and build the TF-IDF matrix.
knn.buildVS()

In [7]:
# Read the document to classify and the neighbour count from the user, and
# classify exactly that input rather than a fixed sample query.
test = input("Enter test document: ")
k = int(input("Enter value of k: "))
knn.queryVector(test, k)

Enter test document:  Chartered accountants are also involved in insolvency and bankruptcy They help businesses and individuals to reorganize their finances or liquidate their assets This helps to protect creditors and other stakeholders
Enter value of k:  7


It is classified into: charted accountant
