In [4]:
import nltk
from nltk.corpus import stopwords
import numpy as np
from numpy.linalg import norm

In [25]:
class Document:
    """A piece of text that can be vectorized into term counts and compared with other documents.

    Comparison is done with cosine similarity over raw term-count vectors.
    Documents whose similarity exceeds a threshold are recorded in each
    other's ``similar_documents`` dictionary.
    """

    # Lazily-filled cache of the English stop-word set, shared by all instances
    # so the word list is only loaded once instead of on every vectorize() call.
    _stop_words_cache = None

    def __init__(self, input):
        """
        Store a normalised copy of the text.

        Params:
        - input ```str```: Raw document text. (The name shadows the built-in ``input``;
          it is kept so existing keyword callers keep working.)
        """
        # Lower-cased text with periods and commas removed; any other punctuation is kept as-is.
        self.text = input.lower().replace('.', '').replace(',', '')
        # Maps 'searchDocument_<n>' -> {'text': <other document text>, 'cosine_score': <formatted score>}
        self.similar_documents = {}
        # Maps word -> number of occurrences; (re)built by vectorize()
        self.term_vector = {}

    @classmethod
    def _english_stop_words(cls):
        """Return the English stop words as a set, loading them on first use only."""
        if cls._stop_words_cache is None:
            cls._stop_words_cache = frozenset(stopwords.words('english'))
        return cls._stop_words_cache

    def vectorize(self, without_stopwords = True) -> dict:
        """
        Build the term-count vector of the document and store it in ``self.term_vector``.

        Params:
        - without_stopwords ```bool```: If True, English stop words are dropped before counting

        Returns:
        - ```dict```: word -> number of occurrences, in order of first appearance in the text
        """
        words = self.text.split()
        if without_stopwords:
            stop_words = self._english_stop_words()
            words = [w for w in words if w not in stop_words]

        term_vector = {}
        for word in words:
            term_vector[word] = term_vector.get(word, 0) + 1

        self.term_vector = term_vector
        return self.term_vector

    def compareDocuments(self, searchObject, without_stopwords = True, similarity_threshold = 0.6):
        '''
        Compare the term vectors of two documents and print their cosine similarity,
        the two aligned count vectors and their Euclidean distance.

        If the cosine similarity is greater than ``similarity_threshold``, each document
        is registered in the other's ``similar_documents``.

        Params:
        - searchObject ```Document```: Document to compare this one against
        - without_stopwords ```bool```: Determines whether or not stop words are removed before counting
        - similarity_threshold ```float```: Cosine score above which the documents count as similar

        Returns:
        - ```None```: Results are printed, not returned
        '''

        # Vectorizing is a prerequisite; doing it here also lets the caller choose
        # whether stop words are removed, consistently for both documents.
        self.vectorize(without_stopwords)
        searchObject.vectorize(without_stopwords)

        # Shared vocabulary: this document's words first (in insertion order), followed by
        # the words that only occur in the search document. Both count vectors are laid out
        # over this vocabulary, so equal positions always refer to the same word.
        vocabulary = list(self.term_vector)
        vocabulary.extend(word for word in searchObject.term_vector if word not in self.term_vector)

        mainDoc = np.array([self.term_vector.get(word, 0) for word in vocabulary])
        searcDoc = np.array([searchObject.term_vector.get(word, 0) for word in vocabulary])

        # Cosine similarity = (mainDoc . searcDoc) / (||mainDoc|| * ||searcDoc||), where the
        # length ||v|| is the square root of the sum of the squared components of v.
        denominator = norm(mainDoc) * norm(searcDoc)
        # A document without any counted words has length zero; treat it as having no
        # similarity instead of dividing by zero (which would yield nan and a warning).
        cosine = float(np.dot(mainDoc, searcDoc) / denominator) if denominator else 0.0

        if cosine > similarity_threshold:
            self.add_similar_document(searchObject, cosine)
            searchObject.add_similar_document(self, cosine)

        print(f"Cosine Similarity: {cosine:.1%}")
        print(mainDoc, searcDoc)
        print(f'Euclidean distance: {np.linalg.norm(mainDoc-searcDoc)}')

    def add_similar_document(self, similarDoc: "Document", cosine_score: float) -> None:
        """
        Record ``similarDoc`` as similar to this document, unless its text is already recorded.

        Params:
        - similarDoc ```Document```: The document found to be similar
        - cosine_score ```float```: Cosine similarity between the two documents; stored as a
          string formatted to three significant digits
        """
        # Check every stored entry (not just the first one) so the same text is never added twice.
        already_recorded = any(entry['text'] == similarDoc.text
                               for entry in self.similar_documents.values())
        if already_recorded:
            return

        # Entries are never removed, so the current size + 1 is always an unused key number.
        key = f'searchDocument_{len(self.similar_documents)+1}'
        self.similar_documents[key] = {'text': similarDoc.text,
                                       'cosine_score': f'{cosine_score:0.3}'}

In [26]:
# Two sample documents with largely overlapping vocabulary, used as input for Document.compareDocuments.
doc_1 = Document("Is this not the most beautiful day you have ever seen it is wonderful" )
doc_2 = Document("What a wonderful wonderful day it is today. I think this is the most beautiful i have ever seen it")

In [70]:
# Third sample document; it is not compared against anything in the cells visible here.
doc_3 = Document("Wonderful wonderful beautiful most day are too nice for today")

In [27]:
# Compare doc_1 with doc_2 (stop words removed by default); prints the cosine similarity,
# the two aligned term-count vectors and their Euclidean distance.
doc_1.compareDocuments(doc_2)

Cosine Similarity: 84.9%
[1 1 1 1 1 0 0] [1 1 1 1 2 1 1]
Euclidean distance: 1.7320508075688772
