In [None]:
import math
import sys
import string

In [None]:
def read_file(htmldata): 
      
    try:
        with open(htmldata, 'r',encoding='utf-8') as f:
            data = f.read()
        return data
      
    except IOError:
        print("Error opening or reading input file: ", htmldata)
        sys.exit()
  
# splitting the text lines into words
# translation table is a global variable
# mapping upper case to lower case and
# punctuation to spaces
translation_table = str.maketrans(string.punctuation+string.ascii_uppercase,
                                     " "*len(string.punctuation)+string.ascii_lowercase)
       
# returns a list of the words
# in the file
def get_words_from_line_list(text): 
      
    text = text.translate(translation_table)
    word_list = text.split()
      
    return word_list
  
  
# counts frequency of each word
# returns a dictionary which maps
# the words to  their frequency.
def count_frequency(word_list): 
      
    D = {}
      
    for new_word in word_list:
          
        if new_word in D:
            D[new_word] = D[new_word] + 1
              
        else:
            D[new_word] = 1
              
    return D
  
# returns dictionary of (word, frequency)
# pairs from the previous dictionary.
def word_frequencies_for_file(filename): 
      
    line_list = read_file(filename)
    word_list = get_words_from_line_list(line_list)
    freq_mapping = count_frequency(word_list)
  
    print("File", filename, ":", )
    print(len(line_list), "lines, ", )
    print(len(word_list), "words, ", )
    print(len(freq_mapping), "distinct words")
  
    return freq_mapping
  
  
# returns the dot product of two documents
def dotProduct(D1, D2): 
    Sum = 0.0
      
    for key in D1:
          
        if key in D2:
            Sum += (D1[key] * D2[key])
              
    return Sum
  
# returns the angle in radians 
# between document vectors
def vector_angle(D1, D2): 
    numerator = dotProduct(D1, D2)
    denominator = math.sqrt(dotProduct(D1, D1)*dotProduct(D2, D2))
    #numerator = math.sqrt(dotProduct(D1, D1)*dotProduct(D2, D2))
    #denominator = dotProduct(D1, D2) 
    return math.acos(numerator / denominator )
    #return math.acos(denominator / numerator )
      
def documentSimilarity(htmldata, htmldata2):
      
   # filename_1 = sys.argv[1]
   # filename_2 = sys.argv[2]
    sorted_word_list_1 = word_frequencies_for_file(htmldata)
    sorted_word_list_2 = word_frequencies_for_file(htmldata2)
    distance = vector_angle(sorted_word_list_1, sorted_word_list_2)
      
    print("The similarity between the documents is: % 0.6f "% distance)
# Driver code
documentSimilarity('htmldata.txt', 'htmldata2.txt')                                  

In [20]:
import math
import re
from collections import Counter

WORD = re.compile(r"\w+")
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator


def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

def read_file(htmldata): 
    with open(htmldata, 'r',encoding='utf-8') as f:
            data = f.read()
            return data

def documentSimilarity(htmldata, htmldata2):
    text1= htmldata
    text2 = htmldata2
    vector1 = text_to_vector(text1)
    vector2 = text_to_vector(text2)
    cosine = get_cosine(vector1, vector2)
    print("Similarity Score: %",cosine*100)
mydata = read_file("htmldata.txt")
mydata2 = read_file("htmldata2.txt")
documentSimilarity(mydata,mydata2)    

Similarity Score: % 100.0


In [1]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
model = SentenceTransformer('stsb-roberta-large')

In [2]:
def read_file(htmldata): 
    with open(htmldata, 'r',encoding='utf-8') as f:
            data = f.read()
            return data
def documentSimilarity(htmldata, htmldata2):       
    file1 = htmldata
    file2 = htmldata2
# encode sentences to get their embeddings
    embedding1 = model.encode(file1, convert_to_tensor=True)
    embedding2 = model.encode(file2, convert_to_tensor=True)
# compute similarity scores of two embeddings
    x = util.pytorch_cos_sim(embedding1, embedding2)
    print("Similarity Score: %", x.item())
mydata = read_file("htmldata.txt")
mydata2 = read_file("htmldata2.txt")
documentSimilarity(mydata,mydata2)    

Similarity Score: % 0.5538814067840576
