In [204]:
import re
import numpy as np
import pymupdf #library for extracting text from PDF

# Step 1:
Implement word-based or subwordy tokenization.
- remove special characters and tags from the documents
- identify and justify the list of stop-words and remove them
__
1. `pymupdf.page.get_text(flags=0)` `flags=0` were used to deal with problems of the identification "fi", "ff" subwords  
2. assuming all texts are in the directory as the file
3. stop words were taken from https://countwordsfree.com/stopwords and match with the existed words in the texts and adding some more which appeared due to math formulas in the text
4. all justification to regex in the code block 


In [205]:
doc1 = pymupdf.open('text1.pdf')
text1 = ''
for page in doc1:
    text1 += page.get_text(flags=0)

doc2 = pymupdf.open('text2.pdf')
text2 = ''
for page in doc2:
    text2 += page.get_text(flags=0)

STOP_WORDS = ['page', 'obtain', 'further', 'four', 'are', 'when', 'that', 'how', 'due', 'been', 'indicate', 
              'up', 'over', 'least', 'therefore', 'also', 'novel', 'became', 'to', 'after', 'third', 'allow', 
              'shows', 'useful', 'while', 'way', 'there', 'detail', 'or', 'inner', 'keys', 'amount', 'and', 
              'where', 'usually', 'here', 'related', 'present', 'no', 'results', 'significantly', 'will', 'just', 
              'recent', 'seen', 'become', 'example', 'corresponding', 'into', 'significant', 'took', 'thereafter', 
              'different', 'using', 'despite', 'backward', 'former', 'appropriate', 'thus', 'information', 'within', 
              'it', 'able', 'but', 'less', 'containing', 'have', 'hence', 'noted', 'recently', 'few', 'those', 'having', 
              'was', 'end', 'following', 'let', 'who', 'them', 'states', 'latter', 'little', 'an', 'possible', 'clearly', 
              'quickly', 'appear', 'empty', 'help', 'gives', 'on', 'being', 'forward', 'side', 'important', 'since', 
              'value', 'follows', 'overall', 'followed', 'before', 'effect', 'especially', 'particular', 'rd', 'in', 
              'these', 'lower', 'known', 'part', 'successfully', 'during', 'per', 'although', 'itself', 'our', 'allows', 
              'last', 'research', 'several', 'more', 'very', 'this', 'ones', 'whereas', 'five', 'did', 'has', 'approximately', 
              'again', 'than', 'so', 'eight', 'of', 'around', 'uses', 'otherwise', 'through', 'along', 'respectively', 'million', 
              'describe', 'resulted', 'another', 'similar', 'some', 'because', 'low', 'full', 'all', 'by', 'et', 'al', 'contains', 
              'described', 'does', 'section', 'could', 'two', 'entirely', 'mean', 'throughout', 'available', 'well', 
              'three', 'makes', 'line', 'whole', 'not', 'can', 'whose', 'get', 'find', 'doing', 'why', 'amongst', 'each', 
              'affecting', 'given', 'used', 'opposite', 'already', 'seems', 'de', 'tried', 'cannot', 'need', 'bottom', 
              'as', 'relatively', 'thanks', 'instead', 'becomes', 'previously', 'were', 'consider', 'off', 'with', 
              'much', 'we', 'is', 'at', 'come', 'give', 'obtained', 'above', 'most', 'same', 'both', 'see', 'too', 
              'similarly', 'once', 'made', 'considering', 'would', 'various', 'us', 'then', 'ours', 'call', 'best',
              'out', 'only', 'ca', 'if', 'nearly', 'right', 'somewhat', 'unlike', 'any', 'what', 'many', 'they',
              'according', 'yet', 'must', 'first', 'their', 'other', 'together', 'however', 'twelve', 'except', 'slightly',
              'between', 'needs', 'about', 'added', 'take', 'better', 'every', 'means', 'be', 'importance', 'course', 'even',
              'resulting', 'next', 'words', 'across', 'second', 'inside', 'use', 'often', 'which', 'for', 'should', 'such', 'may',
              'show', 'new', 'do', 'one', 'back', 'from', 'shown', 'sometimes', 'make', 'run', 'found', 'state', 'its', 'the',
              'without', 'furthermore', 'i.e.', 'e.g.', 'wt', 'wo', 'wi', 'vw', 'ch', 'pn', 'wj', 'hs', 'ho', 'vwi', 'ewi', 'san', 
              'chi', 'nam', 'st','ht','xn','zn','ym','dk','dv','qw','kw','xw','pe','xi','zi','rl','ls','el', '...', 'qkt', 'ffn', 'dff',
              'pos', 'moe', 'ppl', 'zhu', 'nal']

def word_based_tokenization(text:str):
    text = text.lower()
    text = re.sub(r"arxiv:.*\n", repl='', string=text) #deleting watermark of arxiv
    text = re.sub(r'^.*?\babstract\b', repl='abstract', string=text, flags=re.DOTALL) #clear all text before abstract due to irrelevance
    text = re.sub(r'\breferences\b\n.*', repl='references', string=text, flags=re.DOTALL) #clear all text in references due to irrelevance
    
    
    text = re.sub(r"\(\w+\)", repl='', string=text) #remove from the text words in the parenthesis (because it's  very probable that it will be math symbols)

    text = re.sub(r"(\d+\.)|(\.\d+)", repl='', string=text) #removing the dot which is next to digits
    text = re.sub(r"\.\n", repl='. ', string=text) #replacing the "." at the end of the para with ". "
    text = re.sub(r"\. ", repl=' ', string=text) #replacing the ". " at the end of the sentence with " "
    
    text = re.sub(r"(\d+-)|(-\d+)", repl='', string=text) #removing the dash which is next to digits
    text = re.sub(r"( +-)|(- +)", repl='', string=text) #removing the dash which is next to space
    text = re.sub(r"(skip-\ngram)", repl='skip-gram', string=text) #dealing with the names transposition
    text = re.sub(r"(high-\nquality)", repl='high-quality', string=text) #dealing with the names transposition
    text = re.sub(r"([-][\n])", repl = '', string=text) #dealing with basic transposition
    
    pattern = r"([a-z\-@\.]+)" #extracting all words from the preprocessed text
    words = re.findall(pattern, text)
    
    tokens = [word for word in words if len(word) > 1 and word  not in STOP_WORDS] #excluding words with length = 1
    
    return tokens

# Step 2:
Implementing the Bag-of-Words model

In [None]:
import pandas as pd

def bag_of_words(corpus:list):
    words = dict()
    min_freq = 2
    
    #creating words dictionary of all words found in the corpus 
    #with their frequencies according to text where they were found
    #ex: {"word":[total, [in_text1, in_text2, ...]]}
    for text in range(len(corpus)):
        for word in corpus[text]:
            if words.get(word) == None:
                maps = [0 for  _ in range(len(corpus))]
                maps[text] += 1
                words[word] = [1, maps]
            else:
                words[word][0] += 1
                words[word][1][text] += 1
    
    frequencies = []
    vocabulary = []
    
    
    # deleting all words that have low frequency
    for word in words.keys():
        if words[word][0] >= min_freq:
            frequencies.append(words[word][1])
            vocabulary.append(word)
            
    bag_of_words_matrix = np.zeros((len(corpus), len(frequencies)), dtype=int)
    
    for freq_i in range(len(frequencies)):
        for text_i in range(len(corpus)):
            bag_of_words_matrix[text_i, freq_i] = frequencies[freq_i][text_i]
    
    return vocabulary, bag_of_words_matrix

function for calculating cosine simmilarity

In [207]:
def cos_sim(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    
    if norm1 == 0 or norm2 == 0:
        return 0.0
    
    return dot_product / (norm1 * norm2)

# Step 3:
Generate vectors for both documents and calculate cosine similarity between them

In [208]:
text1_tokens = word_based_tokenization(text1)
text2_tokens = word_based_tokenization(text2)
corpus = [text1_tokens, text2_tokens]
vocabulary, bag_of_words_texts = bag_of_words(corpus)
vec_text1 = bag_of_words_texts[0]
vec_text2 = bag_of_words_texts[1]

print(cos_sim(vec_text1, vec_text2))

{'abstract': [2, [1, 1]], 'introduced': [7, [7, 0]], 'continuous': [3, [2, 1]], 'skip-gram': [33, [33, 0]], 'model': [75, [33, 42]], 'efficient': [6, [5, 1]], 'method': [7, [7, 0]], 'learning': [14, [9, 5]], 'high-quality': [3, [3, 0]], 'distributed': [3, [3, 0]], 'vector': [23, [23, 0]], 'representations': [45, [37, 8]], 'capture': [1, [1, 0]], 'large': [14, [9, 5]], 'number': [15, [5, 10]], 'precise': [3, [3, 0]], 'syntactic': [4, [3, 1]], 'semantic': [4, [3, 1]], 'word': [44, [44, 0]], 'relationships': [2, [2, 0]], 'paper': [7, [6, 1]], 'extensions': [2, [2, 0]], 'improve': [4, [3, 1]], 'quality': [12, [7, 5]], 'vectors': [28, [26, 2]], 'training': [67, [42, 25]], 'speed': [2, [2, 0]], 'subsampling': [19, [19, 0]], 'frequent': [15, [15, 0]], 'speedup': [3, [3, 0]], 'learn': [9, [4, 5]], 'regular': [1, [1, 0]], 'simple': [14, [12, 2]], 'alternative': [2, [2, 0]], 'hierarchical': [16, [16, 0]], 'softmax': [28, [22, 6]], 'called': [2, [1, 1]], 'negative': [13, [13, 0]], 'sampling': [11

# Step 4:
Interpret measured cosine similarity

In [209]:
for i in range(len(bag_of_words_texts[0])):
    if (vec_text1[i] == 0 or vec_text2[i] == 0):
        print(f"{vocabulary[i]}: {vec_text1[i]}, {vec_text2[i]} - decreased simmilarity")
    elif (abs(vec_text1[i] - vec_text2[i]) >= 10):
        print(f"{vocabulary[i]}: {vec_text1[i]}, {vec_text2[i]} - increased simmilarity a little")
    else:
        print(f"{vocabulary[i]}: {vec_text1[i]}, {vec_text2[i]} - increased simmilarity")

abstract: 1, 1 - increased simmilarity
introduced: 7, 0 - decreased simmilarity
continuous: 2, 1 - increased simmilarity
skip-gram: 33, 0 - decreased simmilarity
model: 33, 42 - increased simmilarity
efficient: 5, 1 - increased simmilarity
method: 7, 0 - decreased simmilarity
learning: 9, 5 - increased simmilarity
high-quality: 3, 0 - decreased simmilarity
distributed: 3, 0 - decreased simmilarity
vector: 23, 0 - decreased simmilarity
representations: 37, 8 - increased simmilarity a little
large: 9, 5 - increased simmilarity
number: 5, 10 - increased simmilarity
precise: 3, 0 - decreased simmilarity
syntactic: 3, 1 - increased simmilarity
semantic: 3, 1 - increased simmilarity
word: 44, 0 - decreased simmilarity
relationships: 2, 0 - decreased simmilarity
paper: 6, 1 - increased simmilarity
extensions: 2, 0 - decreased simmilarity
improve: 3, 1 - increased simmilarity
quality: 7, 5 - increased simmilarity
vectors: 26, 2 - increased simmilarity a little
training: 42, 25 - increased simm