In [18]:
from dataclasses import dataclass
from typing import List
import numpy as np

import spacy
from spacy.tokens.span import Span

In [None]:
# Shared spaCy pipeline: parses the raw texts and re-parses the lemma strings
# built in process_sentence. `.vector` is read from its output later on.
# NOTE(review): presumably this model ships word vectors — confirm, since the
# similarity scores depend on them.
nlp = spacy.load("en_core_web_md")

In [106]:
# Query document; the search cell further down uses only its first sentence.
# The trailing backslashes continue the string literal across source lines, so
# the paragraphs are concatenated without newline characters.
text_example = 'The large color photograph that greets visitors to a National Archives exhibit celebrating the centennial of women’s suffrage shows a massive crowd filling Pennsylvania Avenue NW for the Women’s March on Jan. 21, 2017, the day after President Trump’s inauguration. \
The 49-by-69-inch photograph is a powerful display. Viewed from one perspective, it shows the 2017 march. Viewed from another angle, it shifts to show a 1913 black-and-white image of a women’s suffrage march also on Pennsylvania Avenue. The display links momentous demonstrations for women’s rights more than a century apart on the same stretch of pavement. \
But a closer look reveals a different story. \
The Archives acknowledged in a statement this week that it made multiple alterations to the photo of the 2017 Women’s March showcased at the museum, blurring signs held by marchers that were critical of Trump. Words on signs that referenced women’s anatomy were also blurred.'

# Corpus to index; each text's position in this list becomes its key in
# texts_index (see the enumerate loop after add_text_to_index).
text_storage = [
    'This year marks the start of a new decade, and the end of a generation of video games. Sony and Microsoft are set to launch new systems during the holidays, marking the beginning of a new phase in gaming. But that’s good news: Traditionally, the games launched at the end of a generation are among that generation’s best.',
    'Apple’s stance on encryption and user privacy is once again under the spotlight as the Federal Bureau of Investigation seeks the company’s help in decrypting two iPhones used by a Royal Saudi Air Force cadet who opened fire at a naval base in Pensacola, Fl. in December, killing three people before he himself was killed by police.',
]

# Global index filled by add_text_to_index:
# key -> {'origin': raw text, 'sentences': [per-sentence dicts], 'vectors': ndarray}
texts_index = {}

def process_sentence(sentence):
    """Reduce a sentence to its content lemmas and parse the result again.

    A token is dropped when its lower-cased text is in
    ``nlp.Defaults.stop_words``, when its ``is_punct`` flag is set, or when
    its lemma is the ``'-PRON-'`` placeholder. The lemmas of the remaining
    tokens are joined with single spaces and passed through ``nlp``.

    Args:
        sentence: Iterable of tokens exposing ``text``, ``is_punct`` and
            ``lemma_``.

    Returns:
        Whatever ``nlp`` returns for the space-joined lemma string.
    """
    def is_noise(token):
        # Any one of the three conditions is enough to discard the token.
        return (
            token.text.lower() in nlp.Defaults.stop_words
            or token.is_punct
            or token.lemma_ == '-PRON-'
        )

    kept_lemmas = [token.lemma_ for token in sentence if not is_noise(token)]
    return nlp(" ".join(kept_lemmas))
        

def add_text_to_index(key, text):
    """Parse ``text`` and store it in the global ``texts_index`` under ``key``.

    The stored entry is a dict with three keys:
      * ``'origin'``    - the text exactly as passed in;
      * ``'sentences'`` - one dict per sentence with the sentence string
        (``'origin'``) and its reduced form from ``process_sentence``
        (``'processed'``);
      * ``'vectors'``   - ``np.array`` built from the ``.vector`` of every
        reduced sentence, in sentence order.

    Any entry already stored under ``key`` is replaced.

    Args:
        key: Identifier under which the text is stored.
        text: Raw text to parse and index.
    """
    parsed = nlp(text)

    sentence_records = []
    sentence_vectors = []
    entry = {
        'origin': text,
        'sentences': sentence_records,
        'vectors': sentence_vectors
    }
    # Register the entry before filling it; the lists are populated in place.
    texts_index[key] = entry

    for span in parsed.sents:
        reduced = process_sentence(span)
        sentence_records.append(dict(origin=str(span), processed=str(reduced)))
        sentence_vectors.append(reduced.vector)

    # Swap the Python list for an array once every sentence has been seen.
    entry['vectors'] = np.array(sentence_vectors)
    
# Index every stored text, using its position in text_storage as the key.
# Note: `i` and `text` stay defined at notebook level after the loop finishes.
for i, text in enumerate(text_storage):
    add_text_to_index(i, text)


In [112]:
def search_similar_texts(sentence_vec, texts_vectors, result=None):
    """Rank texts by how similar their sentences are to a query sentence.

    For every text, the query vector is compared against all of that text's
    sentence vectors with ``cos_matrix_multiplication`` and the median of
    those values becomes the text's score.

    Args:
        sentence_vec: Vector of the query sentence.
        texts_vectors: Mapping of text id to the matrix of that text's
            sentence vectors (one row per sentence).
        result: Optional list to accumulate into. When given, it is extended
            and sorted in place and is also the returned object; when omitted,
            a new list is created.

    Returns:
        List of ``{'text_id': ..., 'score': ...}`` dicts sorted by descending
        score.
    """
    # `result or []` would silently swap a caller-supplied *empty* list for a
    # fresh one (while mutating a non-empty one), so test for None explicitly.
    if result is None:
        result = []
    result.extend(
        {
            'text_id': key,
            'score': np.median(cos_matrix_multiplication(vectors, sentence_vec))
        }
        for key, vectors in texts_vectors.items()
    )
    result.sort(key=lambda item: item['score'], reverse=True)
    return result

def cos_matrix_multiplication(matrix, vector):
    """
    Cosine similarity between every row of ``matrix`` and ``vector``.

    Computed with a single matrix-vector product divided by the product of
    the row norms and the vector norm.

    Args:
        matrix: 2-D array-like with one vector per row.
        vector: 1-D array-like whose length equals the number of columns of
            ``matrix``.

    Returns:
        1-D array with one similarity per row. A pair in which either side
        has zero norm yields 0.0 instead of NaN.
    """
    matrix = np.asarray(matrix)
    vector = np.asarray(vector)
    dotted = matrix.dot(vector)
    matrix_norms = np.linalg.norm(matrix, axis=1)
    vector_norm = np.linalg.norm(vector)
    matrix_vector_norms = np.multiply(matrix_norms, vector_norm)
    # A zero norm means the matching dot product is exactly 0 as well, so
    # dividing by 1 there gives 0.0 rather than 0/0 = NaN (and a
    # RuntimeWarning), which would otherwise poison median() and sorting.
    safe_norms = np.where(
        matrix_vector_norms == 0,
        np.ones_like(matrix_vector_norms),
        matrix_vector_norms,
    )
    neighbors = np.divide(dotted, safe_norms)
    return neighbors


# Query the index with the first sentence of text_example, reduced by
# process_sentence in the same way as the indexed sentences.
doc = nlp(text_example)
processed_sentence = process_sentence(next(doc.sents))
sentence_vec = processed_sentence.vector
# Hand over only the per-text vector arrays, keyed by text id.
search_result = search_similar_texts(sentence_vec, {i: text['vectors'] for i, text in texts_index.items()})
# NOTE(review): the output saved under this cell lists three text_ids, while
# text_storage as shown holds two texts — it looks stale; re-run from a fresh
# kernel to confirm.
print(search_result)



[{'text_id': 2, 'score': 0.47176138}, {'text_id': 1, 'score': 0.38226873}, {'text_id': 0, 'score': 0.33044702}]
