### 1. Extract subject from document

#### The first approach to identify the subject of a document is to find the intersection between nouns and named entites.
#### 1. First I will find all nouns from text using pos tagger from nltk library and select top 10 of them.
#### 2. Then, I will find all named entities using ner from nltk library and again select top 10 entites.
#### 3. Finally, I will find their intersection and return the enties which are in the list of top nouns
#### This would give me potential subject of the document

In [1]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
from nltk.corpus import stopwords
stop = stopwords.words('english')
from collections import Counter

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Qquentin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Qquentin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Qquentin\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Qquentin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Qquentin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
def clean_tokenize_data(text):
    text = re.sub('[^A-Za-z -]+', ' ', text)
    text = ' '.join(text.split())
    text = ' '.join([i for i in text.split() if i not in stop])
    sents = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]
    
    return sents

In [3]:
def get_common_nouns(text):
    # get common words
    words = []
    for sent in text:
        for word in sent:
            words.append(word)
    word_counter = Counter(words)
    common_words = word_counter.most_common(10)
    
    # get common nouns
    common_nouns = []
    for w, c in common_words:
        if nltk.pos_tag([w])[0][1] in ['NN', 'NNS', 'NNP', 'NNPS']:
            common_nouns.append(w.lower())
            
    return common_nouns

In [4]:
def get_common_named_entities(sents):
    # extract all named entities
    named_entities = []
    pos_sents = [nltk.pos_tag(sent) for sent in sents]
    for pos_sent in pos_sents:
        for chunk in nltk.ne_chunk(pos_sent):
            if type(chunk) == nltk.tree.Tree:
                named_entities.append(' '.join([c[0] for c in chunk]).lower())
                
    # get only the most common entities
    entity_counter = Counter(named_entities)
    common_entities = entity_counter.most_common(10)
    common_word_entities = [word for word, counts in common_entities]
    
    return common_word_entities

In [5]:
def get_document_subject(document):
    sents = clean_tokenize_data(document)
    common_nouns = get_common_nouns(sents)
    common_named_entities = get_common_named_entities(sents)
    print('common nouns from text: ', common_nouns)
    print('common named entities from text: ', common_named_entities)
    subject = []
    for named_entity in common_named_entities:
        if named_entity in common_nouns:
            subject.append(named_entity)
    return subject 

In [6]:
with open('document.txt', 'r', encoding="utf8") as f:
    document = f.read()

In [9]:
get_document_subject(document)

common nouns from text:  ['jots', 'time', 'tool', 'task', 'projects', 'content', 'jot', 'everything']
common named entities from text:  ['jots', 'jot jot', 'jeremy apple park jots', 'sdk']


['jots']

In [13]:
from min_dalle import MinDalle

Dalle_model = MinDalle(is_mega=False, is_verbose=False)

In [None]:
image_stream = Dalle_model.generate_image_stream(
    text='jots time tool task projects content jot everything',
    seed=-1,
    grid_size=3,
    progressive_outputs=True,
    is_seamless=False,
    temperature=1,
    top_k=256,
    supercondition_factor=16,
    is_verbose=False
)

for image in image_stream:
    display(image)

#### The second approach is to use language models to extract not just words but key phrases from text (e.g. https://arxiv.org/abs/2112.08547)
#### I can load pretrained model which extracts key words, but of course it is possible to implement a custom model myself

In [59]:
from transformers import TokenClassificationPipeline, AutoModelForTokenClassification, AutoTokenizer
from transformers.pipelines import AggregationStrategy
import numpy as np

In [60]:
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    def __init__(self, model, *args, **kwargs):
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs)

    def postprocess(self, model_outputs):
        results = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        return np.unique([result.get("word").strip() for result in results])

In [10]:
model_name = "ml6team/keyphrase-extraction-kbir-inspec"
extractor = KeyphraseExtractionPipeline(model=model_name)

Downloading config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [61]:
extractor(document)

array(['C', 'Interfax news agency', 'Kamchatka peninsula',
       'Klyuchevskaya Sopka volcano', 'Kyrgyzstan', 'Russia',
       'Temperatures', 'Tian Shan mountains',
       'Unesco world heritage site', 'avalanche',
       'civil defence authority', 'freezing winds', 'gale force winds',
       'kaya Sopka', 'satellite phone'], dtype='<U27')

### 2. Find similar text from document

#### For this problem we need embeddings. We can either use word vectors and average them out or use sentence embeddings
#### I decided to use sentence embeddings from HuggingFace which uses SentenceBERT (https://arxiv.org/abs/1908.10084)
#### Then, I will find the closest sentence in the document for the given query

In [62]:
from sentence_transformers import SentenceTransformer, util

In [63]:
def clean_tokenize_data(text):
    text = re.sub('[^A-Za-z -.]+', ' ', text)
    text = ' '.join(text.split())
    text = ' '.join([i for i in text.split() if i not in stop])
    sents = nltk.sent_tokenize(text)
    sents = [' '.join(nltk.word_tokenize(sent)) for sent in sents]
    
    return sents

In [64]:
def get_similarity_scores(model, sents):
    query = sents[1]
    score_list = []
    query_embedding = model.encode(query, convert_to_tensor=True)
    for idx in range(len(sents)):
        embedding = model.encode(sents[idx], convert_to_tensor=True)  
        score = util.pytorch_cos_sim(query_embedding, embedding).item()
        #print(sents[idx], util.pytorch_cos_sim(query_embedding, embeddings_list[idx]).item())
        score_list.append((idx, score))
        
    sorted_scores = sorted(score_list, key=lambda x:(-x[1], x[0]))
    return sorted_scores

In [66]:
with open('document.txt', 'r', encoding="utf8") as f:
    document = f.read()

In [67]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
top = 5

sents = clean_tokenize_data(document)
sorted_scores = get_similarity_scores(model, sents)

print('Top closest sentences to the query "%s" are the followings:' %sents[1])
print('>>>>>>')
for idx, score in sorted_scores[1:top]:
    print('"%s" With the score: ' %sents[idx], score)

Top closest sentences to the query "Six people initially reported killed , according officials Kamchatka peninsula , six believed stranded party , included two guides ." are the followings:
>>>>>>
"Eight climbers died attempting scale Klyuchevskaya Sopka volcano Russia far east , according local officials , freezing winds halted rescue attempt ." With the score:  0.4609769582748413
"But Interfax news agency said two died , quoting Roman Vasilevsky , Kamchatka territory deputy prime minister ." With the score:  0.4139235019683838
"The party set Tuesday climb mountain Eurasia highest active volcano ran trouble Saturday group fell death almost , m , authorities said ." With the score:  0.3748098313808441
"One person thought broken leg , added ." With the score:  0.3098146915435791
