In [1]:
! pip install spacy
! python -m spacy download en_core_web_sm
! pip install top2vec[sentence_transformers]
! pip install top2vec[sentence_encoders]
! pip install tensorflow tensorflow_hub tensorflow_text
! pip install colored

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting top2vec[sentence_transformers]
  Downloading top2vec-1.0.34-py3-none-any.whl (27 kB)
Collecting umap-learn>=0.5.1 (from top2vec[sentence_transformers])
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hdbscan>=0.8.27 (from top2vec[sentence_transformers])
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m20.3 MB

In [2]:
import pandas as pd
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
import numpy as np
from top2vec import Top2Vec
from colored import Fore, Back, Style

In [3]:
df = pd.read_csv('/content/biology_new.csv')
df = df[['text', 'title', 'date', 'link', 'cleaned_text', 'fully_cleaned_text']]
df.head()

Unnamed: 0,text,title,date,link,cleaned_text,fully_cleaned_text
0,FOR THE past four billion years or so the only...,The promise and perils of synthetic biology,Apr 4th 2019,https://www.economist.com/leaders/2019/04/04/t...,for the past four billion years or so the only...,past four billion years way life earth produce...
1,IN A former leatherworks just off Euston Road ...,Will artificial intelligence help to crack bio...,Jan 7th 2017,https://www.economist.com/science-and-technolo...,in a former leatherworks just off euston road ...,former leatherworks euston road london hopeful...
2,“How many cells are there in a human being?” I...,The idea of “holobionts” represents a paradigm...,Jun 14th 2023,https://www.economist.com/science-and-technolo...,how many cells are there in a human being it...,many cells human sounds like question nerdy pu...
3,LIVING creatures are jolly useful. Farmers rea...,The remarkable promise of cell-free biology,May 4th 2017,https://www.economist.com/leaders/2017/05/04/t...,living creatures are jolly useful farmers rear...,living creatures jolly useful farmers rear ani...
4,"A broken brain, hidden inside a skull, is hard...",Better brain biology will deliver better medic...,Sep 21st 2022,https://www.economist.com/technology-quarterly...,a broken brain hidden inside a skull is harder...,broken brain hidden inside skull harder diagno...


***quotations:***
1. "Some are born great. Some achieve greatness. Some have greatness thrust upon them." (W. Shakespeare, "Twelfth Night" Act II Scene 5)
2. “So, naturalists observe, a flea/Hath smaller fleas that on him prey”.(Jonathan Swift...)
3. “And these have smaller fleas to bite ’em/And so proceed ad infinitum” (Jonathan Swift...)
4. "...to hold infinity in the palm of a hand, and eternity in an hour." (William Blake ...)


In [4]:
model = Top2Vec.load("top2vec_model")

In [5]:
topic_sizes = model.get_topic_sizes()
topic_sizes

(array([338, 259, 172,  77,  74,  42]), array([0, 1, 2, 3, 4, 5]))

So, naturalists observe, a flea/Hath smaller fleas that on him prey.

In [6]:
# Получение информации о количестве документов в каждой теме и их индексах
topic_sizes = model.get_topic_sizes()

for topic_num, num_docs_in_topic in enumerate(topic_sizes[0]):
    topic_index = topic_sizes[1][topic_num]  # Индекс текущей темы

    documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=topic_index, num_docs=num_docs_in_topic)

    for text, score, doc_id in zip(documents, document_scores, document_ids):
        matcher = Matcher(nlp.vocab)
        pattern = [{'LEMMA': "naturalist"}, {'POS': "VERB"}] # Шаблон для поиска искомой фразы
        matcher.add("naturalist_PATTERN", [pattern])

        doc = nlp(str(text))
        matches = matcher(doc)

        if len(matches) > 0:
            result = [doc[start - 10:end + 10].text for match_id, start, end in matches][0].split()
            highlighted_phrase = ' '.join(result[10:12]) # Срез искомой фразы
            highlighted_text = ' '.join(result[:10]) + f' {Fore.RED}{Back.WHITE}{highlighted_phrase}{Style.RESET} ' + ' '.join(result[-10:])
            print("doc_id", doc_id, "Matches:", highlighted_text)

doc_id 780 Matches: see hybrids missing mechanism start 19th century jeanbaptiste lamarck french [38;5;1m[48;5;15mnaturalist thought[0m found recognised species mutable also proposed traits could inherited error
doc_id 372 Matches: creature relatives seemed untroubled virusesas jonathan swift put muchmisquoted poem [38;5;1m[48;5;15mnaturalists observe[0m fleahath smaller fleas prey parasites words everywhere also usually abundant
doc_id 524 Matches:  [38;5;1m[48;5;15m[0m 
doc_id 162 Matches: never climbed richard preston author several bestsellers tells story botanists [38;5;1m[48;5;15mnaturalists committed[0m preserving remaining fragment 2macre 810000 hectare redwood forest book revolves
doc_id 168 Matches: dr aplin following work published 1934 henry smith williams american [38;5;1m[48;5;15mnaturalist noticed[0m put various coloured balls yarn garden almost always one one


In [7]:
idx = 372
res_df = df.iloc[idx, :]
res_df['text']


'WHAT is the commonest living thing on Earth? Until now, those in the know would probably have answered Pelagibacter ubique, the most successful member of a group of bacteria, called SAR11, that jointly constitute about a third of the single-celled organisms in the ocean. But this is not P. ubique’s only claim to fame, for unlike almost every other known cellular creature, it and its relatives have seemed to be untroubled by viruses.As Jonathan Swift put it in a much-misquoted poem, “So, naturalists observe, a flea/Hath smaller fleas that on him prey”. Parasites, in other words, are everywhere. They are also, usually, more abundant than their hosts. An astute observer might therefore have suspected that the actual most-common species on Earth would be a “flea” that parasitised P. ubique, rather than the bacterium itself. The absence of such fleas (in the form of viruses called bacteriophages, that attack bacteria) has puzzled virologists since 1990, when the SAR11 group was identified.

In [8]:
res_df

text                  WHAT is the commonest living thing on Earth? U...
title                                                       Flea market
date                                                      Feb 16th 2013
link                  https://www.economist.com/science-and-technolo...
cleaned_text          what is the commonest living thing on earth un...
fully_cleaned_text    commonest living thing earth know would probab...
Name: 372, dtype: object