# Install dependencies

In [1]:
!pip install top2vec[sentence_encoders]
!pip install top2vec[sentence_transformers]



# Mount Drive

In [3]:
# Colab-only step: expose Google Drive under /content/gdrive so the saved
# model and data files can be opened with ordinary file paths.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')


Mounted at /content/gdrive


# Import

In [4]:
import pandas as pd
import numpy as np
import pickle
from top2vec import Top2Vec
import re
import tensorflow_text


# Initialization

In [5]:
# Notebook-wide settings.
DATASET = 'Dataset-yahoo-answer'                # dataset folder name on Drive
PATH = f'/content/gdrive/MyDrive/{DATASET}/'    # root folder for this dataset's files
TOPNWORDS = 20                                  # number of topic words to report
EMBEDDING = 'bert'
COLUMN_QS = 'paraphrase'
COLUMN_ANS = 'parent_id'

# Load model

In [6]:
# Restore the saved artifacts from Drive.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.

# The fitted Top2Vec model object.
with open(PATH+'top2vec/saved/top2vec.model', 'rb') as model_fh:
    model = pickle.load(model_fh)

# The DataFrame that was saved next to the model.
with open(PATH+'top2vec/saved/data.pkl', 'rb') as data_fh:
    df = pickle.load(data_fh)

# Lookup indexed by topic number that yields a topic name ('_' marks topics
# that the search helper skips).
topic_name = pd.read_pickle(PATH+'top2vec/saved/dictionary_topic_name.dict')

In [7]:
# Replace the existing index with a fresh 0..n-1 one (the old index is
# dropped, not kept as a column), then display the frame.
df = df.reset_index(drop=True)
df

Unnamed: 0,topic,title,question,answer,processed_answer,pred_topics,reduced_topic,reduced_topic_score
0,9,What makes friendship click?,How does the spark keep going?,good communication is what does it. Can you m...,good communication move beyond small talk say ...,16,0,0.542340
1,2,Why does Zebras have stripes?,What is the purpose or those stripes? Who do t...,this provides camouflage - predator vision is ...,provide camouflage predator vision usually dif...,41,2,0.258722
2,4,What did the itsy bitsy sipder climb up?,,waterspout,waterspout,56,8,0.403495
3,4,What is the difference between a Bachelors and...,,One difference between a Bachelors and a Maste...,one difference bachelor master degree requirem...,24,5,0.503336
4,3,Why do women get PMS?,,Premenstrual syndrome (PMS) is a group of symp...,premenstrual syndrome pm group symptom relate ...,46,3,0.441360
...,...,...,...,...,...,...,...,...
59995,9,"if you could be any fantasy figure, who would ...",,"The invisible man, I'd be straight into the gi...",invisible man straight girl change room,66,11,0.230436
59996,8,Tell me something about life most people don't...,"Do you know anything about life, or words of w...",That there is a hell and everyone thinks their...,hell everyone think go world go dont turn god ...,0,15,0.462794
59997,3,Why are men always thinking of sex?,,It's wired in our brain,wire brain,19,8,0.495684
59998,6,est ce que DOMENECH est un entraineur: 1: de f...,,de foot mais pas pour être sélectionneur d'une...,de foot mais pa pour tre lectionneur une quipe...,218,6,0.399540


# Helper Functions

In [8]:
def infer_topics(text, topNwords, num_docs, reduced):
    """Infer the topic of `text` by temporarily adding it to the global model.

    The text is added to `model` as a new document, its topic is looked up,
    similar documents are searched for, and the temporary document is then
    removed again -- also when one of the lookups raises, so a failed call
    does not leave a stray document in the shared model.

    Parameters
    ----------
    text : str
        Document to classify.
    topNwords : int
        Number of leading topic words to return.
    num_docs : int
        Number of similar documents to request.
    reduced : bool
        If truthy, use the reduced topics (`topic_words_reduced`);
        otherwise use the original topics (`topic_words`).

    Returns
    -------
    tuple
        `(word_list, topic_number, similar_docs)` where `similar_docs` is
        whatever `model.search_documents_by_documents` returned.
    """
    model.add_documents([text])
    # NOTE(review): assumes the id of the document just added equals its
    # position in model.documents -- confirm for models with custom doc ids.
    new_doc_id = len(model.documents) - 1

    try:
        result = model.get_documents_topics([new_doc_id], reduced=bool(reduced))
        # First element of the result holds the topic numbers, one per doc id.
        topic_number = result[0][0]
        topic_words = model.topic_words_reduced if reduced else model.topic_words
        word_list = topic_words[topic_number][:topNwords]
        similar_docs = model.search_documents_by_documents([new_doc_id], num_docs)
    finally:
        # Always undo the temporary insertion.
        model.delete_documents([new_doc_id])

    return word_list, topic_number, similar_docs

In [9]:
def add_docs(text):
    """Add a single document `text` to the global Top2Vec `model`.

    The text is wrapped in a one-element list before being handed to
    `model.add_documents`. Returns None.
    """
    model.add_documents(documents=[text])

In [10]:
def search_for_topics(keywords, num_topics, reduced):
    """Search the global model for topics matching `keywords`.

    Prints the name of the first returned topic whose entry in `topic_name`
    is not the placeholder '_', then returns the raw search result.

    Parameters
    ----------
    keywords : list of str
        Keywords to search with. They are used for both reduced and
        non-reduced searches (the non-reduced path previously ignored them
        in favour of a hard-coded keyword list).
    num_topics : int
        Number of topics to request.
    reduced : bool
        If truthy, search the reduced topics; otherwise the original ones.

    Returns
    -------
    tuple
        `(topic_words, word_scores, topic_scores, topic_nums)` as returned by
        `model.search_topics`.
    """
    topic_words, word_scores, topic_scores, topic_nums = model.search_topics(
        keywords=keywords, num_topics=num_topics, reduced=bool(reduced)
    )

    # NOTE(review): topic_name is indexed with whichever numbering the search
    # returned -- confirm it also covers non-reduced topic numbers.
    for n in topic_nums:
        name = topic_name[n]
        if name != '_':
            print(name)
            break

    return topic_words, word_scores, topic_scores, topic_nums

In [11]:
def search_for_documents_by_keywords(keywords, num_docs):
    """Find documents related to `keywords` in the global model and print them.

    For each hit this prints the document id and score, then the document
    text between two dashed rules, followed by a blank line. There is no
    `reduced` option for this search.

    Returns the `(documents, document_scores, document_ids)` triple obtained
    from `model.search_documents_by_keywords`.
    """
    rule = "-----------"
    hits = model.search_documents_by_keywords(keywords=keywords, num_docs=num_docs)
    documents, document_scores, document_ids = hits

    for body, similarity, ident in zip(documents, document_scores, document_ids):
        header = f"Document: {ident}, Score: {similarity}"
        # One print call emitting the same five lines, ending in a blank one.
        print(header, rule, body, rule, "", sep="\n")

    return documents, document_scores, document_ids

In [12]:
def search_for_keywords(keywords_pos, keywords_neg, num_words):
    """Look up words similar to the given keywords and print them with scores.

    `keywords_pos` and `keywords_neg` are forwarded to `model.similar_words`
    as `keywords` and `keywords_neg`. Each result is printed as
    "<word> <score>" on its own line. There is no `reduced` option for this
    search.

    Returns the `(words, word_scores)` pair from `model.similar_words`.
    """
    found = model.similar_words(
        keywords=keywords_pos,
        keywords_neg=keywords_neg,
        num_words=num_words,
    )
    words, word_scores = found

    for pair in zip(words, word_scores):
        print("{} {}".format(*pair))

    return words, word_scores

# Inference

In [28]:
# Sample news sentence used as the query document for topic inference.
# The raw triple-quoted literal keeps its leading and trailing newline.
text = r'''
Ontario reported 4,156 new cases of COVID-19 and the deaths of 28 more people with the illness on Wednesday, while public health units administered a new record-high number of vaccine doses.
'''

In [29]:
# Classify the sample text against the reduced topics and fetch 10 neighbours.
word_list, topic_number, similar_document = infer_topics(
    text,
    topNwords=TOPNWORDS,
    num_docs=10,
    reduced=True,
)

In [30]:
# Translate the inferred topic number into its name via the loaded lookup.
topic_name[topic_number]

'health'

## Search for Topics by Keywords

In [33]:
# Ask for the 10 reduced topics closest to a few finance-related keywords.
topic_words, word_scores, topic_scores, topic_nums = search_for_topics(
    keywords=['finance', 'stock', 'money'],
    num_topics=10,
    reduced=True,
)

business


## Search for related Documents by Keywords

In [34]:
# Retrieve and print the 5 documents closest to the combined keywords.
documents, document_scores, document_ids = search_for_documents_by_keywords(
    keywords=['business', 'diabetes'],
    num_docs=5,
)

Document: 23392, Score: 0.5093256331546931
-----------
life
-----------

Document: 47558, Score: 0.5025011939387787
-----------
find diabetic
-----------

Document: 8254, Score: 0.4907497425179201
-----------
professional
-----------

Document: 16613, Score: 0.48704823216284576
-----------
health problem
-----------

Document: 1499, Score: 0.47717358684168376
-----------
money
-----------



In [35]:
# Show the original (unprocessed) answer text stored at row position 59177.
# NOTE(review): 59177 is not one of the document ids printed by the keyword
# search above -- confirm which row was meant to be inspected.
df['answer'].iloc[59177]

'If it is a psychiatric hospital it is!'

## Search for Similar Keywords by Keywords

In [36]:
# List the 10 vocabulary words closest to 'health' (no negative keywords).
words, word_scores = search_for_keywords(
    keywords_pos=['health'],
    keywords_neg=None,
    num_words=10,
)

healthy 0.6893834201415286
fitness 0.582503345940204
medical 0.5744456364324823
illness 0.5594899883472886
disease 0.5336805571374086
nutrition 0.5271034051463707
care 0.5244941710352499
lifestyle 0.5131066954114256
business 0.511849214834601
food 0.5035717969279957


## Search for Similar Documents by Documents

In [37]:
# Short query document used to look up similar documents in the model.
# The triple-quoted literal keeps its leading and trailing newline.
document_text = '''
what is computer
'''

In [38]:
# Infer the reduced topic of the query document and fetch its 10 neighbours.
word_list, topic_number, similar_document = infer_topics(
    document_text,
    topNwords=10,
    num_docs=10,
    reduced=True,
)

In [39]:
# Display the raw result that infer_topics obtained from
# model.search_documents_by_documents for the query document.
similar_document

(array(['whats', 'whats', 'pcworld com computer', 'stand alone computer',
        'whats question', 'vedeos whats', 'dont add computer calculater',
        'describe homework',
        'depend define word computer succession steadily powerful flexible compute device construct gradually add key feature modern computer use digital electronics invent claude shannon flexible programmability define one point along road first computer exceedingly difficult notable achievement include atanasoff berry computer special purpose machine use valve drive vacuum tube computation binary number regenerative memory secret british colossus computer limit programmability demonstrate device use thousand valve could make reliable reprogrammed electronically american eniac one first general purpose machine still use decimal system incorporate inflexible architecture mean reprogramming essentially require rewire konrad zuse z machine electromechanical z first work machine feature automatic binary arithmetic 