# Install dependencies

In [None]:
# a = []
# while(1):
#     a.append('1')

In [None]:
!pip install top2vec[sentence_encoders]
!pip install top2vec[sentence_transformers]



# Import

In [None]:
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from top2vec import Top2Vec
# Fetch the NLTK corpora used by preprocess_tokenize:
# 'stopwords' backs stopwords.words('english'), 'wordnet' backs WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Colab only: mount Google Drive at /content/gdrive so the dataset folder under MyDrive can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Initialization

In [None]:
DATASET = 'Dataset-yahoo-answer'  # dataset folder name inside Google Drive; also used in saved file names
PATH = '/content/gdrive/MyDrive/'+DATASET  # folder under which the model and data artifacts are saved
NUM_TOPICS = 20 # target number of topics for the hierarchical topic reduction
COLUMN = 'answer' # column to use for topic modeling
EMBEDDING = 'bert'  # read by create_top2vec: 'universal', 'bert', anything else -> multilingual universal-sentence-encoder
VERSION = '1.0'  # suffix appended to the saved file names

# Read data

In [None]:
# Reuse PATH from the configuration cell instead of rebuilding the Drive path by hand.
# header=None makes pandas read the first row as data; the four column names are supplied explicitly.
# NOTE(review): the frame is named df_train but is loaded from test.csv - confirm this is intended.
df_train = pd.read_csv(PATH+'/test.csv', header=None, names=['topic', 'title', 'question', 'answer'])
# df_train = pd.read_csv('/content/gdrive/MyDrive/'+DATASET+'eq_bank_training_training_final.csv')

In [None]:
# df_train = df_train.groupby("parent_id").sample(frac=0.05, random_state=2)

In [None]:
# Preview the loaded frame (pandas truncates the display to the first and last rows).
df_train

Unnamed: 0,topic,title,question,answer
0,9,What makes friendship click?,How does the spark keep going?,good communication is what does it. Can you m...
1,2,Why does Zebras have stripes?,What is the purpose or those stripes? Who do t...,this provides camouflage - predator vision is ...
2,4,What did the itsy bitsy sipder climb up?,,waterspout
3,4,What is the difference between a Bachelors and...,,One difference between a Bachelors and a Maste...
4,3,Why do women get PMS?,,Premenstrual syndrome (PMS) is a group of symp...
...,...,...,...,...
59995,9,"if you could be any fantasy figure, who would ...",,"The invisible man, I'd be straight into the gi..."
59996,8,Tell me something about life most people don't...,"Do you know anything about life, or words of w...",That there is a hell and everyone thinks their...
59997,3,Why are men always thinking of sex?,,It's wired in our brain
59998,6,est ce que DOMENECH est un entraineur: 1: de f...,,de foot mais pas pour être sélectionneur d'une...


# Preprocess data

In [None]:
# Patterns are compiled once here instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
# IGNORECASE so capitalised links such as "Www.example.com" or "HTTP://..." are removed as well.
_HTTP_LINK_RE = re.compile(r'http\S+', re.IGNORECASE)
_WWW_LINK_RE = re.compile(r'www\S+', re.IGNORECASE)
# Matches a literal backslash followed by "n" (an escaped newline in the raw text), not a real line break.
_ESCAPED_NEWLINE_RE = re.compile(r'\\n')

# Filled on first use so the NLTK stop-word file is read once rather than once per document.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    '''
    Return the shared (stop_words, lemmatizer, tokenizer) triple, building it on the first call.
    '''
    if 'resources' not in _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['resources'] = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
            RegexpTokenizer(r'[a-zA-Z]+'),
        )
    return _PREPROCESS_CACHE['resources']


def preprocess_tokenize(s):
    '''
    Clean a raw text and return it as a single space-separated string of lemmas.

    The preprocessing involves
        1. replacing HTML tags with a space
        2. removing hyperlinks (anything starting with "http" or "www" up to the next whitespace)
        3. replacing literal "\\n" sequences (backslash + n) with a space
        4. making all letters lower-case
        5. tokenization into runs of ASCII letters, which drops punctuation,
           special characters and digits
        6. removing English stop words
        7. lemmatization of each remaining word as verb, then noun, then adjective

    Tags and escaped newlines are replaced with a space (not an empty string) so the
    words on either side of them are not fused into one token.

    NOTE: the tokenizer only keeps [a-zA-Z], so accented characters split a word
    into pieces (e.g. non-English text loses its accented letters).

    Inputs:
        s: s is a string (callers are expected to convert other values to str first)
    returns:
        processed_string: the surviving lemmas joined by single spaces; '' when
            nothing survives the filtering
    '''
    stop_words, lemmatizer, tokenizer = _preprocess_resources()

    s = _HTML_TAG_RE.sub(' ', s)          # html tags -> space, keeps neighbouring words apart
    s = _HTTP_LINK_RE.sub('', s)          # removing hyperlinks
    s = _WWW_LINK_RE.sub('', s)           # removing hyperlinks
    s = _ESCAPED_NEWLINE_RE.sub(' ', s)   # literal \n -> space, keeps neighbouring words apart
    s = s.lower()

    lemmas = []
    for word in tokenizer.tokenize(s):
        if word in stop_words:
            continue
        # Same order as before: verb, then noun, then adjective.
        for pos in ('v', 'n', 'a'):
            word = lemmatizer.lemmatize(word, pos)
        lemmas.append(word)

    return ' '.join(lemmas)
            

In [None]:
# pre - processing
# Missing values are replaced with '' before the conversion to str: str(NaN) would
# otherwise inject the literal word "nan" into the corpus as if it were real text.
df_train['processed_'+COLUMN] = df_train[COLUMN].fillna('').astype(str).apply(preprocess_tokenize)
df_train.head(5)

Unnamed: 0,topic,title,question,answer,processed_answer
0,9,What makes friendship click?,How does the spark keep going?,good communication is what does it. Can you m...,good communication move beyond small talk say ...
1,2,Why does Zebras have stripes?,What is the purpose or those stripes? Who do t...,this provides camouflage - predator vision is ...,provide camouflage predator vision usually dif...
2,4,What did the itsy bitsy sipder climb up?,,waterspout,waterspout
3,4,What is the difference between a Bachelors and...,,One difference between a Bachelors and a Maste...,one difference bachelor master degree requirem...
4,3,Why do women get PMS?,,Premenstrual syndrome (PMS) is a group of symp...,premenstrual syndrome pm group symptom relate ...


# Train Model

In [None]:
def create_top2vec(texts, embedding=None):
    '''
    Train a Top2Vec model on the given documents.

    Inputs:
        texts: a list of documents where each document is a string

        embedding: optional key selecting the embedding model. When omitted (None)
            the notebook-wide EMBEDDING setting is used, which is the original behaviour.
                'universal' -> 'universal-sentence-encoder'
                'bert'      -> 'distiluse-base-multilingual-cased'
                any other   -> 'universal-sentence-encoder-multilingual'

    Returns:
        model: a trained Top2Vec model
    '''
    if embedding is None:
        embedding = EMBEDDING

    embedding_models = {
        'universal': 'universal-sentence-encoder',
        'bert': 'distiluse-base-multilingual-cased',
    }
    # Unknown keys keep the original fallback to the multilingual universal sentence encoder.
    embedding_model = embedding_models.get(embedding, 'universal-sentence-encoder-multilingual')

    return Top2Vec(embedding_model=embedding_model, documents=texts)

In [None]:
def get_doc_topics_reduced(data_df, model):
    '''
    Look up the reduced topic of every document in data_df.

    This function does not perform the reduction itself; it queries the model with
    reduced=True for the document ids 0 .. len(data_df) - 1.
    NOTE(review): presumably model.hierarchical_topic_reduction(...) must have been
    called beforehand, and the documents must have been trained in the same order
    as the rows of data_df - confirm against the Top2Vec documentation.

    Inputs:
        data_df: pandas dataframe containing all texts (only its length is used)

        model: trained Top2Vec model

    Returns (passed through unchanged from model.get_documents_topics):
        topic_nums: the reduced topic assigned to each document

        topic_scores: the score of each document for its assigned topic

        topic_words: the top words associated with the assigned topics

        word_scores: the scores of those top words

        NOTE(review): exact shapes are defined by Top2Vec and are not visible here.
    '''
    n_docs = len(data_df)
    all_doc_ids = np.arange(n_docs)
    nums, scores, words, w_scores = model.get_documents_topics(all_doc_ids, reduced=True)

    return nums, scores, words, w_scores

In [None]:
# Train on the pre-processed column (a plain Python list of strings).
processed_texts = df_train['processed_'+COLUMN].tolist()
# Alternative: train on the raw column without pre-processing.
# processed_texts = df_train[COLUMN].tolist()

In [None]:
# Train Top2Vec on the prepared documents; the recorded log shows this took roughly 75 minutes.
model = create_top2vec(processed_texts)

2021-04-09 01:09:11,084 - top2vec - INFO - Pre-processing documents for training
2021-04-09 01:09:19,435 - top2vec - INFO - Downloading distiluse-base-multilingual-cased model


HBox(children=(FloatProgress(value=0.0, max=503702349.0), HTML(value='')))




2021-04-09 01:09:52,305 - top2vec - INFO - Creating joint document/word embedding
2021-04-09 02:22:08,502 - top2vec - INFO - Creating lower dimension embedding of documents
2021-04-09 02:24:04,008 - top2vec - INFO - Finding dense areas of documents
2021-04-09 02:24:11,155 - top2vec - INFO - Finding topics


In [None]:
# Number of topics the model found before any reduction.
model.get_num_topics()

326

In [None]:
# Look up the original (unreduced) topic of every document and count the distinct topics in use.
doc_idx = np.arange(len(df_train))
topic_nums, topic_scores, topic_words, word_scores = model.get_documents_topics(doc_idx)
num_prominent_topics = np.unique(topic_nums).size
print(num_prominent_topics)

326


In [None]:
# Keep the original (unreduced) topic number of every document as a column.
df_train['pred_topics'] = topic_nums

# Topic reduction

In [None]:
# Merge the discovered topics down to NUM_TOPICS.
# NOTE(review): 'hierrarchy' is a misspelling of 'hierarchy'; the name is left unchanged here
# in case other cells refer to it. The returned value is not used in the cells shown.
hierrarchy = model.hierarchical_topic_reduction(num_topics=NUM_TOPICS)

In [None]:
# Query the reduced topics; this overwrites topic_nums / topic_scores / topic_words / word_scores
# from the earlier unreduced lookup.
topic_nums, topic_scores, topic_words, word_scores = get_doc_topics_reduced(df_train, model)

In [None]:
# Attach the reduced topic number and its score to every document.
df_train['reduced_topic'] = topic_nums
df_train['reduced_topic_score'] = topic_scores

In [None]:
# Inspect the frame with the original and reduced topic assignments added.
df_train

Unnamed: 0,topic,title,question,answer,processed_answer,pred_topics,reduced_topic,reduced_topic_score
0,9,What makes friendship click?,How does the spark keep going?,good communication is what does it. Can you m...,good communication move beyond small talk say ...,7,7,0.669693
1,2,Why does Zebras have stripes?,What is the purpose or those stripes? Who do t...,this provides camouflage - predator vision is ...,provide camouflage predator vision usually dif...,209,6,0.288386
2,4,What did the itsy bitsy sipder climb up?,,waterspout,waterspout,53,1,0.565577
3,4,What is the difference between a Bachelors and...,,One difference between a Bachelors and a Maste...,one difference bachelor master degree requirem...,1,12,0.582758
4,3,Why do women get PMS?,,Premenstrual syndrome (PMS) is a group of symp...,premenstrual syndrome pm group symptom relate ...,65,0,0.413381
...,...,...,...,...,...,...,...,...
59995,9,"if you could be any fantasy figure, who would ...",,"The invisible man, I'd be straight into the gi...",invisible man straight girl change room,26,4,0.314743
59996,8,Tell me something about life most people don't...,"Do you know anything about life, or words of w...",That there is a hell and everyone thinks their...,hell everyone think go world go dont turn god ...,14,6,0.498708
59997,3,Why are men always thinking of sex?,,It's wired in our brain,wire brain,278,2,0.410009
59998,6,est ce que DOMENECH est un entraineur: 1: de f...,,de foot mais pas pour être sélectionneur d'une...,de foot mais pa pour tre lectionneur une quipe...,25,10,0.503376


# Save 

In [None]:
# recommended way to save model
# Create the output folder first so a missing directory cannot abort the save after a long training run.
os.makedirs(PATH+'/top2vec/saved/', exist_ok=True)
# NOTE(review): '_to2vec_' looks like a typo for '_top2vec_' (the pickle file uses the latter);
# the file name is left unchanged because code that loads the model may rely on it.
model.save(PATH+'/top2vec/saved/'+DATASET+'_to2vec_'+EMBEDDING+'_'+VERSION+'.mdl')

In [None]:
# alternative: serialize the whole model object with pickle
model_pickle_path = PATH+'/top2vec/saved/'+DATASET+'_top2vec_'+EMBEDDING+'_'+VERSION+'.model'
with open(model_pickle_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# persist the annotated dataframe (including the topic columns) as a pickle
data_pickle_path = PATH+'/top2vec/saved/'+DATASET+'_data'+'_'+VERSION+'.pkl'
with open(data_pickle_path, 'wb') as data_file:
    pickle.dump(df_train, data_file)

In [None]:
# Also export the annotated frame as CSV into the current working directory.
# NOTE(review): the name 'eq_bank.csv' does not match DATASET and the file is not written
# under PATH like the other artifacts - confirm this is intended.
df_train.to_csv('eq_bank.csv')