# Install dependency

In [None]:
!pip install gensim==3.7.1

Collecting gensim==3.7.1
[?25l  Downloading https://files.pythonhosted.org/packages/1a/22/18d108180fb6d9408a7c7d3c47e1a7c7a4e0d348420be27faa9a22f57117/gensim-3.7.1-cp37-cp37m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 1.4MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.7.1


# Import

In [None]:
# Mount Google Drive at /content/gdrive; the saved LDA artifacts are read from
# a folder below this mount point later in the notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import gensim
import seaborn as sns
from matplotlib import pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim
from gensim import corpora
from gensim.models.wrappers.ldamallet import LdaMallet, malletmodel2ldamodel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
import re
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
from sklearn.cluster import DBSCAN
import numpy as np
import pickle
import warnings

# Fetch the NLTK corpora that preprocessing relies on: the stop-word list and
# the WordNet data used by the lemmatizer.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# Initialization

In [None]:
# Root folder of the dataset/model files on the mounted Google Drive.
# NOTE: the value ends with '/', and the model-loading cell appends suffixes that
# start with '/', so the final paths contain a double slash.
PATH = '/content/gdrive/MyDrive/Dataset-yahoo-answer/'

# Load model

In [None]:
# Restore the saved LDA artifacts from Drive.
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects, so these files
# must come from a trusted source.
mallet_lda_model = pd.read_pickle(f'{PATH}/lda/saved/mallet_lda.model')
gensim_model = pd.read_pickle(f'{PATH}/lda/saved/gensim.model')
# The file name really is spelled 'courpus' on disk; do not "fix" it here.
corpus = pd.read_pickle(f'{PATH}/lda/saved/courpus.corp')
dictionary = pd.read_pickle(f'{PATH}/lda/saved/dictionary.dict')
# Lookup from topic id to topic name, used when plotting.
topic_name = pd.read_pickle(f'{PATH}/lda/saved/dictionary_topic_name.dict')

# Preprocessing

In [None]:
def preprocess_tokenize(s):
    '''
    Clean a raw text string and split it into lemmatized tokens.

    Steps, in order:
        1. strip html tags (anything of the form <...>)
        2. strip hyperlinks (text from "http" or "www" up to the next whitespace)
        3. strip literal backslash-n character pairs
        4. lower-case the text
        5. tokenize into runs of ascii letters, which also drops punctuation,
           special characters and digits
        6. drop english stopwords
        7. lemmatize every remaining token as verb, then noun, then adjective
    Removed text is replaced by nothing, so its neighbours are joined together.

    Inputs:
        s: the raw text (string)
    returns:
        list of processed tokens (strings)
    '''
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()
    letters_only = RegexpTokenizer(r'[a-zA-Z]+')

    # Cleaning patterns, applied one after another in this exact order:
    # html tags, http links, www links, literal backslash-n
    for pattern in ('<.*?>', r"http\S+", r"www\S+", r"\\n"):
        s = re.sub(pattern, '', s)

    tokens = letters_only.tokenize(s.lower())

    lemmas = []
    for token in tokens:
        # Stopwords are filtered on the surface form, before lemmatization
        if token in english_stopwords:
            continue
        # Each pass feeds the next one: verb -> noun -> adjective
        for part_of_speech in ('v', 'n', 'a'):
            token = wordnet.lemmatize(token, part_of_speech)
        lemmas.append(token)

    return lemmas

# Topic inference

In [None]:
def infer_topics(document, dictionary, model):
  '''
  Infer the document-topic distribution of a new, raw text document.

  The text is preprocessed with preprocess_tokenize, converted to a
  bag-of-words with the given dictionary and passed to the given model.

  Inputs:
      document: New document text (string)
      dictionary: Lda dictionary; its doc2bow method builds the bag-of-words
      model: pre-trained gensim model; it is indexed with a one-document corpus
  returns:
      the result of indexing the model with a corpus holding this single
      document; element 0 is the topic vector of the document
  '''

  processed_text = preprocess_tokenize(document)
  # Wrap the bag-of-words in a list so the model receives a corpus of one document
  bow = [dictionary.doc2bow(processed_text)]
  # Use the model supplied by the caller. The previous code indexed the global
  # gensim_model here, which silently ignored the model argument.
  vector = model[bow]

  return vector

In [None]:
def show_topic_distribution(vector, topic_name, threshold):
  '''
  Draw a donut-style pie chart of the dominant topics of one document.

  Inputs:
      vector: document-topic distribution; element 0 is read as a sequence of
              entries where entry[0] is the topic id and entry[1] its probability
      topic_name: mapping from topic id to the label shown in the chart
      threshold: minimum probability a topic needs to be included

  returns:
      None
  displays:
      pie chart
  '''

  # Most probable topics first, so the scan can stop at the first one below threshold
  ranked = sorted(vector[0], key=lambda entry: entry[1], reverse=True)

  kept = []
  for entry in ranked:
    if entry[1] < threshold:
      break
    kept.append(entry)

  labels = [topic_name[entry[0]] for entry in kept]
  values = [entry[1] for entry in kept]

  # `hole` turns the pie into a donut
  donut = go.Pie(labels=labels, values=values, hole=.3)
  fig = go.Figure(data=[donut])
  fig.show()


# Test

In [None]:
# Sample news paragraph (ice hockey / COVID-19) used to try out topic inference.
text = r'''
The Vancouver Canucks say 25 members of the organization, including 21 players, have tested positive for the coronavirus. In a statement attributed to team doctor Jim Bovard and infectious disease doctor Josh Douglas, the Canucks said the COVID-19 cases stem from a variant of the virus that was traced by Vancouver Coastal Health back to one individual contracting it in a public setting. Four staff members also tested positive for the virus, another player is considered a close contact, and the entire team remains in quarantine. The team was shut down last week, and it's uncertain when it will play its next game. NHL deputy commissioner Bill Daly said in an email to The Canadian Press earlier this week that the league believes the Canucks will return and conclude their 56-game schedule. The Canucks had a league-high 18 players on the COVID protocol list Tuesday, the second most all season behind only the New Jersey Devils' 19 on Feb. 8 and 9.
 '''

In [None]:
# Infer the topic distribution of the sample text, passing the loaded dictionary
# and gensim model.
vector = infer_topics(text, dictionary, gensim_model)

In [None]:
# Plot only the topics whose probability is at least 0.1.
show_topic_distribution(vector, topic_name, 0.1)