In [13]:
import pandas as pd
import numpy as np
from etl import text_from_dir
from pretrained_summarization import get_summary
import nltk
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
#from sklearn.decomposition import LatentDirichletAllocation
import gensim
from gensim.models.coherencemodel import CoherenceModel
#from nltk.stem import *
from gensim import corpora
from timeit import default_timer as timer

In [14]:
# Fetch the NLTK resources used by this notebook; the captured output below
# shows these calls report "already up-to-date" when the data is cached.
# 'punkt'     -> tokenizer data needed by word_tokenize (used in display_results).
# 'stopwords' -> not referenced by the visible cells (gensim's STOPWORDS is used
#                for filtering instead); presumably kept for other code - TODO confirm.
# 'wordnet'   -> corpus backing WordNetLemmatizer (used in preprocess_topic_text).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OkeV\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OkeV\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OkeV\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
def preprocess_topic_text(final_data):
    """Tokenize, stop-word-filter and lemmatize every document for topic modelling.

    Parameters
    ----------
    final_data : dict
        Maps a document name to that document's text (a single string).

    Returns
    -------
    dict
        Maps each document name to ``[[token, ...]]``: a list holding exactly
        one token list. The extra nesting is deliberate, because
        ``gensim.corpora.Dictionary`` and ``doc2bow`` expect an iterable of
        tokenized documents and each entry is modelled as a one-document corpus.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    out_dict = dict()
    for key, value in final_data.items():
        # simple_preprocess lowercases and tokenizes; stop words are removed
        # before lemmatization, matching the original processing order.
        tokens = [
            lemmatizer.lemmatize(token)
            for token in gensim.utils.simple_preprocess(value)
            if token not in stop_words
        ]
        out_dict[key] = [tokens]
    return out_dict

In [16]:
#Try num_topics = [5:9]
def topic_model(out_dict, num_topics=1, passes=10, random_state=100):
    """Fit one LDA model per document and print its timing and coherence.

    Parameters
    ----------
    out_dict : dict
        Maps a document name to a list of token lists (the shape produced by
        ``preprocess_topic_text``: one token list per document).
    num_topics : int, optional
        Number of topics to extract per document. Defaults to 1, the value
        that was previously hard-coded.
    passes : int, optional
        Number of training passes over the corpus. Defaults to 10.
    random_state : int, optional
        Seed passed to the LDA model for reproducible topics. Defaults to 100.

    Returns
    -------
    dict
        Maps each document name to ``lda_model.print_topics()``, a list of
        ``(topic_id, 'weight*"term" + ...')`` tuples. Documents with no tokens
        map to an empty list.
    """
    final_dict = dict()
    for keys, value in out_dict.items():
        dictionary = gensim.corpora.Dictionary(value)
        if len(dictionary) == 0:
            # LdaModel cannot be trained on an empty vocabulary; skip this
            # document instead of aborting the whole batch.
            print(f'Skipping {keys}: no tokens left after preprocessing')
            print()
            final_dict[keys] = []
            continue

        bow = [dictionary.doc2bow(doc) for doc in value]
        start = timer()
        lda_model = gensim.models.ldamodel.LdaModel(corpus=bow,
                                                    id2word=dictionary,
                                                    num_topics=num_topics,
                                                    random_state=random_state,
                                                    update_every=1,
                                                    chunksize=150,
                                                    passes=passes,
                                                    alpha='auto',
                                                    per_word_topics=True)
        end = timer()  # only model training is timed, not the coherence step
        final_dict[keys] = lda_model.print_topics()

        coherence_model_lda = CoherenceModel(model=lda_model, texts=value,
                                             dictionary=dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print(f'Processing topics for {keys}')
        print(f'Topics extracted in {end-start:.2f} seconds')
        print(f'Coherence Score for {keys}: {coherence_lda:.2f}')
        print()

    return final_dict

In [17]:
def display_results(topics):
    """Extract the keyword terms from formatted LDA topic strings.

    Parameters
    ----------
    topics : dict
        Maps a document name to a list of ``(topic_id, topic_string)`` tuples,
        where ``topic_string`` looks like ``'0.030*"work" + 0.020*"john"'``
        (the format returned by gensim's ``print_topics``).

    Returns
    -------
    dict
        Maps each document name to a list of keyword strings, in order of
        first appearance. Keywords from every topic of a document are kept,
        with repeats across topics removed. A document with no topics maps to
        an empty list.
    """
    out_dict = dict()
    for doc_name, doc_topics in topics.items():
        keywords = []
        for topic in doc_topics:
            topic_str = topic[1]
            # Terms are the double-quoted parts of the string; taking them
            # whole keeps non-ASCII and underscore-containing terms intact.
            terms = re.findall(r'"([^"]+)"', topic_str)
            if not terms:
                # Unquoted input: fall back to runs of ASCII letters, as the
                # previous implementation effectively did.
                terms = re.findall(r'[A-Za-z]+', topic_str)
            # Accumulate rather than assign, so earlier topics are not
            # overwritten by later ones.
            keywords.extend(terms)
        # dict.fromkeys drops duplicates while preserving order.
        out_dict[doc_name] = list(dict.fromkeys(keywords))

    return out_dict

In [18]:
"""def topics_to_df(out_dict):
    temp = []
    for indx, values in enumerate(out_dict.items()):
        locals()["final_df_" +str(indx)] = pd.DataFrame(values[1], columns = ['key_word','score'])
        locals()["final_df_" +str(indx)] ['document_name'] = values[0]
        locals()["final_df_" +str(indx)] = locals()["final_df_" +str(indx)][['document_name','key_word','score']]
        temp.append(locals()["final_df_" +str(indx)])
        data = pd.concat(temp)
    data.reset_index(drop= "index" , inplace= True)    
    return data"""

'def topics_to_df(out_dict):\n    temp = []\n    for indx, values in enumerate(out_dict.items()):\n        locals()["final_df_" +str(indx)] = pd.DataFrame(values[1], columns = [\'key_word\',\'score\'])\n        locals()["final_df_" +str(indx)] [\'document_name\'] = values[0]\n        locals()["final_df_" +str(indx)] = locals()["final_df_" +str(indx)][[\'document_name\',\'key_word\',\'score\']]\n        temp.append(locals()["final_df_" +str(indx)])\n        data = pd.concat(temp)\n    data.reset_index(drop= "index" , inplace= True)    \n    return data'

In [19]:
def get_topics(final_data):
    """Run the full topic pipeline: preprocess, fit LDA, extract keywords.

    Parameters
    ----------
    final_data : dict
        Maps a document name to its text.

    Returns
    -------
    tuple
        ``(final_topics, topics)``: the per-document output of
        ``display_results`` and the raw per-document output of ``topic_model``.
    """
    raw_topics = topic_model(preprocess_topic_text(final_data))
    return display_results(raw_topics), raw_topics

In [20]:
# Folder whose documents are loaded for topic extraction.
# NOTE(review): absolute, machine-specific path - this cell will not run on
# another machine without editing it.
input_folder = 'C:/Users/OkeV/Documents/GitHub/nlp-exploration-notebooks/text_summarization'
# Flag forwarded to text_from_dir; presumably turns on text cleaning during
# loading - TODO confirm against etl.text_from_dir.
data_cleaning = True

# final_data is consumed as a dict by preprocess_topic_text (via .items());
# the logged output below suggests its keys are file names.
final_data = text_from_dir(input_folder, data_cleaning)
# final_topics: result of display_results (keywords per document);
# topics: raw result of topic_model.
final_topics, topics = get_topics(final_data)

Processing topics for 2020-11-16 GSA USAB meeting notes.docx
Topics extracted in 0.02 seconds
Coherence Score for 2020-11-16 GSA USAB meeting notes.docx: 0.34

Processing topics for 2021-08-23 Doris Paquin (Spectrum) meeting notes.docx
Topics extracted in 0.01 seconds
Coherence Score for 2021-08-23 Doris Paquin (Spectrum) meeting notes.docx: 0.23

Processing topics for ebidm-dsai meeting notes 2022-01-21.docx
Topics extracted in 0.02 seconds
Coherence Score for ebidm-dsai meeting notes 2022-01-21.docx: 0.25

Processing topics for ICT-ACR meeting notes 2020-11-12.docx
Topics extracted in 0.01 seconds
Coherence Score for ICT-ACR meeting notes 2020-11-12.docx: 0.25

Processing topics for 2021-10-15 ESD assumptions meeting notes.txt
Topics extracted in 0.01 seconds
Coherence Score for 2021-10-15 ESD assumptions meeting notes.txt: 0.32

Processing topics for 2022-06-15 Onyx demo from Curtis ONeil.txt
Topics extracted in 0.01 seconds
Coherence Score for 2022-06-15 Onyx demo from Curtis ONeil

#### Note to self:
- The keywords generated by the topic model could be merged with the summaries produced by the text-summarization step.

In [21]:
# Display the keyword list extracted for each document.
final_topics

{'2020-11-16 GSA USAB meeting notes.docx': ['work',
  'john',
  'accessibility',
  'acr',
  'format',
  'testing',
  'risk',
  'vpat',
  'acrs',
  'standard'],
 '2021-08-23 Doris Paquin (Spectrum) meeting notes.docx': ['device',
  'spectrum',
  'data',
  'doris',
  'ecd',
  'server',
  'topology',
  'alert',
  'ticket',
  'switch'],
 'ebidm-dsai meeting notes 2022-01-21.docx': ['br',
  'service',
  'time',
  'model',
  'loe',
  'phase',
  'data',
  'sastry',
  'value',
  'date'],
 'ICT-ACR meeting notes 2020-11-12.docx': ['report',
  'issue',
  'risk',
  'page',
  'format',
  'sc',
  'common',
  'need',
  'developer',
  'text'],
 '2021-10-15 ESD assumptions meeting notes.txt': ['ticket',
  'service',
  'hour',
  'desk',
  'incident',
  'time',
  'ecd',
  'email',
  'sr',
  'desc'],
 '2022-06-15 Onyx demo from Curtis ONeil.txt': ['template',
  'partner',
  'email',
  'agent',
  'ticket',
  'incident',
  'create',
  'fill',
  'description',
  'ci']}

In [None]:
#topics_df