# TM Extract Keywords

- Date: 2019/08/30
- Author: Daniel Hu (University of Melbourne)
- Description: Assist researchers in coding topics by highlighting each topic's keywords in every document.

# Step 1: Import Libraries and Collect Data
## (1) Import Packages and Prepare Stopwords

In [1]:
# Run in python console
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)



# NLTK Stop words
from nltk.corpus import stopwords
# Start from NLTK's English stop-word list (needs the 'stopwords' corpus;
# see the commented-out nltk.download call in the import section).
stop_words = stopwords.words('english')
# Add corpus-specific noise words; presumably these come from the newsgroup
# e-mail headers ("From:", "Subject:", "Re:", ".edu" addresses) -- confirm.
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

## (2) Import Data

In [2]:
# Import Dataset
# Load the newsgroups dataset from a remote JSON file into a DataFrame
# (this needs network access). The preview output lists the columns
# content, target and target_names.
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
# Show the first rows as the cell output.
df.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


# Step 2: Preprocessing
## (1) Remove Noisy Characters & Tokenize Words

In [3]:
def datafile_to_list(data_file):
    """Return the values of ``data_file.content`` as a plain Python list."""
    contents = data_file.content
    return contents.values.tolist()
    
def remove_noise(data):
    """Strip e-mail addresses, collapse whitespace and drop single quotes.

    Args:
        data: iterable of document strings.

    Returns:
        A new list with one cleaned string per input document, in the same
        order. The input strings themselves are not modified.
    """
    cleaned = []
    for doc in data:
        # Remove e-mail addresses (anything around an '@', plus one trailing
        # whitespace character). Raw strings keep '\S' and '\s' from being
        # parsed as (invalid) string escapes.
        doc = re.sub(r'\S*@\S*\s?', '', doc)

        # Collapse newlines and any other whitespace runs into one space.
        doc = re.sub(r'\s+', ' ', doc)

        # Remove distracting single quotes; a literal replace needs no regex.
        cleaned.append(doc.replace("'", ""))

    return cleaned

def doc_to_words(docs):
    """Lazily yield one token list per document in ``docs``.

    Every document is converted with ``str()`` and tokenized by gensim's
    ``simple_preprocess`` with ``deacc=True``.
    """
    for raw_doc in docs:
        tokens = gensim.utils.simple_preprocess(str(raw_doc), deacc=True)
        yield tokens

# Pull the raw document strings out of the DataFrame.
data_list = datafile_to_list(df)
# Strip e-mail addresses, collapse whitespace and drop single quotes.
data = remove_noise(data_list)
# Tokenize every document; the generator is materialized into a list
# holding one token list per document.
data_words = list(doc_to_words(data))

## (2) Remove Stopwords, Make Bigrams and Lemmatize

In [4]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is first passed through ``simple_preprocess(str(doc))``
    (unchanged from the previous behaviour) and the resulting tokens are
    filtered against the module-level ``stop_words``.

    Args:
        texts: iterable of documents (token lists or strings).

    Returns:
        A list with one list of remaining tokens per document.
    """
    # Build the lookup set once per call: set membership is O(1), whereas
    # scanning the stop_words list for every single word is O(len(list)).
    blocked = frozenset(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

def make_bigrams(texts):
    """Run every token list in ``texts`` through the module-level ``bigram_mod``.

    Returns a list with one transformed document per input document.
    """
    phrased = []
    for tokens in texts:
        phrased.append(bigram_mod[tokens])
    return phrased

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every token list.

    NOTE(review): this relies on a module-level ``trigram_mod`` that is not
    defined in the visible part of the notebook -- confirm it exists before
    calling this helper.
    """
    merged = []
    for tokens in texts:
        merged.append(trigram_mod[bigram_mod[tokens]])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document is joined into a single space-separated string, processed
    by the module-level spaCy pipeline ``nlp``, and reduced to the lemmas of
    the tokens whose ``pos_`` tag is in ``allowed_postags``.

    Args:
        texts: iterable of token sequences (one sequence per document).
        allowed_postags: container of ``token.pos_`` values to keep. The
            default is an immutable tuple so the shared default can never
            be mutated between calls; lists passed by callers still work.

    Returns:
        A list with one list of lemma strings per input document.

    See https://spacy.io/api/annotation for the tag definitions.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Build the bigram model (no trigram model is built in this cell).
# The phrase model is trained on data_words, i.e. before stop-word removal.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.

# Faster way to get a sentence clubbed as a bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams (applied to the stop-word-free tokens)
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
# NOTE(review): the 'en' shortcut name may not be available in newer spaCy
# releases -- confirm against the installed version.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

## (3) Create the Dictionary and Corpus needed for Topic Modeling

In [5]:
# Create Dictionary (built from the lemmatized documents)
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus (texts is just another name for the lemmatized documents)
texts = data_lemmatized

# Term Document Frequency: convert every document with id2word.doc2bow
corpus = [id2word.doc2bow(text) for text in texts]

# Step 3: Building Topic Model
## (1) Build the LDA Topic Model & View the Topics

In [6]:
# Build LDA model
# Trains a 20-topic model on the bag-of-words corpus; random_state is fixed,
# presumably so that reruns give the same topics -- confirm.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# Pretty-print the topics with their weighted keywords.
pprint(lda_model.print_topics())

[(0,
  '0.141*"team" + 0.140*"game" + 0.081*"play" + 0.080*"sale" + 0.033*"nhl" + '
  '0.029*"trade" + 0.029*"cd" + 0.018*"ice" + 0.014*"detroit" + 0.014*"joe"'),
 (1,
  '0.086*"pin" + 0.044*"processor" + 0.044*"character" + 0.040*"font" + '
  '0.034*"mirror" + 0.018*"radius" + 0.018*"quran" + 0.017*"stephen" + '
  '0.014*"ford" + 0.012*"alot"'),
 (2,
  '0.048*"notice" + 0.040*"material" + 0.037*"signal" + 0.037*"external" + '
  '0.030*"circuit" + 0.022*"case_western" + 0.022*"reserve_university" + '
  '0.021*"oil" + 0.018*"charle" + 0.016*"william"'),
 (3,
  '0.054*"not" + 0.034*"do" + 0.028*"would" + 0.026*"be" + 0.021*"say" + '
  '0.020*"think" + 0.019*"know" + 0.017*"go" + 0.016*"people" + 0.015*"get"'),
 (4,
  '0.084*"library" + 0.061*"object" + 0.045*"cub" + 0.011*"static" + '
  '0.008*"compiler" + 0.008*"void" + 0.006*"borland" + 0.006*"bc" + '
  '0.003*"sps" + 0.001*"initialize"'),
 (5,
  '0.112*"israel" + 0.061*"israeli" + 0.045*"jew" + 0.038*"arab" + '
  '0.037*"jewish" + 0.0

## (2) Compute Model Perplexity and Coherence Score

In [7]:
# Compute Perplexity
# NOTE(review): gensim documents log_perplexity() as returning the per-word
# likelihood bound rather than the perplexity itself (perplexity = 2^(-bound)),
# so "lower the better" may not hold for this raw value -- confirm.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
# Uses the 'c_v' coherence measure over the lemmatized documents.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -14.150704488362106

Coherence Score:  0.4993186780826996


# Step 4: Simplify Analysis

## Decide Topic and Document to Analyze

In [8]:
# Position of the document to inspect; the following cells use it to index
# both the raw document list and the bag-of-words corpus.
doc_number = 11000

## Preprocessing
### (1) Data File to List & Remove Noise

In [9]:
# Fetch the raw text of the selected document.
doc_data = datafile_to_list(df)[doc_number]
# remove_noise works on a list of documents, so the single document is
# wrapped in a list; doc is therefore a one-element list.
doc = remove_noise([doc_data])
pprint(doc)

['From: (Brian Yamauchi) Subject: DC-X: Choice of a New Generation (was Re: '
 'SSRT Roll-Out Speech) Organization: Case Western Reserve University Lines: '
 '27 Distribution: world NNTP-Posting-Host: yuggoth.ces.cwru.edu In-reply-to: '
 'message of 21 Apr 1993 22:09:32 -0400 In article (Jordan Katz) writes: > '
 'Speech Delivered by Col. Simon P. Worden, > The Deputy for Technology, SDIO '
 '> > Most of you, as am I, are "children of the 1960s." We grew >up in an age '
 'of miracles -- Inter-Continental Ballistic Missiles, >nuclear energy, '
 'computers, flights to the moon. But these were >miracles of our parents '
 'doing. > Speech by Pete Worden > Delivered Before the U.S. Space Foundation '
 'Conference > Im embarrassed when my generation is compared with the last '
 '>generation -- the giants of the last great space era, the 1950s >and 1960s. '
 'They went to the moon - we built a telescope that >cant see straight. They '
 'soft-landed on Mars - the least we >could do is soft-lan

### (2) Split Sentences

In [10]:
# Generate the Sentence List
# doc is a one-element list, so doc[0] is the cleaned text of the document.
sent_list = sent_tokenize(doc[0])
pprint(sent_list)

['From: (Brian Yamauchi) Subject: DC-X: Choice of a New Generation (was Re: '
 'SSRT Roll-Out Speech) Organization: Case Western Reserve University Lines: '
 '27 Distribution: world NNTP-Posting-Host: yuggoth.ces.cwru.edu In-reply-to: '
 'message of 21 Apr 1993 22:09:32 -0400 In article (Jordan Katz) writes: > '
 'Speech Delivered by Col. Simon P. Worden, > The Deputy for Technology, SDIO '
 '> > Most of you, as am I, are "children of the 1960s."',
 'We grew >up in an age of miracles -- Inter-Continental Ballistic Missiles, '
 '>nuclear energy, computers, flights to the moon.',
 'But these were >miracles of our parents doing.',
 '> Speech by Pete Worden > Delivered Before the U.S. Space Foundation '
 'Conference > Im embarrassed when my generation is compared with the last '
 '>generation -- the giants of the last great space era, the 1950s >and 1960s.',
 'They went to the moon - we built a telescope that >cant see straight.',
 'They soft-landed on Mars - the least we >could do is soft

### (3) Generate Bigram List by Tokenized Words

In [11]:
# Tokenize the doc to tokens(words)
# doc holds a single document, so doc_words is a one-element list.
doc_words = list(doc_to_words(doc))

# Remove Stop Words
doc_words_nostops = remove_stopwords(doc_words)

# Form Bigrams
doc_words_bigrams = make_bigrams(doc_words_nostops)

# Generate Bigram List for this document
# Keep only the tokens containing '_' (the joined phrases such as
# 'case_western'); a phrase that occurs more than once appears more than once.
bigram_list = [bigram for bigram in doc_words_bigrams[0] if '_' in bigram]
pprint(bigram_list)

['brian_yamauchi',
 'case_western',
 'reserve_university',
 'distribution_world',
 'nntp_posting',
 'brian_yamauchi',
 'case_western',
 'reserve_university']


### (4) Check Topic Distribution for This Doc

In [12]:
print("\nThe topic distribution of this document is:")
# Request every topic (minimum_probability=0.0) for the selected document and
# sort the (topic id, probability) pairs by probability, highest first.
topic_list = sorted(lda_model.get_document_topics(corpus[doc_number],minimum_probability=0.0), key=lambda x: (x[1]), reverse=True)

# Iterate over the pairs themselves rather than a hard-coded range(20), so the
# loop stays correct if the model is rebuilt with a different num_topics.
for topic_id, probability in topic_list:
    print("Topic No.", topic_id, "\t", "%.2f" % probability)


The topic distribution of this document is:
Topic No. 17 	 0.22
Topic No. 14 	 0.14
Topic No. 18 	 0.14
Topic No. 3 	 0.13
Topic No. 9 	 0.12
Topic No. 10 	 0.11
Topic No. 2 	 0.05
Topic No. 16 	 0.03
Topic No. 8 	 0.01
Topic No. 13 	 0.01
Topic No. 6 	 0.01
Topic No. 0 	 0.00
Topic No. 11 	 0.00
Topic No. 15 	 0.00
Topic No. 5 	 0.00
Topic No. 19 	 0.00
Topic No. 12 	 0.00
Topic No. 1 	 0.00
Topic No. 7 	 0.00
Topic No. 4 	 0.00


### (5) Decide Topic, Generate Keyword List & Highlight the Keywords

In [14]:
def gen_keyword_list(topic_number, topn=20):
    """Return the top keywords of one topic of the module-level ``lda_model``.

    Args:
        topic_number: id of the topic to query.
        topn: how many keywords to return (default 20, the value that was
            previously hard-coded).

    Returns:
        A list of keyword strings in the order given by
        ``lda_model.show_topic``; the weights are discarded.
    """
    return [word for word, _weight in lda_model.show_topic(topic_number, topn=topn)]

topic_number = 17

keyword_list = gen_keyword_list(topic_number)
print("Keywords of topic", topic_number)
pprint(keyword_list)

# Sets give O(1) membership tests inside the token loop.
keyword_set = set(keyword_list)
bigram_set = set(bigram_list)
stop_set = set(stop_words)

# Lower-cased tokens to print in red. The list is shared by all sentences, so
# a word found in an earlier sentence is also highlighted in later ones.
highlight_list = []
for i, sent in enumerate(sent_list):
    # Previous non-stop-word token of this sentence. The document's bigrams
    # were formed after stop-word removal, so the partner of a token is the
    # previous *kept* token, not simply tokens[j-1] (which may be a stop word).
    prev_token = None
    tokens = gensim.utils.simple_preprocess(str(sent), deacc=False)
    for token in tokens:
        # Skip the stop_words
        if token in stop_set:
            continue

        # Check Bigram: the pair must be a bigram of this document and a
        # keyword of the chosen topic; both halves are then highlighted.
        if prev_token is not None:
            to_bigram = prev_token + '_' + token
            if to_bigram in bigram_set and to_bigram in keyword_set:
                for part in (prev_token, token):
                    if part not in highlight_list:
                        highlight_list.append(part)
        prev_token = token

        lemma_list = lemmatization([[token]], allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

        # Highlight the surface form when its lemma is a keyword. The
        # duplicate check is on the token (the value actually stored), so an
        # inflected form is not skipped just because its lemma is in the list.
        if token not in highlight_list and any(lemma in keyword_set for lemma in lemma_list[0]):
            highlight_list.append(token)

    # Build the coloured line: bold header, bold red for highlighted words,
    # reset before every other word.
    pieces = ["\033[1;30mSentence " + str(i+1)]
    for word in word_tokenize(sent):
        colour = "\033[1;31m" if word.lower() in highlight_list else "\033[0m"
        pieces.append(colour + word)
    # A final reset keeps a trailing highlight from bleeding into later output.
    print(' '.join(pieces) + "\033[0m")

Keywords of topic 17
['line',
 'organization',
 'write',
 'article',
 'university',
 'host',
 'reply',
 'nntp_poste',
 'thank',
 'nntp_posting',
 'get',
 'anyone',
 'post',
 'mail',
 'help',
 'win',
 'good',
 'look',
 'distribution_world',
 'news']
[1;30mSentence 1 [0mFrom [0m: [0m( [0mBrian [0mYamauchi [0m) [0mSubject [0m: [0mDC-X [0m: [0mChoice [0mof [0ma [0mNew [0mGeneration [0m( [0mwas [0mRe [0m: [0mSSRT [0mRoll-Out [0mSpeech [0m) [1;31mOrganization [0m: [0mCase [0mWestern [0mReserve [1;31mUniversity [1;31mLines [0m: [0m27 [1;31mDistribution [0m: [1;31mworld [0mNNTP-Posting-Host [0m: [0myuggoth.ces.cwru.edu [0mIn-reply-to [0m: [0mmessage [0mof [0m21 [0mApr [0m1993 [0m22:09:32 [0m-0400 [0mIn [1;31marticle [0m( [0mJordan [0mKatz [0m) [1;31mwrites [0m: [0m> [0mSpeech [0mDelivered [0mby [0mCol. [0mSimon [0mP. [0mWorden [0m, [0m> [0mThe [0mDeputy [0mfor [0mTechnology [0m, [0mSDIO [0m> [0m> [0mMost [0mof [0myou 