# Topic Modeling

In [1]:
import pandas as pd
from gensim import corpora, models, similarities
from pprint import pprint
import numpy as np

In [2]:
data = pd.read_csv('data_science_jobs_USA.csv')

In [3]:
data.head(2)

Unnamed: 0,company,date,job_description,job_title,job_url,location,salary
0,Workplace Alaska,10 days ago,This individual position is EXEMPT from the hi...,Research Analyst III,http://www.indeed.com/rc/clk?jk=42399517a00f67...,"Juneau, AK","$5,017 a month"
1,Lili‘uokalani Trust,30+ days ago,"Job Title: Manager, Data Science Reports to: D...","Manager, Data Science",http://www.indeed.com/rc/clk?jk=bd079f6b150eb0...,"Honolulu, HI",


In [4]:
# Removing jobs that have no job description

data = data[data.job_description.notnull()]

In [5]:
documents = data.job_description

### Plotting documents in 2D using SVD (sklearn)

In [6]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

In [7]:
# Plotting documents

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1, max_df=0.5, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\\\\-][a-zA-Z\\\\-]{2,}')
data_vectorized = vectorizer.fit_transform(documents)


svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)
 
df = pd.DataFrame(columns=['x', 'y', 'document'])
df['x'], df['y'], df['document'] = documents_2d[:,0], documents_2d[:,1], range(len(documents))
 
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=800, plot_height=800)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [8]:
# Plotting words

svd = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(data_vectorized.T)
 
df = pd.DataFrame(columns=['x', 'y', 'word'])
df['x'], df['y'], df['word'] = words_2d[:,0], words_2d[:,1], vectorizer.get_feature_names()
 
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=800, plot_height=800)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8, color="plum")
plot.add_layout(labels)
show(plot, notebook_handle=True)

### Data preprocesing (gensim)

In [9]:
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
import re

# Removing stopwords
# Removing words that are less than 3 letters long
# Removing numbers and punctuation
# Tokenizing the documents


stoplist = set('for and'.split())
texts = [[word for word in remove_stopwords(re.sub(r"[,.;@#?!&$/()*_'’:]+|[0-9]+", " ", document)).lower().split() 
          if word not in stoplist
          and len(word)>2] 
         for document in documents]

# Removing words that appear only once

from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

pprint(texts[0][:10])

['this',
 'individual',
 'position',
 'exempt',
 'hiring',
 'restrictions',
 'qualified',
 'applicants',
 'encouraged',
 'apply']


In [10]:
# Building a dictionary - linking words to numeric ids

dictionary = corpora.Dictionary(texts)
dictionary.save("dictionary") # Saving the dictionary to get reprobucible results

dictionary = corpora.Dictionary.load("dictionary")
print(dictionary)

Dictionary(47666 unique tokens: ['abilities', 'ability', 'able', 'academic', 'accommodation']...)


In [11]:
# Building a corpus - transforming the collection of texts to a numerical form

corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus[0][:10])

[(0, 3), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]


### Building models (gensim)

#### LSI

In [12]:
nt= 7

In [13]:
lsi_model = models.LsiModel(corpus=corpus, num_topics=nt, id2word=dictionary)
    
for idx in range(nt):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), lsi_model.print_topic(idx, 10))
    print("-"*10)

Topic #1: 0.648*"data" + 0.319*"experience" + 0.158*"business" + 0.154*"work" + 0.135*"team" + 0.109*"skills" + 0.106*"science" + 0.098*"years" + 0.095*"analytics" + 0.093*"learning"
----------
Topic #2: -0.681*"data" + 0.263*"research" + 0.222*"experience" + 0.148*"work" + 0.132*"learning" + 0.105*"required" + 0.104*"development" + 0.103*"skills" + 0.100*"machine" + 0.093*"position"
----------
Topic #3: -0.418*"learning" + -0.343*"machine" + 0.253*"research" + -0.228*"experience" + -0.199*"you" + 0.145*"analysis" + 0.131*"information" + 0.129*"required" + 0.123*"ability" + 0.120*"data"
----------
Topic #4: 0.425*"experience" + -0.309*"learning" + -0.253*"business" + -0.252*"research" + -0.237*"machine" + 0.171*"systems" + -0.168*"analytics" + 0.141*"software" + -0.138*"science" + 0.123*"years"
----------
Topic #5: -0.445*"business" + 0.352*"research" + 0.248*"learning" + 0.201*"machine" + 0.181*"data" + -0.180*"team" + -0.158*"analytics" + -0.138*"solutions" + 0.135*"experience" + -0.

#### LDA

In [14]:
lda_model = models.LdaModel(corpus=corpus, num_topics=nt, id2word=dictionary, 
                           random_state=np.random.RandomState(1))
    
for idx in range(nt):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), lda_model.print_topic(idx, 10))
    print("-"*10)

Topic #1: 0.022*"data" + 0.013*"research" + 0.011*"experience" + 0.011*"business" + 0.010*"analysis" + 0.008*"skills" + 0.007*"work" + 0.007*"ability" + 0.007*"management" + 0.006*"team"
----------
Topic #2: 0.015*"experience" + 0.012*"data" + 0.010*"systems" + 0.009*"technical" + 0.008*"development" + 0.008*"work" + 0.008*"software" + 0.006*"support" + 0.006*"engineering" + 0.006*"design"
----------
Topic #3: 0.011*"research" + 0.011*"experience" + 0.010*"data" + 0.009*"information" + 0.008*"required" + 0.007*"position" + 0.007*"work" + 0.007*"the" + 0.006*"skills" + 0.005*"analysis"
----------
Topic #4: 0.057*"data" + 0.025*"experience" + 0.009*"business" + 0.008*"team" + 0.008*"science" + 0.007*"work" + 0.007*"analytics" + 0.007*"skills" + 0.007*"years" + 0.007*"solutions"
----------
Topic #5: 0.026*"data" + 0.013*"status" + 0.011*"experience" + 0.010*"work" + 0.009*"business" + 0.008*"analytics" + 0.007*"science" + 0.006*"team" + 0.006*"ability" + 0.006*"learning"
----------
Topic 

In [15]:
import pyLDAvis.gensim

vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

KeyboardInterrupt: 

#### HDP 

In [None]:
hdp_model = models.HdpModel(corpus=corpus, id2word=dictionary, 
                           random_state=np.random.RandomState(1))

#hdp_model.show_topics()
topic_info = hdp_model.print_topics(num_topics = -1, num_words=10)
print("Total number of topics detected: " + str(len(topic_info)))


print("10 most significant topics:")
for idx in range(10):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), hdp_model.print_topic(idx, 10))
    print("-"*10)

#### LSI + TfIdf

In [None]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [None]:
lsi_tfidf_model = models.LsiModel(corpus=corpus_tfidf, num_topics=nt, id2word=dictionary)    
    
for idx in range(nt):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), lsi_tfidf_model.print_topic(idx, 10))
    print("-"*10)

#### LDA + TfIdf

In [None]:
lda_tfidf_model = models.LdaModel(corpus=corpus_tfidf, num_topics=nt, id2word=dictionary, 
                                  random_state=np.random.RandomState(1))
    
for idx in range(nt):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), lda_tfidf_model.print_topic(idx, 10))
    print("-"*10)

In [None]:
vis = pyLDAvis.gensim.prepare(lda_tfidf_model, corpus_tfidf, dictionary)
pyLDAvis.display(vis)

#### HDP + TfIdf

In [None]:
hdp_tfidf_model = models.HdpModel(corpus=corpus_tfidf, id2word=dictionary, 
                                 random_state=np.random.RandomState(1))

#hdp_model.show_topics()
topic_info = hdp_tfidf_model.print_topics(num_topics = -1, num_words=10)
print("Total number of topics detected: " + str(len(topic_info)))


print("10 most significant topics:")
for idx in range(10):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), hdp_tfidf_model.print_topic(idx, 10))
    print("-"*10)

#### LSI + Bigrams

In [None]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

bigram = Phrases(texts, min_count=1, threshold=1, delimiter=b' ')
bigram_phraser = Phraser(bigram)

texts_b = [bigram_phraser[text] for text in texts]
pprint(texts_b[0][:10])

dictionary_b = corpora.Dictionary(texts_b)
dictionary_b.save("dictionary_b") # Saving the dictionary to get reprobucable results
dictionary_b = corpora.Dictionary.load("dictionary_b")

corpus_b = [dictionary_b.doc2bow(text) for text in texts_b]

In [None]:
lsi_model_b = models.LsiModel(corpus=corpus_b, num_topics=nt, id2word=dictionary_b)
    
for idx in range(nt):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), lsi_model_b.print_topic(idx, 10))
    print("-"*10)

#### LDA + Bigrams

In [None]:
lda_model_b = models.LdaModel(corpus=corpus_b, num_topics=nt, id2word=dictionary_b, 
                              random_state=np.random.RandomState(1))
    
for idx in range(nt):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), lda_model_b.print_topic(idx, 10))
    print("-"*10)

In [None]:
import pyLDAvis.gensim

vis = pyLDAvis.gensim.prepare(lda_model_b, corpus_b, dictionary_b)
pyLDAvis.display(vis)

#### HDP + Bigrams

In [None]:
hdp_model_b = models.HdpModel(corpus=corpus_b, id2word=dictionary_b, 
                             random_state=np.random.RandomState(1))

#hdp_model.show_topics()
topic_info = hdp_model_b.print_topics(num_topics = -1, num_words=10)
print("Total number of topics detected: " + str(len(topic_info)))


print("10 most significant topics:")
for idx in range(10):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), hdp_model_b.print_topic(idx, 10))
    print("-"*10)

### Testing the results with simiarity queries

#### Defining the query document

In [None]:
# Selecting a random document for query and testing the recommendations 

job_ad = data.sample(n=1, random_state = 1)

print('SELECTED DOCUMENT')
print(job_ad.company.values[0])
print(job_ad.job_title.values[0])
search = job_ad.job_description.values[0]
pprint(search)

search = [word for word in remove_stopwords(re.sub(r"[,.;@#?!&$/()*_'’:]+|[0-9]+", " ", search)).lower().split()] 

#### Building the query

In [None]:
from gensim import similarities

def query(model, corpus, search_text):
    index = similarities.MatrixSimilarity(model[corpus])
 
    # Performing a query and sorting results

    similar = index[model[search_text]]
    similar = sorted(enumerate(similar), key=lambda item: -item[1])
 
    # Showing most similar documents' numbers and similarity 
    print('Most similar documents')
    print(similar[:6])

    # Showing most similar documents (skipping the first 1 that is our sampled document)
    document_ids = [x[0] for x in similar[1:6]]

    pd.options.display.max_colwidth = 400
    return data.iloc[document_ids, [0, 3, 2]]

#### Testing the query with different models

In [None]:
# LSI

import warnings
warnings.filterwarnings('ignore')

bow = dictionary.doc2bow(search)

print('LSI topic distribution')
print(lsi_model[bow])

query(lsi_model, corpus, bow)

In [None]:
# LDA

print('LDA topic distribution')
print(lda_model[bow])

query(lda_model, corpus, bow)

In [None]:
# HDP

print('HDP topic distribution')
print(hdp_model[bow])

query(hdp_model, corpus, bow)

In [None]:
# LSI + TfIdf

bow_tfidf = tfidf[bow]

print('LSI TfIdf topic distribution')
print(lsi_tfidf_model[bow_tfidf])

query(lsi_tfidf_model, corpus_tfidf, bow_tfidf)

In [None]:
# LDA + TfIdf

print('LDA TfIdf topic distribution')
print(lda_tfidf_model[bow_tfidf])

query(lda_tfidf_model, corpus_tfidf, bow_tfidf)

In [None]:
# HDP + TfIdf

print('HDP TfIdf topic distribution')
print(hdp_tfidf_model[bow_tfidf])

query(hdp_tfidf_model, corpus_tfidf, bow_tfidf)

In [None]:
# LSI + Bigrams

bow_b = dictionary_b.doc2bow(search)

print('LSI(Bigrams) topic distribution')
print(lsi_model_b[bow_b])

query(lsi_model_b, corpus_b, bow_b)

In [None]:
# LDA + Bigrams

print('LSI topic distribution')
print(lda_model_b[bow_b])

query(lda_model_b, corpus_b, bow_b)

In [None]:
# LDA + Bigrams

print('LDA (Bigrams) topic distribution')
print(lda_model_b[bow_b])

query(lda_model_b, corpus_b, bow_b)

In [None]:
# HDP + Bigrams

print('HDP (Bigrams) topic distribution')
print(hdp_model_b[bow_b])

query(hdp_model_b, corpus_b, bow_b)