# Topic Modeling

In [1]:
import pandas as pd
from gensim import corpora, models, similarities
from pprint import pprint
import numpy as np

In [2]:
data = pd.read_csv('data_science_jobs_USA.csv')

In [3]:
data.head(2)

Unnamed: 0,company,date,job_description,job_title,job_url,location,salary
0,Workplace Alaska,10 days ago,This individual position is EXEMPT from the hi...,Research Analyst III,http://www.indeed.com/rc/clk?jk=42399517a00f67...,"Juneau, AK","$5,017 a month"
1,Lili‘uokalani Trust,30+ days ago,"Job Title: Manager, Data Science Reports to: D...","Manager, Data Science",http://www.indeed.com/rc/clk?jk=bd079f6b150eb0...,"Honolulu, HI",


In [4]:
# Removing jobs that have no job description

data = data[data.job_description.notnull()]

In [5]:
documents = data.job_description

### Plotting documents in 2D using SVD (sklearn)

In [6]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

In [7]:
# Plotting documents

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1, max_df=0.5, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\\\\-][a-zA-Z\\\\-]{2,}')
data_vectorized = vectorizer.fit_transform(documents)


svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)
 
df = pd.DataFrame(columns=['x', 'y', 'document'])
df['x'], df['y'], df['document'] = documents_2d[:,0], documents_2d[:,1], range(len(documents))
 
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=800, plot_height=800)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [8]:
# Plotting words

svd = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(data_vectorized.T)
 
df = pd.DataFrame(columns=['x', 'y', 'word'])
df['x'], df['y'], df['word'] = words_2d[:,0], words_2d[:,1], vectorizer.get_feature_names()
 
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=800, plot_height=800)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8, color="plum")
plot.add_layout(labels)
show(plot, notebook_handle=True)

### Data preprocesing (gensim)

In [9]:
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
import re

# Removing stopwords
# Removing words that are less than 3 letters long
# Removing numbers and punctuation
# Tokenizing the documents

stoplist = set('for and'.split())
texts = [[word for word in remove_stopwords(re.sub(r"[,.;@#?!&$/()*_'’:]+|[0-9]+", " ", document)).lower().split() 
          if word not in stoplist
          and len(word)>2] 
         for document in documents]

# Removing words that appear only once

from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

pprint(texts[0][:10])

['this',
 'individual',
 'position',
 'exempt',
 'hiring',
 'restrictions',
 'qualified',
 'applicants',
 'encouraged',
 'apply']


In [10]:
# Building a dictionary - linking words to numeric ids

dictionary = corpora.Dictionary(texts)
dictionary.save("dictionary") # Saving the dictionary to get reprobucible results

dictionary = corpora.Dictionary.load("dictionary")
print(dictionary)

Dictionary(47666 unique tokens: ['abilities', 'ability', 'able', 'academic', 'accommodation']...)


In [11]:
# Building a corpus - transforming the collection of texts to a numerical form

corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus[0][:10])

[(0, 3), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]


### Building models (gensim)

#### LSI

In [12]:
nt= 7

In [13]:
lsi_model = models.LsiModel(corpus=corpus, num_topics=nt, id2word=dictionary)
    
for idx in range(nt):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), lsi_model.print_topic(idx, 10))
    print("-"*10)

Topic #1: 0.648*"data" + 0.319*"experience" + 0.158*"business" + 0.154*"work" + 0.135*"team" + 0.109*"skills" + 0.106*"science" + 0.098*"years" + 0.095*"analytics" + 0.093*"learning"
----------
Topic #2: -0.681*"data" + 0.263*"research" + 0.222*"experience" + 0.148*"work" + 0.132*"learning" + 0.105*"required" + 0.104*"development" + 0.103*"skills" + 0.100*"machine" + 0.093*"position"
----------
Topic #3: -0.418*"learning" + -0.343*"machine" + 0.253*"research" + -0.228*"experience" + -0.199*"you" + 0.145*"analysis" + 0.131*"information" + 0.129*"required" + 0.123*"ability" + 0.120*"data"
----------
Topic #4: 0.425*"experience" + -0.309*"learning" + -0.253*"business" + -0.252*"research" + -0.237*"machine" + 0.171*"systems" + -0.168*"analytics" + 0.141*"software" + -0.138*"science" + 0.123*"years"
----------
Topic #5: -0.445*"business" + 0.352*"research" + 0.248*"learning" + 0.201*"machine" + 0.181*"data" + -0.180*"team" + -0.158*"analytics" + -0.138*"solutions" + 0.135*"experience" + -0.

#### LDA

In [14]:
lda_model = models.LdaModel(corpus=corpus, num_topics=nt, id2word=dictionary, 
                           random_state=np.random.RandomState(1))
    
for idx in range(nt):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), lda_model.print_topic(idx, 10))
    print("-"*10)

Topic #1: 0.022*"data" + 0.013*"research" + 0.011*"experience" + 0.011*"business" + 0.010*"analysis" + 0.008*"skills" + 0.007*"work" + 0.007*"ability" + 0.007*"management" + 0.006*"team"
----------
Topic #2: 0.015*"experience" + 0.012*"data" + 0.010*"systems" + 0.009*"technical" + 0.008*"development" + 0.008*"work" + 0.008*"software" + 0.006*"support" + 0.006*"engineering" + 0.006*"design"
----------
Topic #3: 0.011*"research" + 0.011*"experience" + 0.010*"data" + 0.009*"information" + 0.008*"required" + 0.007*"position" + 0.007*"work" + 0.007*"the" + 0.006*"skills" + 0.005*"analysis"
----------
Topic #4: 0.057*"data" + 0.025*"experience" + 0.009*"business" + 0.008*"team" + 0.008*"science" + 0.007*"work" + 0.007*"analytics" + 0.007*"skills" + 0.007*"years" + 0.007*"solutions"
----------
Topic #5: 0.026*"data" + 0.013*"status" + 0.011*"experience" + 0.010*"work" + 0.009*"business" + 0.008*"analytics" + 0.007*"science" + 0.006*"team" + 0.006*"ability" + 0.006*"learning"
----------
Topic 

In [15]:
import pyLDAvis.gensim

vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

#### HDP 

In [16]:
hdp_model = models.HdpModel(corpus=corpus, id2word=dictionary, 
                           random_state=np.random.RandomState(1))

#hdp_model.show_topics()
topic_info = hdp_model.print_topics(num_topics = -1, num_words=10)
print("Total number of topics detected: " + str(len(topic_info)))


print("10 most significant topics:")
for idx in range(10):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), hdp_model.print_topic(idx, 10))
    print("-"*10)

Total number of topics detected: 150
10 most significant topics:
Topic #1: 0.028*"data" + 0.016*"experience" + 0.008*"work" + 0.007*"business" + 0.007*"team" + 0.005*"skills" + 0.005*"learning" + 0.005*"science" + 0.005*"years" + 0.005*"research"
----------
Topic #2: 0.028*"data" + 0.016*"experience" + 0.007*"work" + 0.007*"research" + 0.006*"skills" + 0.006*"team" + 0.006*"business" + 0.005*"analysis" + 0.005*"science" + 0.005*"years"
----------
Topic #3: 0.029*"data" + 0.014*"experience" + 0.009*"work" + 0.007*"ability" + 0.007*"status" + 0.007*"business" + 0.007*"kpmg" + 0.006*"analytics" + 0.006*"years" + 0.006*"team"
----------
Topic #4: 0.023*"data" + 0.014*"experience" + 0.007*"team" + 0.006*"work" + 0.006*"business" + 0.005*"learning" + 0.005*"years" + 0.005*"skills" + 0.004*"you" + 0.004*"machine"
----------
Topic #5: 0.019*"data" + 0.011*"experience" + 0.009*"work" + 0.008*"skills" + 0.008*"ability" + 0.006*"required" + 0.006*"business" + 0.004*"team" + 0.004*"years" + 0.004*

#### LSI + TfIdf

In [17]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [18]:
lsi_tfidf_model = models.LsiModel(corpus=corpus_tfidf, num_topics=nt, id2word=dictionary)    
    
for idx in range(nt):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), lsi_tfidf_model.print_topic(idx, 10))
    print("-"*10)

Topic #1: 0.102*"learning" + 0.100*"machine" + 0.097*"you" + 0.090*"research" + 0.087*"kpmg" + 0.085*"analytics" + 0.082*"business" + 0.073*"big" + 0.069*"product" + 0.069*"status"
----------
Topic #2: -0.696*"kpmg" + -0.193*"lighthouse" + -0.151*"status" + -0.082*"creativity" + -0.080*"genetic" + -0.080*"military" + -0.074*"excellence" + -0.074*"firm" + -0.070*"unfavorable" + -0.070*"matriculation"
----------
Topic #3: -0.354*"one" + -0.316*"capital" + -0.166*"succeeding" + -0.151*"banking" + -0.117*"respect" + -0.102*"machine" + -0.100*"you" + -0.098*"kpmg" + -0.095*"card" + -0.090*"learning"
----------
Topic #4: 0.199*"research" + 0.172*"capital" + 0.170*"one" + -0.135*"big" + -0.129*"amazon" + -0.129*"aws" + -0.116*"engineer" + -0.103*"hadoop" + -0.103*"platform" + -0.100*"spark"
----------
Topic #5: -0.258*"learning" + -0.250*"machine" + -0.116*"algorithms" + -0.111*"models" + -0.108*"natural" + -0.108*"language" + -0.101*"deep" + -0.100*"you" + -0.099*"scientist" + 0.094*"aws"
--

#### LDA + TfIdf

In [19]:
lda_tfidf_model = models.LdaModel(corpus=corpus_tfidf, num_topics=nt, id2word=dictionary, 
                                  random_state=np.random.RandomState(1))
    
for idx in range(nt):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), lda_tfidf_model.print_topic(idx, 10))
    print("-"*10)

Topic #1: 0.002*"research" + 0.002*"business" + 0.002*"analytics" + 0.002*"statistical" + 0.001*"analysis" + 0.001*"required" + 0.001*"marketing" + 0.001*"management" + 0.001*"support" + 0.001*"analyst"
----------
Topic #2: 0.002*"infosys" + 0.002*"kore" + 0.002*"usinfosys" + 0.002*"cities" + 0.002*"azure" + 0.002*"across" + 0.002*"hvac" + 0.002*"title" + 0.002*"allen" + 0.001*"outsourcing"
----------
Topic #3: 0.004*"mapr" + 0.002*"sinai" + 0.002*"mount" + 0.002*"caci" + 0.001*"liberty" + 0.001*"mailchimp" + 0.001*"optum" + 0.001*"dell" + 0.001*"sigint" + 0.001*"announcement"
----------
Topic #4: 0.003*"learning" + 0.003*"machine" + 0.002*"google" + 0.002*"you" + 0.002*"algorithms" + 0.001*"capgemini" + 0.001*"deep" + 0.001*"models" + 0.001*"scientist" + 0.001*"software"
----------
Topic #5: 0.006*"kpmg" + 0.003*"status" + 0.003*"learning" + 0.003*"machine" + 0.002*"big" + 0.002*"you" + 0.002*"technologies" + 0.002*"lighthouse" + 0.001*"analytics" + 0.001*"cloud"
----------
Topic #6: 

In [20]:
vis = pyLDAvis.gensim.prepare(lda_tfidf_model, corpus_tfidf, dictionary)
pyLDAvis.display(vis)

#### HDP + TfIdf

In [21]:
hdp_tfidf_model = models.HdpModel(corpus=corpus_tfidf, id2word=dictionary, 
                                 random_state=np.random.RandomState(1))

#hdp_model.show_topics()
topic_info = hdp_tfidf_model.print_topics(num_topics = -1, num_words=10)
print("Total number of topics detected: " + str(len(topic_info)))


print("10 most significant topics:")
for idx in range(10):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), hdp_tfidf_model.print_topic(idx, 10))
    print("-"*10)

Total number of topics detected: 150
10 most significant topics:
Topic #1: 0.002*"learning" + 0.002*"research" + 0.001*"machine" + 0.001*"you" + 0.001*"business" + 0.001*"analytics" + 0.001*"product" + 0.001*"analysis" + 0.001*"statistical" + 0.001*"software"
----------
Topic #2: 0.001*"learning" + 0.001*"you" + 0.001*"machine" + 0.001*"research" + 0.001*"analytics" + 0.001*"business" + 0.001*"software" + 0.001*"big" + 0.001*"product" + 0.001*"systems"
----------
Topic #3: 0.001*"learning" + 0.001*"you" + 0.001*"machine" + 0.001*"research" + 0.001*"analytics" + 0.001*"business" + 0.001*"big" + 0.001*"software" + 0.001*"product" + 0.001*"solutions"
----------
Topic #4: 0.001*"machine" + 0.001*"learning" + 0.001*"research" + 0.000*"you" + 0.000*"analytics" + 0.000*"business" + 0.000*"statistical" + 0.000*"big" + 0.000*"kpmg" + 0.000*"models"
----------
Topic #5: 0.001*"learning" + 0.001*"research" + 0.001*"machine" + 0.000*"analytics" + 0.000*"you" + 0.000*"business" + 0.000*"big" + 0.00

#### LSI + Bigrams

In [22]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

bigram = Phrases(texts, min_count=1, threshold=1, delimiter=b' ')
bigram_phraser = Phraser(bigram)

texts_b = [bigram_phraser[text] for text in texts]
pprint(texts_b[0][:10])

dictionary_b = corpora.Dictionary(texts_b)
dictionary_b.save("dictionary_b") # Saving the dictionary to get reprobucable results
dictionary_b = corpora.Dictionary.load("dictionary_b")

corpus_b = [dictionary_b.doc2bow(text) for text in texts_b]

['this individual',
 'position exempt',
 'hiring',
 'restrictions',
 'qualified applicants',
 'encouraged apply',
 'this',
 'recruitment',
 'open',
 'alaska']


In [23]:
lsi_model_b = models.LsiModel(corpus=corpus_b, num_topics=nt, id2word=dictionary_b)
    
for idx in range(nt):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), lsi_model_b.print_topic(idx, 10))
    print("-"*10)

Topic #1: 0.602*"data" + 0.423*"experience" + 0.174*"work" + 0.166*"team" + 0.159*"business" + 0.119*"machine learning" + 0.114*"data science" + 0.114*"support" + 0.113*"solutions" + 0.105*"research"
----------
Topic #2: -0.754*"data" + 0.417*"experience" + 0.266*"machine learning" + 0.122*"work" + 0.122*"team" + 0.110*"research" + 0.088*"data science" + 0.073*"development" + 0.067*"business" + 0.067*"systems"
----------
Topic #3: -0.521*"machine learning" + 0.429*"research" + -0.249*"data science" + 0.215*"support" + 0.204*"analysis" + 0.152*"required" + 0.141*"management" + -0.135*"you" + 0.131*"information" + -0.115*"team"
----------
Topic #4: -0.650*"experience" + 0.358*"research" + 0.275*"machine learning" + 0.260*"data science" + 0.241*"business" + 0.195*"team" + -0.125*"big data" + 0.104*"analytics" + 0.102*"analysis" + 0.089*"you"
----------
Topic #5: 0.545*"research" + -0.443*"business" + 0.399*"machine learning" + -0.249*"analytics" + -0.167*"data science" + 0.149*"data" + -0

#### LDA + Bigrams

In [24]:
lda_model_b = models.LdaModel(corpus=corpus_b, num_topics=nt, id2word=dictionary_b, 
                              random_state=np.random.RandomState(1))
    
for idx in range(nt):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), lda_model_b.print_topic(idx, 10))
    print("-"*10)

Topic #1: 0.019*"data" + 0.011*"business" + 0.011*"experience" + 0.007*"analytics" + 0.007*"data science" + 0.006*"work" + 0.006*"analysis" + 0.005*"team" + 0.004*"support" + 0.004*"research"
----------
Topic #2: 0.014*"experience" + 0.011*"data" + 0.005*"work" + 0.005*"support" + 0.004*"required" + 0.004*"analysis" + 0.004*"management" + 0.004*"development" + 0.004*"knowledge" + 0.003*"technical"
----------
Topic #3: 0.027*"data" + 0.021*"experience" + 0.008*"team" + 0.007*"big data" + 0.006*"work" + 0.006*"solutions" + 0.005*"development" + 0.004*"systems" + 0.004*"you" + 0.004*"business"
----------
Topic #4: 0.012*"experience" + 0.007*"data" + 0.006*"team" + 0.006*"work" + 0.005*"data center" + 0.005*"systems" + 0.004*"solutions" + 0.004*"support" + 0.004*"design" + 0.004*"business"
----------
Topic #5: 0.010*"experience" + 0.010*"data" + 0.006*"big data" + 0.005*"machine learning" + 0.005*"research" + 0.004*"data science" + 0.004*"years experience" + 0.004*"business" + 0.003*"work"

In [25]:
import pyLDAvis.gensim

vis = pyLDAvis.gensim.prepare(lda_model_b, corpus_b, dictionary_b)
pyLDAvis.display(vis)

#### HDP + Bigrams

In [26]:
hdp_model_b = models.HdpModel(corpus=corpus_b, id2word=dictionary_b, 
                             random_state=np.random.RandomState(1))

#hdp_model.show_topics()
topic_info = hdp_model_b.print_topics(num_topics = -1, num_words=10)
print("Total number of topics detected: " + str(len(topic_info)))


print("10 most significant topics:")
for idx in range(10):
    # Print the first 10 most representative words in each topic
    print("Topic #%s:" % (idx+1), hdp_model_b.print_topic(idx, 10))
    print("-"*10)

Total number of topics detected: 150
10 most significant topics:
Topic #1: 0.017*"data" + 0.015*"experience" + 0.006*"team" + 0.006*"work" + 0.005*"business" + 0.005*"machine learning" + 0.004*"data science" + 0.004*"research" + 0.004*"solutions" + 0.004*"support"
----------
Topic #2: 0.016*"data" + 0.014*"experience" + 0.006*"work" + 0.005*"team" + 0.005*"business" + 0.005*"data science" + 0.005*"machine learning" + 0.004*"research" + 0.004*"support" + 0.004*"analytics"
----------
Topic #3: 0.014*"data" + 0.011*"experience" + 0.006*"work" + 0.005*"team" + 0.005*"business" + 0.004*"machine learning" + 0.004*"the" + 0.004*"skills" + 0.004*"research" + 0.004*"analytics"
----------
Topic #4: 0.015*"data" + 0.014*"experience" + 0.006*"work" + 0.006*"team" + 0.005*"research" + 0.004*"business" + 0.004*"analysis" + 0.004*"skills" + 0.003*"including" + 0.003*"machine learning"
----------
Topic #5: 0.014*"experience" + 0.014*"data" + 0.009*"machine learning" + 0.007*"years experience" + 0.004*

### Testing the results with simiarity queries

#### Defining the query document

In [27]:
# Selecting a random document for query and testing the recommendations 

job_ad = data.sample(n=1, random_state = 1)

print('SELECTED DOCUMENT')
print(job_ad.company.values[0])
print(job_ad.job_title.values[0])
search = job_ad.job_description.values[0]
pprint(search)

search = [word for word in remove_stopwords(re.sub(r"[,.;@#?!&$/()*_'’:]+|[0-9]+", " ", search)).lower().split()] 

SELECTED DOCUMENT
Aetna
Sr Data Engineer
('POSITION SUMMARY\n'
 'The Sr. Data Engineer will be responsible for developing and managing our '
 'big data environment. This individual will work with the data engineering '
 'team and be the thought leader and head guru for our data environment.\n'
 '\n'
 'Fundamental Components:\n'
 'Develops large scale data structures and pipelines to organize, collect and '
 'standardize data that helps generate insights and addresses reporting '
 'needs.\n'
 'Collaborates with other data teams to transform data and integrate '
 'algorithms and models into automated processes.\n'
 'Uses knowledge in Hadoop architecture, HDFS commands and experience '
 'designing & optimizing queries to build data pipelines.\n'
 'Uses strong programming skills in Python, Java or any of the major languages '
 'to build robust data pipelines and dynamic systems.\n'
 'Builds data marts and data models to support Data Science and other internal '
 'customers.\n'
 'Analyzes c

#### Building the query

In [28]:
from gensim import similarities

def query(model, corpus, search_text):
    index = similarities.MatrixSimilarity(model[corpus])
 
    # Performing a query and sorting results

    similar = index[model[search_text]]
    similar = sorted(enumerate(similar), key=lambda item: -item[1])
 
    # Showing most similar documents' numbers and similarity 
    print('Most similar documents')
    print(similar[:6])

    # Showing most similar documents (skipping the first 1 that is our sampled document)
    document_ids = [x[0] for x in similar[1:6]]

    pd.options.display.max_colwidth = 400
    return data.iloc[document_ids, [0, 3, 2]]

#### Testing the query with different models

In [29]:
# LSI

import warnings
warnings.filterwarnings('ignore')

bow = dictionary.doc2bow(search)

print('LSI topic distribution')
print(lsi_model[bow])

query(lsi_model, corpus, bow)

LSI topic distribution
[(0, 23.379401492012587), (1, -7.162298309638588), (2, 2.404254380884587), (3, 3.1458517571369833), (4, 2.6672593673588954), (5, -2.24488047404537), (6, 0.2481443713127864)]
Most similar documents
[(14643, 1.0000001), (14601, 0.9999706), (532, 0.99843967), (12615, 0.99665976), (12596, 0.9961671), (14611, 0.99577427)]


Unnamed: 0,company,job_title,job_description
17692,Aetna,Sr Data Engineer,"Desired: JavaHadoopPythonPOSITION SUMMARY\nThe Sr. Data Engineer will be responsible for developing and managing our big data environment. This individual will work with the data engineering team and be the thought leader and head guru for our data environment.\n\nFundamental Components:\nDevelops large scale data structures and pipelines to organize, collect and standardize data that helps ge..."
661,Dupaco Community Credit Union,Data Engineer,"At Dupaco Community Credit Union, we are a not-for-profit; member-owned financial cooperative that helps our members save money—through lower loan rates, fewer service fees, low-cost insurance, the list goes on and on. As an employee at Dupaco, you’ll be part of an interactive team that believes that by working together, we create better solutions. Outside our branches, employees have the oppo..."
15303,Cox Automotive,Data Engineer I,"Desired: Statistical SoftwareBusiness IntelligenceMicrosoft SQL ServerDesign ExperienceSQLDescription\n\nJob Purpose\nThe Data Engineer I is responsible for processing structured and unstructured data, validating data quality, and developing and supporting data products. The Data Engineer also plays a role in Agile planning, providing advice and guidance, and monitoring emerging technologies.\..."
15281,Cox Automotive,Data Engineer I,"Description\n\nJob Purpose\nThe Data Engineer I is responsible for processing structured and unstructured data, validating data quality, and developing and supporting data products. The Data Engineer also plays a role in Agile planning, providing advice and guidance, and monitoring emerging technologies.\n\nKey Responsibilities\nGathers and processes raw, structured, semi-structured, and unstr..."
17702,Aetna,Lead Data Engineer,"POSITION SUMMARY\nManages and responsible for successful delivery of large scale data structures and Pipelines and efficient Extract/Load/Transform (ETL) workflows. Acts as the data engineering team lead for large and complex projects involving multiple resources and tasks, providing individual mentoring in support of company objectives.\n\nFundamental Components:\nDesigns and develops complex..."


In [30]:
# LDA

print('LDA topic distribution')
print(lda_model[bow])

query(lda_model, corpus, bow)

LDA topic distribution
[(2, 0.41136187), (3, 0.57700545)]
Most similar documents
[(14643, 1.0), (4287, 0.9999317), (14601, 0.9998753), (7444, 0.9995118), (7638, 0.9986998), (21283, 0.9981674)]


Unnamed: 0,company,job_title,job_description
5195,E-InfoSol LLC,Data Scientist,"We are seeking a Data Scientist who is excited about analyzing and solving national security related problems. In this role you will be empowered to:Work directly with analysts to exploit large volumes of dataIdentify important and interesting questions surrounding our customer's challenges, then translate those questions into concrete analytical tasksDevelop tools and process to help uncover ..."
17692,Aetna,Sr Data Engineer,"Desired: JavaHadoopPythonPOSITION SUMMARY\nThe Sr. Data Engineer will be responsible for developing and managing our big data environment. This individual will work with the data engineering team and be the thought leader and head guru for our data environment.\n\nFundamental Components:\nDevelops large scale data structures and pipelines to organize, collect and standardize data that helps ge..."
9084,Advantine Technologies,Business Analyst/Data Analyst,"Job Description\n\nTitle : Business Analyst/Data Analyst\nLocation : Redmond, WA\nNature of employment : Full time permanent OR contract is also fine\nType of position : Managed Service\n\nDescription:\nVery good PBI skills – not just technical, but requirements gathering, visualization design etc\nGood data analysis skills : strong SQL, insights generation, business problem understanding\nGoo..."
9355,Advantine Technologies,Business Analyst/Data Analyst,"Desired: Analysis SkillsSQLJob Description\n\nTitle : Business Analyst/Data Analyst\nLocation : Redmond, WA\nNature of employment : Full time permanent OR contract is also fine\nType of position : Managed Service\n\nDescription:\nVery good PBI skills – not just technical, but requirements gathering, visualization design etc\nGood data analysis skills : strong SQL, insights generation, business..."
25985,"Akraya, Inc.",Data Engineer : 18-03269,"Akraya is looking for a Data Engineer for one of our clients. If the job description below is a fit, please apply directly or call Swapnil at 408 907 3215.\n\nIf this position is not quite what you’re looking for, visit akraya.com and submit a copy of your resume. Our recruiters will get to work finding you a job that is a better match at one of our many clients.\n\nPrimary Skills: SQL,Shell s..."


In [31]:
# HDP

print('HDP topic distribution')
print(hdp_model[bow])

query(hdp_model, corpus, bow)

HDP topic distribution
[(0, 0.6038432608098631), (4, 0.3957622024560999)]
Most similar documents
[(5351, 1.0), (14643, 1.0), (16826, 0.99999887), (14601, 0.99999285), (3795, 0.9999778), (6817, 0.99997467)]


Unnamed: 0,company,job_title,job_description
17742,Aetna,Sr Data Engineer,"POSITION SUMMARY\nThe Sr. Data Engineer will be responsible for developing and managing our big data environment. This individual will work with the data engineering team and be the thought leader and head guru for our data environment.\n\nFundamental Components:\nDevelops large scale data structures and pipelines to organize, collect and standardize data that helps generate insights and addre..."
20393,Aetna,Data Engineer,"Desired: PigHiveHadoopJavaCassandraMySQLNoSQLShell ScriptingPythonPOSITION SUMMARY\nParticipates in the design, build and management of large scale data structures and pipelines and efficient Extract/Load/Transform (ETL) workflows.\n\nFundamental Components: Assists in the development of large scale data structures and pipelines to organize, collect and standardize data that helps generate ins..."
17692,Aetna,Sr Data Engineer,"Desired: JavaHadoopPythonPOSITION SUMMARY\nThe Sr. Data Engineer will be responsible for developing and managing our big data environment. This individual will work with the data engineering team and be the thought leader and head guru for our data environment.\n\nFundamental Components:\nDevelops large scale data structures and pipelines to organize, collect and standardize data that helps ge..."
4600,Amdex Corporation,Data Center Engineer Multiple Openings,"JOB NUMBER: 1276 | LOCATION: SPRINGFIELD, VA | CLEARANCE: CURRENT DHS SUITABILITY PREFERRED; MUST BE CLEARABLE\n\n\nAMDEX is a seasoned high-technology services firm specializing in data sciences and cloud solutions, application development and maintenance, program management, and health IT operations leveraging experience and trusted talent to solve the federal government’s most pressing busi..."
8355,Methodist Le Bonheur Healthcare,IT - Healthcare Information Data Scientist Job,"Summary\n\nFrom a staff position and under limited supervision, the Healthcare Information Data Scientist functions as a member of various multi-disciplanary teams producing innovative solutions driven by exploratory data analysis from complex and high-dimensional databases. The incumbent uses a flexible, analytical approach to design, develop, and evaluate predictive models and advanced algor..."


In [32]:
# LSI + TfIdf

bow_tfidf = tfidf[bow]

print('LSI TfIdf topic distribution')
print(lsi_tfidf_model[bow_tfidf])

query(lsi_tfidf_model, corpus_tfidf, bow_tfidf)

LSI TfIdf topic distribution
[(0, 0.2423767573148917), (1, 0.026115368500704835), (2, 0.017470663719945444), (3, -0.02711574884938205), (4, 0.04957284103564567), (5, -0.013731528125885262), (6, -0.006260854985042585)]
Most similar documents
[(14643, 1.0), (14601, 0.99999243), (16262, 0.996296), (17332, 0.996152), (10154, 0.9954431), (13485, 0.9943684)]


Unnamed: 0,company,job_title,job_description
17692,Aetna,Sr Data Engineer,"Desired: JavaHadoopPythonPOSITION SUMMARY\nThe Sr. Data Engineer will be responsible for developing and managing our big data environment. This individual will work with the data engineering team and be the thought leader and head guru for our data environment.\n\nFundamental Components:\nDevelops large scale data structures and pipelines to organize, collect and standardize data that helps ge..."
19669,Aetna,Lead Data Modeling Engineer,POSITION SUMMARY\nAetna Consumer Health & Products mandate is to transform health and wellness via products that enable consumer engagement and best-in-class experience. This team is responsible for Aetnas digital transformation and all digital products; as well as the development and execution of the digital roadmap. A key part of the responsibility is to build an integrated platform that ena...
21129,Aetna,Lead Data Modeling Engineer,Desired: PigHiveMachine LearningAzureMySQLHadoopKafkaScalaJavaCassandraNoSQLShell ScriptingPythonAWSPOSITION SUMMARY\nAetna Consumer Health & Products mandate is to transform health and wellness via products that enable consumer engagement and best-in-class experience. This team is responsible for Aetnas digital transformation and all digital products; as well as the development and execution ...
12297,TaosMountain,Data Center Site Reliability Engineer,"THIS IS NOT A REMOTE OPPORTUNITY / NO THIRD-PARTY VENDORS\n\nWho is Taos?\nTaos is an IT consulting and services company that offers expertise across the strategic, management and tactical layers of IT and engineering organizations. As part of the nation's IT landscape since 1989, we offer opportunities that will allow you to achieve your career goals and objectives.\n\nWe're changing the face..."
16339,xentity corporation,Data Analyst,Desired: LinuxXMLPerlRSQLJSONC#Software DevelopmentInformaticaMATLABGISPHPSVNJavaScriptRubyPostGISHadoopJavaFTPArcGISOur Government high-profile client is in the second year of investing in a revolutionary way of using open data to promote economic development in the state of Colorado. We are providing data analyst and data wrangling support for their initiative. We are responsible for helping...


In [33]:
# LDA + TfIdf

print('LDA TfIdf topic distribution')
print(lda_tfidf_model[bow_tfidf])

query(lda_tfidf_model, corpus_tfidf, bow_tfidf)

LDA TfIdf topic distribution
[(0, 0.9289356), (1, 0.010748733), (2, 0.010845349), (3, 0.01090914), (4, 0.017038187), (5, 0.0107729845), (6, 0.010750058)]
Most similar documents
[(14601, 1.0), (14643, 1.0), (12591, 0.99999994), (13408, 0.9999999), (6430, 0.99999976), (12666, 0.9999997)]


Unnamed: 0,company,job_title,job_description
17742,Aetna,Sr Data Engineer,"POSITION SUMMARY\nThe Sr. Data Engineer will be responsible for developing and managing our big data environment. This individual will work with the data engineering team and be the thought leader and head guru for our data environment.\n\nFundamental Components:\nDevelops large scale data structures and pipelines to organize, collect and standardize data that helps generate insights and addre..."
15276,Eli Lilly,Consultant Statistician-Computation,"The Statistical Analyst will coordinate and manage all aspects of statistical analysis of clinical trial data in drug development through innovative strategies, close collaboration with medical colleagues, and quality oversight of external work products.\n\nThe primary focus of this role will be to assist with the global implementation of CDISC standards (SDTM and ADaM) and strategic projects ..."
16245,RAYMOND JAMES FINANCIAL,Data Engineer,"Description\n\nABOUT US:\n\nFounded in 1962, Raymond James Financial, Inc. is a Fortune 500 diversified holding company providing financial services to individuals, corporations and municipalities through its subsidiary companies engaged primarily in investment and financial planning, in addition to capital markets and asset management. Headquartered in Florida, Raymond James Financial has app..."
7898,The University of Chicago,Policy Research Analyst,"Job Information\nJob Summary:\n\nCrime Lab New York is seeking a research analyst to work on our portfolio of projects applying statistical inference, econometrics, and machine learning to pressing challenges in public policy and criminal justice. We’re seeking a smart, motivated, and detail-oriented person who is qualified to work on all aspects of a project – ranging from raw data cleaning a..."
15364,The University of Chicago,Policy Analyst (Data Scientist),"Job Information\nJob Summary:\n\nCrime Lab New York is seeking a data scientist to work on our portfolio of projects applying statistical inference, econometrics, and machine learning to pressing challenges in public policy and criminal justice. We’re seeking a smart, motivated, and detail-oriented person who is qualified to work on all aspects of a project – ranging from raw data cleaning and..."


In [34]:
# HDP + TfIdf

print('HDP TfIdf topic distribution')
print(hdp_tfidf_model[bow_tfidf])

query(hdp_tfidf_model, corpus_tfidf, bow_tfidf)

HDP TfIdf topic distribution
[(0, 0.9424892532954624), (1, 0.019477386126806474), (2, 0.013706904398310998)]
Most similar documents
[(588, 1.0000001), (823, 1.0000001), (903, 1.0000001), (1728, 1.0000001), (2240, 1.0000001), (2247, 1.0000001)]


Unnamed: 0,company,job_title,job_description
1024,UnitedHealth Group,Sr Principal Data Scientist,Desired: Machine LearningExperimental DesignRNatural Language ProcessingPythonWe are seeking an experienced Manager in Data Science to drive new insights and lead research efforts. You will work closely with our expert domain scientists and business leaders to identify appropriate research questions and translate them into actionable analyses and algorithms. You will apply advanced statistical...
1133,Applecart,"Data Analyst, Commercial Solutions","Applecart is a data science startup that deploys proprietary technology to run smarter advertising campaigns. We work with some of the nation’s most prominent corporations, non-profit organizations and political candidates to activate and communicate with key target audiences at a scale and level of efficacy previously thought impossible. Our core offering is a proprietary social graph that le..."
2050,Social Standards,Sr. Research Analyst/Manager,Why are face mask sales exploding? Which craft beer brands are gaining popularity in the Midwest? What are consumers looking for in vodka? Is hyaluronic acid the next big thing in beauty? Is Kombucha here to stay? A businesses’ ability to understand the answers to these questions before it’s too late is the key to whether they succeed or fail.Here’s the good news: The answers to these question...
2664,KPMG,"Senior Associate, Big Data Software Engineer","Innovate. Collaborate. Shine. Lighthouse — KPMG's Center of Excellence for Advanced Analytics — has both applied data science, AI, and big data architecture capabilities. Here, you’ll work with a diverse team of sophisticated data and analytics professionals to explore the solutions for clients in a platform-diverse environment. This means your ability to find answers is limited only by your c..."
2671,KPMG,"Machine Learning Engineer, Senior Associate","Innovate. Collaborate. Shine. Lighthouse — KPMG's Center of Excellence for Advanced Analytics — has both applied data science, AI, and big data architecture capabilities. Here, you’ll work with a diverse team of sophisticated data and analytics professionals to explore solutions for clients in a platform-diverse environment. This means your ability to find answers is limited only by your creat..."


In [35]:
# LSI + Bigrams

bow_b = dictionary_b.doc2bow(search)

print('LSI(Bigrams) topic distribution')
print(lsi_model_b[bow_b])

query(lsi_model_b, corpus_b, bow_b)

LSI(Bigrams) topic distribution
[(0, 21.486519217339776), (1, -10.023405915781249), (2, 0.028935065477965486), (3, -2.3196988833080003), (4, 2.034984720715799), (5, 1.6815432389825908), (6, -0.9215057734557258)]
Most similar documents
[(16421, 0.9980832), (16730, 0.9980832), (15802, 0.99801457), (21605, 0.99758554), (21581, 0.9975737), (6329, 0.9971151)]


Unnamed: 0,company,job_title,job_description
20259,Flywire Corporation,Data Engineer,"We, at Flywire, are looking for a smart, analytical thinker who's excited to empower data-driven decision making at an exciting and fast-growing organization! As our Data Engineer, you will work within the Data Analytics team to ensure that our organization has access to reliable, accurate, and timely data to be used in various reporting, business intelligence, and analytical solutions. Great ..."
19153,Flywire Corporation,Data Engineer,"Desired: Business IntelligenceDatabase AdministrationSparkPythonAWSTableauApacheWe, at Flywire, are looking for a smart, analytical thinker who's excited to empower data-driven decision making at an exciting and fast-growing organization! As our Data Engineer, you will work within the Data Analytics team to ensure that our organization has access to reliable, accurate, and timely data to be us..."
26382,Electronic Arts,Senior Software Engineer/Architect – Data & AI,"Desired: HiveData MiningHadoopKafkaCI/CDC/C++SQLJavaSparkSoftware DevelopmentNoSQLShell ScriptingPythonRequisition Number:151529\nLocation:Austin\nDate Opened:2018-07-30\nElectronic Arts Inc. is a leading global interactive entertainment software company. EA delivers games, content and online services for Internet-connected consoles, personal computers, mobile phones and tablets.\nSenior Softw..."
26351,Electronic Arts,Senior Software Engineer/Architect – Data & AI,"Requisition Number:151529\nLocation:Austin\nDate Opened:2018-07-30\nElectronic Arts Inc. is a leading global interactive entertainment software company. EA delivers games, content and online services for Internet-connected consoles, personal computers, mobile phones and tablets.\nSenior Software Engineer/Architect - Data\n\nWe are EA\n\nAnd we make games – how cool is that? In fact, we enterta..."
7780,GoHealth,Data Engineer - Python,"Job Description\n\nGoHealth is looking for Data Engineers who will be responsible for the design, development, and delivery of data transformation tasks used in transforming data into a format that can be easily analyzed. We are seeking candidates who have experience in data analysis, collection, and optimization for the purpose of informing business decisions. The Data Engineer will work with..."


In [37]:
# LDA + Bigrams

print('LDA (Bigrams) topic distribution')
print(lda_model_b[bow_b])

query(lda_model_b, corpus_b, bow_b)

LDA (Bigrams) topic distribution
[(0, 0.34948924), (1, 0.042796515), (2, 0.49942118), (6, 0.10703978)]
Most similar documents
[(12816, 0.9987545), (21459, 0.99766433), (17423, 0.99750376), (9185, 0.9974438), (17675, 0.9972875), (17969, 0.9972794)]


Unnamed: 0,company,job_title,job_description
26211,Novus,Data Engineer,"Austin, TX\nNovus is an innovative company that is changing the way the world invests. We are a high growth, disruptive technology firm bringing big data analytics to the alternative investment industry. Our platform enables investors to consistently maximize their performance potential through discovery of true investment acumen and risk, proprietary industry insights and expertise, and effor..."
21238,FairWarning,Application Data Analyst,Desired: LinuxInformation SecurityMachine LearningMicrosoft OfficeSQLJSONWorkdayBig DataData WarehouseShell ScriptingGoogle SuitePythonYour Unlimited Career Growth Opportunity\n\nFairWarning is hiring an Application Data Analyst to contribute to the product platform by providing hands-on data analysis of different applications. FairWarning solutions protect organizations of all sizes against d...
11184,Harnham,Senior Data Scientist - Medical Informatics,"Senior Data Scientist (Medical Informatics) - HealthTech\nSan Francisco Bay area, CA\n$110,000-$150,000 (depending on experience)\nA series B HealthTech client of mine is applying big data-driven medical science to patient data in order to optimize every health care interaction and reduce patient risk, and they are continuing to grow their capacity in data science! If you are an experienced da..."
21528,Johns Hopkins Hospital/Johns Hopkins Health System,Data Engineer (Epic Data Warehouse),"Data Engineer (Epic Data Warehouse)\nRequisition #: 172888\nLocation: Johns Hopkins Hospital/Johns Hopkins Health System, Baltimore, MD\n\nCategory: Information Technology\nWork Shift: Day Shift\nWork Week: Full Time (40 hours)\nWeekend Work Required: Yes\nDate Posted: July 24, 2018\nJohns Hopkins Health System employs more than 20,000 people annually. Upon joining Johns Hopkins Health System,..."
21874,Johns Hopkins Hospital/Johns Hopkins Health System,Data Engineer (Epic Data Warehouse),"Desired: Microsoft SQL ServerEpicRC/C++Crystal ReportsSQLPHPIISSDLCSAPCI/CDTableauEpic CertificationJavaSharePointData WarehousePythonData Engineer (Epic Data Warehouse)\nRequisition #: 172888\nLocation: Johns Hopkins Hospital/Johns Hopkins Health System, Baltimore, MD\n\nCategory: Information Technology\nWork Shift: Day Shift\nWork Week: Full Time (40 hours)\nWeekend Work Required: Yes\nDate ..."


In [38]:
# HDP + Bigrams

print('HDP (Bigrams) topic distribution')
print(hdp_model_b[bow_b])

query(hdp_model_b, corpus_b, bow_b)

HDP (Bigrams) topic distribution
[(0, 0.9784367104541162), (1, 0.020976160703426058)]
Most similar documents
[(12077, 1.0), (5724, 0.9999997), (17566, 0.999999), (20918, 0.99999845), (5628, 0.9999981), (5802, 0.9999981)]


Unnamed: 0,company,job_title,job_description
7041,DCS Corp,Operations Research Analyst,"Infoscitex, a DCS company, is an employee owned organization with a reputation for agile and efficient development of technology solutions for U.S. Defense, Aerospace, Human Factors, and Security markets. We continue to provide innovative solutions with our multidisciplinary teams formed with exceptional employees. Infoscitex is seeking an Operations Research (OR) Analyst to support extensive ..."
21399,MaxisIT Inc,Statistician,"Job Description: This is a full-time opportunity with MaxisIT.Duties: Under the direction of the Manager of Biostatistics, the successful candidate will take the lead statistical role on several clinical trials by providing study design expertise via protocol development and review of case report forms, sample size calculations, randomization, and the development of statistical analysis plans ..."
25535,ETCS-Inc.com,Statistician,"Hello Folks,Looking for BiostatisticianJob Type: ContractExperience:SAS: 1 year (Preferred)"
6924,The Creative Group,Marketing Research Analyst,Desired: SPSSQualtricsTCG is working with a well-known company in St. Louis that is looking for a Market Research Analyst for a short-term contract assignment.\n\nThe Market Research Analyst is responsible for developing and executing strategic market and branch insight projects that generate actionable insights to help better service clients. Projects may vary from identifying insights for cl...
7129,A-Line Staffing,Senior Data Scientist,"A-Line Staffing is seeking a qualified candidate for the position of Senior Data Scientist located in St. Bernard, OH.This is a direct hire opportunity with relocation benefits available!Please review the desired qualifications listed below and apply or contact Greg Wagner with questions.DescriptionAs a Senior Data Scientist, you'll be using Big Data and advanced analytics to direct the engage..."


### Looking into query results

Besides jobs and tasks descriptions, our texts include some irrelevant information about companies, teams or benefits, as well as different kinds of disclaimers. Most of the models are able to catch this, grouping words like "gender", "religion", "origin" or "age" together.<br> 

Company information is usually the same piece of text for all jobs in this company. That is why having a smaller number of topics is more preferable in our case, since we want recomendations to be based on required qualifications, not company descriptions. And that is why HDP models with large number of topics are less accurate. <br>

For this same reason, TfIdf models' recomendations are more company-oriented, since tfidf transformation brings words from companies descriptions to the surface. If we check the lists of most important words, we see that there are a lot of company names there. The models group all qualification descriptions to one big topic, while the remaining topics are much smaller and very company-specific. <br>

For our specific test case, LSI model seems to have the best query results, showing Data Engineering jobs similar to the one from the query, from several different companies.
