In [1]:
import pandas as pd
import numpy as np
# do the LDA
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
import pprint
import nltk
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
df_BA=pd.read_csv('BA')
df_DS=pd.read_csv('DS')
df_DA=pd.read_csv('DA')
df_DE=pd.read_csv('DE')

In [3]:
my_stop_words = STOPWORDS.union(set(['experience', 'work','requirements','ability','years','analyst',
                                     'required','including','best','new','applicants','jobs','candidate',
                                     'help','regard','qualified','employment','consideration','applications',
                                     'position','able','application','role'
                                    ]))

In [7]:
def word_freq_filter(df):
    l1=""
    for i in df_BA.index:
        sents=df_BA.loc[i]['JD']
        l1+=(sents.replace('\n',''))
    word_tokens = word_tokenize(l1)
    stop_words = set(stopwords.words('english')) 
    wordlist=[word for word in word_tokens if word.isalnum() and word not in stop_words]
    full_text=""
    for word in wordlist:
        full_text+=word+" "
    allWords = nltk.tokenize.word_tokenize(full_text)
    allWordDist = nltk.FreqDist(w.lower() for w in allWords)
    mostCommon= allWordDist.most_common(500)
    common_words = []
    for item in mostCommon:
        common_words.append(item[0])
    leastCommon= allWordDist.most_common()[:-100-1:-1]
    least_words = []
    for item in leastCommon:
        least_words.append(item[0])
    return common_words+least_words

# LDA Model

In [8]:
def get_df_name(df):
    name =[x for x in globals() if globals()[x] is df][0]
    return name

def lda(df,num_topics = 3,passes = 30,num_words=8):
    l1=[]
    for i in df.index:
        sents=df.loc[i]['JD']
        l1.append(sents.replace('\n',''))
    
    texts = [[word for word in story.lower().split()
            if word not in my_stop_words and word not in freq_words and word.isalnum()]
            for story in l1]
    dictionary = corpora.Dictionary(texts) #(word_id,word) pairs
    #dictionary.filter_extremes(no_below=20,no_above=0.2, keep_n= 100000)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    
    lda = LdaModel(corpus,
              id2word=dictionary,
              num_topics=num_topics,
              passes=passes)
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(lda.print_topics(num_words))
    
    return lda

In [13]:
freq_words=word_freq_filter(df_BA)
BA_lda=lda(df_BA).show_topic(0)
BA_lda

[   (   0,
        '0.002*"credit" + 0.002*"concise" + 0.002*"corporate" + '
        '0.002*"feedback" + 0.002*"challenges" + 0.002*"asset" + '
        '0.002*"physical" + 0.002*"mortgage" + 0.002*"party" + '
        '0.002*"integrate"'),
    (   1,
        '0.002*"â" + 0.002*"preparing" + 0.002*"custom" + 0.002*"junior" + '
        '0.002*"core" + 0.002*"maintains" + 0.002*"available" + '
        '0.002*"exposure" + 0.002*"sprint" + 0.002*"essential"'),
    (   2,
        '0.002*"human" + 0.002*"texas" + 0.002*"pricing" + 0.002*"medicaid" + '
        '0.002*"medical" + 0.002*"day" + 0.002*"assistance" + '
        '0.002*"influence" + 0.002*"navision" + 0.001*"vendor"')]


('credit', 0.002144061)

In [14]:
freq_words=word_freq_filter(df_DA)
DA_lda=lda(df_DA).show_topic(0)
DA_lda

[   (   0,
        '0.003*"government" + 0.003*"public" + 0.003*"medical" + '
        '0.002*"clearance" + 0.002*"assistance" + 0.002*"secret" + '
        '0.002*"clinical" + 0.002*"programming" + 0.002*"statistical" + '
        '0.002*"dod"'),
    (   1,
        '0.006*"visualization" + 0.005*"bi" + 0.005*"power" + '
        '0.004*"quantitative" + 0.004*"statistical" + 0.004*"actionable" + '
        '0.003*"python" + 0.003*"sets" + 0.003*"decision" + 0.003*"etl"'),
    (   2,
        '0.028*"statistical" + 0.011*"interpret" + 0.010*"packages" + '
        '0.009*"sources" + 0.008*"programming" + 0.008*"techniques" + '
        '0.007*"collection" + 0.006*"sas" + 0.006*"statistics" + '
        '0.006*"datasets"')]


[('government', 0.0034555565),
 ('public', 0.0031454507),
 ('medical', 0.003044649),
 ('clearance', 0.002495837),
 ('assistance', 0.0022639974),
 ('secret', 0.0021883913),
 ('clinical', 0.0020776147),
 ('programming', 0.002007039),
 ('statistical', 0.0019959344),
 ('dod', 0.0019720471)]

In [15]:
freq_words=word_freq_filter(df_DS)
DS_lda=lda(df_DS).show_topic(0)
DS_lda

[   (   0,
        '0.023*"machine" + 0.021*"learning" + 0.016*"statistical" + '
        '0.010*"quantitative" + 0.008*"predictive" + 0.007*"techniques" + '
        '0.006*"scientist" + 0.006*"deep" + 0.006*"algorithms" + 0.005*"ml"'),
    (   1,
        '0.014*"statistical" + 0.008*"analytic" + 0.008*"scientist" + '
        '0.005*"visualization" + 0.005*"machine" + 0.005*"clinical" + '
        '0.005*"programming" + 0.004*"courses" + 0.004*"social" + '
        '0.004*"predictive"'),
    (   2,
        '0.021*"quantum" + 0.009*"algorithms" + 0.006*"qiskit" + '
        '0.006*"computers" + 0.005*"career" + 0.005*"diverse" + 0.005*"linear" '
        '+ 0.005*"looking" + 0.005*"employee" + 0.005*"optimization"')]


[('machine', 0.023140399),
 ('learning', 0.02115203),
 ('statistical', 0.016178768),
 ('quantitative', 0.00981468),
 ('predictive', 0.0077376217),
 ('techniques', 0.007384834),
 ('scientist', 0.0061968807),
 ('deep', 0.0060998835),
 ('algorithms', 0.005533578),
 ('ml', 0.0054710247)]

In [16]:
freq_words=word_freq_filter(df_DE)
DE_lda=lda(df_DE).show_topic(0)
DE_lda

[   (   0,
        '0.007*"pipelines" + 0.007*"big" + 0.007*"etl" + 0.007*"python" + '
        '0.006*"machine" + 0.006*"learning" + 0.005*"engineer" + '
        '0.005*"warehousing" + 0.004*"programming" + 0.004*"distributed"'),
    (   1,
        '0.016*"big" + 0.011*"aws" + 0.009*"etl" + 0.009*"engineer" + '
        '0.006*"spark" + 0.006*"infrastructure" + 0.005*"pipeline" + '
        '0.005*"python" + 0.005*"relational" + 0.005*"programming"'),
    (   2,
        '0.010*"azure" + 0.009*"big" + 0.008*"pipelines" + 0.007*"aws" + '
        '0.006*"programming" + 0.006*"python" + 0.006*"machine" + '
        '0.005*"learning" + 0.005*"spark" + 0.005*"streaming"')]


[('pipelines', 0.0070289895),
 ('big', 0.006977768),
 ('etl', 0.0068747825),
 ('python', 0.0065189097),
 ('machine', 0.0056351507),
 ('learning', 0.005504947),
 ('engineer', 0.0050493325),
 ('warehousing', 0.004562349),
 ('programming', 0.0044694417),
 ('distributed', 0.0044409274)]

In [17]:
import csv

In [18]:
with open('LDA_List', 'w') as f:
      
    write = csv.writer(f)
      
    write.writerow(BA_lda)
    write.writerow(DA_lda)
    write.writerow(DS_lda)
    write.writerow(DE_lda)

# Match Topic to Document

In [None]:
from operator import itemgetter
lda.get_document_topics(corpus_new[0],minimum_probability=0.05,per_word_topics=False)
sorted(lda.get_document_topics(corpus_new[0],minimum_probability=0,per_word_topics=False),key=itemgetter(1),reverse=True)