In [1]:
import pandas as pd
import numpy as np
import nltk

In [13]:
data = pd.read_csv('data/job_posts.csv', usecols=['Title', 'Company', 
                                                  'JobDescription', 'JobRequirement', 'RequiredQual'])
data.head(5)

Unnamed: 0,Title,Company,JobDescription,JobRequirement,RequiredQual
0,Chief Financial Officer,AMERIA Investment Consulting Company,AMERIA Investment Consulting Company is seekin...,- Supervises financial management and administ...,"To perform this job successfully, an\r\nindivi..."
1,Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,- Bachelor's Degree; Master's is preferred;\r\...
2,Country Coordinator,Caucasus Environmental NGO Network (CENN),Public outreach and strengthening of a growing...,- Working with the Country Director to provide...,"- Degree in environmentally related field, or ..."
3,BCC Specialist,Manoff Group,The LEAD (Local Enhancement and Development fo...,- Identify gaps in knowledge and overseeing in...,"- Advanced degree in public health, social sci..."
4,Software Developer,Yerevan Brandy Company,,- Rendering technical assistance to Database M...,- University degree; economical background is ...


In [14]:
data.isnull().sum()

Title               28
Company              7
JobDescription    3892
JobRequirement    2522
RequiredQual       484
dtype: int64

In [15]:
data = data.dropna()
data['Title'].value_counts()[:10]

Accountant                                216
Medical Representative                    151
Chief Accountant                          151
Sales Manager                             126
Administrative Assistant                  124
Lawyer                                    115
Project Manager                            94
Software Developer                         79
Web Developer                              74
Receptionist/ Administrative Assistant     73
Name: Title, dtype: int64

In [16]:
data = data[data['Title'].str.match('Software Developer|Data Analyst|Software Engineer' + 
                                          '|Web Developer|Web Designer')]
data.reset_index(drop=True, inplace=True)
data.head(5)

Unnamed: 0,Title,Company,JobDescription,JobRequirement,RequiredQual
0,Software Developer,"Synergy International Systems, Inc./Armenia","Synergy International Systems, Inc./Armenia se...",Specific tasks and key responsibilities includ...,"- Degree in Computer Science, Information Tech..."
1,Web Designer,ACRA Credit Bureau,ACRA Credit Bureau seeks to fill the position ...,Translate into Armenian and Russian a web-site...,The successful candidate will demonstrate the\...
2,Web Developer/ Programmer,"""Click"" Web Design",The Web Developer/ Programmer will develop int...,The Web Developer/ Programmer will be responsi...,- At least 2 years experience as a Web Develop...
3,Web Designer,"""Click"" Web Design",The Web Designer will build flash based websites.,The Web Designer will be responsible for creat...,- At least 2 years experience as a web designe...
4,Software Developer,"Synergy International Systems, Inc. - Armenia",The responsibilities of this position are focu...,Specific tasks and key responsibilities includ...,"- Degree in Computer Science, Information Tech..."


In [20]:
data.Title.value_counts()[:5]

Software Developer    79
Web Developer         74
Software Engineer     42
Web Designer          34
Data Analyst           5
Name: Title, dtype: int64

In [32]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def remove_punc(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    lemmatizer.lemmatize(text)
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_doc = [w for w in word_tokens if not w in stop_words]
    return filtered_doc

In [34]:
test = data.assign(desc=pd.Series(data.apply(lambda x: x.JobDescription + x.JobRequirement + x.RequiredQual, axis=1)))

In [41]:
document = ""
for i in test.desc:
    document += i

In [46]:
from collections import Counter

document = remove_punc(document.lower())
keywordList = preprocess(document)
ctr = Counter(keywordList)
ctr.most_common(10)

[('knowledge', 880),
 ('experience', 808),
 ('software', 679),
 ('web', 608),
 ('work', 543),
 ('development', 539),
 ('design', 491),
 ('good', 379),
 ('ability', 371),
 ('skills', 339)]

In [48]:
!pip install -U gensim

Collecting gensim
  Downloading https://files.pythonhosted.org/packages/0b/66/04faeedb98bfa5f241d0399d0102456886179cabac0355475f23a2978847/gensim-3.8.3-cp37-cp37m-win_amd64.whl (24.2MB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/ea/54/01525817b6f31533d308968b814999f7e666b2234f39a55cbe5de7c1ff99/smart_open-4.1.2-py3-none-any.whl (111kB)
Collecting Cython==0.29.14 (from gensim)
  Downloading https://files.pythonhosted.org/packages/1f/be/b14be5c3ad1ff73096b518be1538282f053ec34faaca60a8753d975d7e93/Cython-0.29.14-cp37-cp37m-win_amd64.whl (1.7MB)
Installing collected packages: smart-open, Cython, gensim
  Found existing installation: Cython 0.28.5
    Uninstalling Cython-0.28.5:
      Successfully uninstalled Cython-0.28.5
Successfully installed Cython-0.29.14 gensim-3.8.3 smart-open-4.1.2


In [49]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *



In [50]:
def lemmatize_stemming(text):
    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [63]:
dictionary = gensim.corpora.Dictionary(test['desc'].map(preprocess))

In [64]:
i = 0
for k, v in dictionary.iteritems():
    if i < 10:
        print(k, v)
        i += 1
    else:
        break

0 abil
1 addit
2 analyt
3 armenia
4 aspect
5 assur
6 attent
7 bachelor
8 busi
9 candid


In [65]:
len(dictionary)

1619

In [66]:
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)

In [67]:
len(dictionary)

388

In [70]:
bow_corpus = [dictionary.doc2bow(doc) for doc in test['desc'].map(preprocess)]

In [73]:
from gensim import corpora, models
from pprint import pprint

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [74]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [75]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.019*"degre" + 0.018*"cod" + 0.018*"person" + 0.017*"plus" + 0.016*"motiv" + 0.015*"orient" + 0.015*"server" + 0.014*"technic" + 0.014*"high" + 0.014*"understand"
Topic: 1 
Words: 0.023*"technic" + 0.023*"server" + 0.019*"relat" + 0.019*"product" + 0.016*"project" + 0.014*"problem" + 0.013*"familiar" + 0.013*"manag" + 0.012*"look" + 0.012*"databas"
Topic: 2 
Words: 0.017*"provid" + 0.016*"system" + 0.015*"write" + 0.015*"technic" + 0.015*"relat" + 0.014*"technolog" + 0.013*"code" + 0.012*"engin" + 0.011*"manag" + 0.011*"javascript"
Topic: 3 
Words: 0.028*"plus" + 0.022*"project" + 0.021*"framework" + 0.019*"data" + 0.018*"creat" + 0.014*"databas" + 0.014*"network" + 0.014*"document" + 0.013*"unix" + 0.013*"respons"
Topic: 4 
Words: 0.024*"plus" + 0.019*"degre" + 0.019*"respons" + 0.016*"russian" + 0.016*"relat" + 0.015*"websit" + 0.014*"technic" + 0.012*"technolog" + 0.012*"armenian" + 0.012*"problem"
Topic: 5 
Words: 0.016*"javascript" + 0.016*"excel" + 0.014*"projec

In [76]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.014*"content" + 0.011*"data" + 0.011*"websit" + 0.011*"sit" + 0.008*"creat" + 0.008*"engin" + 0.008*"compani" + 0.008*"specif" + 0.008*"updat" + 0.007*"quick"
Topic: 1 Word: 0.019*"physic" + 0.013*"math" + 0.010*"algorithm" + 0.010*"layout" + 0.010*"prefer" + 0.009*"oblig" + 0.009*"militari" + 0.009*"previous" + 0.009*"implement" + 0.009*"plus"
Topic: 2 Word: 0.012*"thing" + 0.010*"give" + 0.010*"mobil" + 0.009*"practic" + 0.009*"jqueri" + 0.009*"implement" + 0.009*"bank" + 0.008*"task" + 0.008*"solut" + 0.008*"modifi"
Topic: 3 Word: 0.011*"cod" + 0.010*"familiar" + 0.009*"member" + 0.009*"test" + 0.008*"provid" + 0.008*"framework" + 0.008*"technic" + 0.008*"databas" + 0.008*"document" + 0.007*"object"
Topic: 4 Word: 0.010*"financi" + 0.009*"player" + 0.009*"layout" + 0.008*"accept" + 0.008*"person" + 0.008*"project" + 0.008*"system" + 0.008*"masteri" + 0.008*"sourc" + 0.008*"understand"
Topic: 5 Word: 0.022*"cycl" + 0.021*"librari" + 0.019*"algorithm" + 0.018*"data" +

In [77]:
for index, score in sorted(lda_model[bow_corpus[20]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9901075959205627	 
Topic: 0.020*"data" + 0.019*"relat" + 0.019*"product" + 0.018*"excel" + 0.016*"technic" + 0.015*"implement" + 0.015*"task" + 0.014*"algorithm" + 0.014*"strong" + 0.013*"write"


In [87]:
unseen_document = r'''
Under general supervision, formulates design\r\nstrategies, and participates in the
strategic planning of web site goals\r\nand objectives.- Participates in the overall
design structuring of the web sites;\r\norganizes and maintains the sites.
\r\n- Develops and implements plans to obtain and maintain a high level of\r\nfunctionality,
usability, and design structure for the web sites. \r\n- Assesses new standards, technologies 
and trends, and formulates\r\nstrategies and plans for future enhancement of web sites.
\r\n- Develops, and coordinates the creation of comprehensive graphic\r\nlayouts and elements
for new sections and/or features on the sites.- Strong proficiency with HTM/HTML, Dreamweaver,
Flash Technology,\r\nPhotoshop, Java-Script, CSS;\r\n- Familiarity with web templates;\r\n-
Advanced knowledge and understanding of web-based graphic design and\r\nlayout; \r\n-
Web planning and organizing skills;\r\n- Ability to evaluate new and evolving website
technologies; \r\n- Knowledge of a comprehensive range of web programming software and\r\nauthoring
languages; \r\n- Knowledge and understanding of internet operations and functionality,\r\nand of
a wide range of internet programming and design tools. \r\n- Web design experience and portfolio;
\r\n- Creation of work using your own innovations and by following the\r\nguidance of managers and
colleagues; \r\n- Self-organized and detailed oriented;\r\n- Strong inter-personal and communication
skills;\r\n- Efficient when under pressure;\r\n- Able to work independently;\r\n- Able to multi-task,
and adapt to flexible timelines.
'''
op = []
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    op.append("Score: {} Topic: {}".format(score, lda_model.print_topic(index, 5)))

In [88]:
op

['Score: 0.3178059756755829 Topic: 0.016*"javascript" + 0.016*"excel" + 0.014*"project" + 0.013*"respons" + 0.012*"high"',
 'Score: 0.27502545714378357 Topic: 0.018*"technolog" + 0.016*"manag" + 0.015*"engin" + 0.015*"websit" + 0.015*"technic"',
 'Score: 0.2566033601760864 Topic: 0.020*"plus" + 0.020*"write" + 0.019*"desir" + 0.019*"familiar" + 0.019*"technolog"',
 'Score: 0.1427709013223648 Topic: 0.019*"degre" + 0.018*"cod" + 0.018*"person" + 0.017*"plus" + 0.016*"motiv"']

In [89]:
lda_model.save('lda.model')