In [1]:
%pylab inline 
import nltk
import ujson
import re
import time
import progressbar

import pandas as pd
from __future__ import print_function
from six.moves import zip, range 

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, roc_auc_score, auc
from sklearn import preprocessing
from collections import Counter, OrderedDict
from nltk.corpus import stopwords
from nltk import PorterStemmer

nltk.download('stopwords') #download the latest stopwords

Populating the interactive namespace from numpy and matplotlib
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df = pd.read_csv("jobs_subset.csv")

In [3]:
df.head()

Unnamed: 0,normalizedTitle_onetName,normalizedTitle_onetCode,jobDescription,title
0,Lawyers,23-1011.00,This position is in support of the United Stat...,"Paralegal (Buffalo, NY)"
1,Software Quality Assurance Engineers and Testers,15-1199.01,The successful candidate will maintain and dev...,System Integration / Test Engineer
2,Software Quality Assurance Engineers and Testers,15-1199.01,This position is a Senior System Integration &...,System Integration / Test Engineer Staff
3,Software Quality Assurance Engineers and Testers,15-1199.01,"In this position, candidates may participate i...",Mult Func Fin Analyst
4,Software Quality Assurance Engineers and Testers,15-1199.01,This is a business operations analyst position...,Mult Func Fin Analyst


In [13]:
df['normalizedTitle_onetName'].value_counts()

array(['Lawyers', 'Software Quality Assurance Engineers and Testers',
       'Graduate Teaching Assistants', 'Financial Examiners',
       'Credit Analysts'], dtype=object)

In [16]:
df['title'].unique().shape

(2496,)

In [42]:
def create_bag_of_words(corpus, NGRAM_RANGE = (0,1), stop_words = None, stem= False, MIN_DF = 0.05, MAX_DF = 0.95, USE_IDF = False):
    
    ANALYZER = 'word'
    STRIP_ACCENTS = 'unicode'
    
    if stem:
        tokenize = lambda x: [stemmer.stem(i) for i in x.split()]
    else:
        tokenize = None
    vectorizer = CountVectorizer(analyzer = ANALYZER, tokenizer = tokenize, ngram_range=NGRAM_RANGE, stop_words=stop_words, max_df = MAX_DF, min_df = MIN_DF, strip_accents=STRIP_ACCENTS)
    bag_of_words = vectorizer.fit_transform(corpus)
    features = vectorizer.get_feature_names()
    
    if USE_IDF:
        NORM = None
        SMOOTH_IDF = True
        SUBLINEAR_IDF = True
        transformer = TfidfTransformer(norm = NORM, smooth_idf = SMOOTH_IDF, sublinear_tf=True)
        tfidf = transformer.fit_transform(bag_of_words)
        
        return tfidf, features
    else:
        return bag_of_words, features
        

In [5]:
test_words = ['this is the shit', 'I am tryna put you in a worst mood', 'P-1 cleaner than your church shooes', "I am a starboy"]

In [6]:
test_bag_of_words, test_features = create_bag_of_words(test_words)

In [7]:
np_test_bag_of_words = test_bag_of_words.toarray()
np_test_bag_of_words

array([[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [8]:
test_features

['am',
 'church',
 'cleaner',
 'in',
 'is',
 'mood',
 'put',
 'shit',
 'shooes',
 'starboy',
 'than',
 'the',
 'this',
 'tryna',
 'worst',
 'you',
 'your']

In [12]:
toy_corpus = ['this is document one', 'this is document two', 'text analysis on documents is fun']

In [13]:
toy_bag_of_words, toy_features = create_bag_of_words(toy_corpus)

In [14]:
np_bag_of_words = toy_bag_of_words.toarray()
np_bag_of_words

array([[0, 1, 0, 0, 0, 1, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 1],
       [1, 0, 1, 1, 1, 0, 1, 0, 0]], dtype=int64)

In [15]:
def get_word_count(bag_of_words, feature_names):
    
    np_bag_of_words = bag_of_words.toarray()
    word_count = np_bag_of_words.sum(axis = 0)
    np_word_count = np.asarray(word_count).ravel()
    dict_word_counts = dict(zip(feature_names,np_word_count))
    
    ordered_word_count = OrderedDict(sorted(dict_word_counts.items(), key = lambda x:x[1], reverse = True),)
    
    return ordered_word_count

In [16]:
get_word_count(toy_bag_of_words, toy_features)

OrderedDict([('document', 2),
             ('this', 2),
             ('analysis', 1),
             ('documents', 1),
             ('fun', 1),
             ('on', 1),
             ('one', 1),
             ('text', 1),
             ('two', 1)])

In [17]:
corpus = df['jobDescription'].values

In [18]:
test_words, test_features = create_bag_of_words(corpus)

In [19]:
def create_topics(tfidf, features, N_TOPICS = 3, N_TOP_WORDS = 5,):
    
    #with progressbar.ProgressBar(max_value=progressbar.UnknownLength) as bar:
    #i = 0
    lda = LatentDirichletAllocation(n_topics=N_TOPICS, learning_method='online')

    #bar.update(i)
    #i += 1
    doctopic = lda.fit_transform(tfidf)
    #bar.update(i)
    #i+=1

    ls_keywords = []

    for i, topic in enumerate(lda.components_):
        word_idx = np.argsort(topic)[::-1][:N_TOP_WORDS]
        keywords = ", ".join(features[i] for i in word_idx)
        ls_keywords.append(keywords)
        print(i,keywords)
        #bar.update(i)
        #i+=1
    return ls_keywords, doctopic

In [20]:
corpus_bag_of_words, corpus_features = create_bag_of_words(corpus)

In [21]:
corpus_features

['00',
 '000',
 '10',
 '100',
 '11',
 '12',
 '15',
 '20',
 '2015',
 '25',
 '30',
 '40',
 '50',
 '500',
 '80',
 '800',
 'aa',
 'abilities',
 'ability',
 'able',
 'about',
 'above',
 'academic',
 'access',
 'accordance',
 'according',
 'accounting',
 'accounts',
 'accredited',
 'accuracy',
 'accurate',
 'achieve',
 'across',
 'act',
 'action',
 'active',
 'activities',
 'addition',
 'additional',
 'administration',
 'administrative',
 'admired',
 'advance',
 'advanced',
 'affirmative',
 'again',
 'against',
 'age',
 'agencies',
 'agency',
 'agile',
 'agreeing',
 'al',
 'all',
 'along',
 'also',
 'an',
 'analysis',
 'analyst',
 'analytical',
 'analyze',
 'and',
 'anos',
 'any',
 'applicable',
 'applicants',
 'application',
 'applications',
 'apply',
 'applying',
 'appropriate',
 'architecture',
 'are',
 'area',
 'areas',
 'around',
 'as',
 'aspects',
 'assessment',
 'assigned',
 'assist',
 'assistance',
 'assistant',
 'assisting',
 'assists',
 'associate',
 'associated',
 'assurance',
 'a

In [22]:
get_word_count(corpus_bag_of_words, corpus_features)

OrderedDict([('and', 72036),
             ('to', 36758),
             ('the', 34940),
             ('of', 31793),
             ('in', 23879),
             ('with', 16877),
             ('for', 15605),
             ('or', 13556),
             ('experience', 11108),
             ('is', 10153),
             ('de', 8563),
             ('as', 8252),
             ('be', 7101),
             ('on', 7003),
             ('work', 6583),
             ('our', 6556),
             ('will', 5949),
             ('are', 5769),
             ('an', 5686),
             ('job', 5631),
             ('software', 5360),
             ('skills', 5337),
             ('we', 5198),
             ('development', 5079),
             ('you', 4884),
             ('test', 4744),
             ('this', 4584),
             ('team', 4450),
             ('all', 4442),
             ('that', 4267),
             ('ability', 4248),
             ('other', 4207),
             ('required', 4147),
             ('business', 4010),
   

In [23]:
ls_keywords, corpus_doctopic = create_topics(corpus_bag_of_words, corpus_features)



0 and, to, the, of, in
1 de, en, la, con, el
2 and, to, the, of, in


In [24]:
REGEX_THAT = r'\W+|\d+'

processed_corpus = np.array([re.sub(REGEX_THAT, ' ', comment).lower() for comment in corpus])

In [25]:
corpus[1]

"The successful candidate will maintain and develop electronic production and development test solutions. Provide hands on hardware/software troubleshooting and upgrade support for existing production test stations in a real time environment. Participate in the design, build, integration and validation of automated and manual test equipment. Interpret and understand test requirements, develop/modify test procedures, test reports and other test related documentation. Must be able to interface with the customer during planned meetings and support required presentations. The successful candidate will also have the ability and willingness to on occasion support our Product Support group in maintaining, troubleshooting and servicing electronic fielded equipment. Participate in the integration, installation and validation of fielded equipment. Interpret and understand repair procedures and other repair related documentation. Must be able to interface with the customer during field service tr

In [26]:
processed_corpus[1]

'the successful candidate will maintain and develop electronic production and development test solutions provide hands on hardware software troubleshooting and upgrade support for existing production test stations in a real time environment participate in the design build integration and validation of automated and manual test equipment interpret and understand test requirements develop modify test procedures test reports and other test related documentation must be able to interface with the customer during planned meetings and support required presentations the successful candidate will also have the ability and willingness to on occasion support our product support group in maintaining troubleshooting and servicing electronic fielded equipment participate in the integration installation and validation of fielded equipment interpret and understand repair procedures and other repair related documentation must be able to interface with the customer during field service trips and intern

In [27]:
tokens = processed_corpus[1].split()


In [28]:
eng_stopwords = stopwords.words('english')

In [29]:
processed_bag_of_words, processed_features = create_bag_of_words(processed_corpus, stop_words=eng_stopwords)

In [30]:
processed_word_counts = get_word_count(processed_bag_of_words,processed_features)

In [31]:
processed_word_counts

OrderedDict([('experience', 11108),
             ('de', 8575),
             ('work', 6583),
             ('job', 5631),
             ('software', 5360),
             ('skills', 5337),
             ('development', 5079),
             ('test', 4744),
             ('team', 4450),
             ('ability', 4248),
             ('required', 4147),
             ('business', 4010),
             ('quality', 3664),
             ('requirements', 3653),
             ('position', 3494),
             ('testing', 3413),
             ('knowledge', 3366),
             ('years', 3336),
             ('management', 3320),
             ('must', 3261),
             ('systems', 3103),
             ('engineering', 3047),
             ('services', 3043),
             ('information', 2989),
             ('support', 2955),
             ('legal', 2816),
             ('en', 2791),
             ('status', 2757),
             ('opportunity', 2734),
             ('including', 2627),
             ('working', 2559),
   

In [32]:
TOP_20 = list(processed_word_counts.keys())[:20]
biskut_stopwords = eng_stopwords + TOP_20 

processed_bag_of_words, processed_features = create_bag_of_words(processed_corpus, stop_words=biskut_stopwords)

processed_word_counts = get_word_count(processed_bag_of_words, processed_features)

In [33]:
processed_word_counts

OrderedDict([('systems', 3103),
             ('engineering', 3047),
             ('services', 3043),
             ('information', 2989),
             ('support', 2955),
             ('legal', 2816),
             ('en', 2791),
             ('status', 2757),
             ('opportunity', 2734),
             ('including', 2627),
             ('working', 2559),
             ('design', 2531),
             ('company', 2528),
             ('technical', 2487),
             ('data', 2359),
             ('provide', 2307),
             ('time', 2297),
             ('degree', 2276),
             ('related', 2264),
             ('new', 2242),
             ('strong', 2206),
             ('environment', 2105),
             ('qualifications', 2103),
             ('system', 2072),
             ('compliance', 2019),
             ('equal', 1993),
             ('preferred', 1925),
             ('location', 1899),
             ('training', 1839),
             ('employer', 1799),
             ('product', 179

In [34]:
processed_keywords, processed_doctopics = create_topics(processed_bag_of_words, processed_features)



0 engineering, systems, design, status, technical
1 en, la, con, el, experiencia
2 legal, compliance, company, provide, time


In [36]:
processed_keywords, processed_doctopics = create_topics(processed_bag_of_words, processed_features, N_TOPICS = 10, N_TOP_WORDS=15)



0 medical, laboratory, health, care, hours, services, clinical, patient, lab, assistant, training, duties, healthcare, patients, procedures
1 legal, law, counsel, matters, corporate, clients, attorney, including, company, regulatory, litigation, issues, firm, state, provide
2 financial, compliance, credit, finance, accounting, risk, analyst, analysis, strong, audit, information, internal, degree, commercial, services
3 en, la, con, el, experiencia, pruebas, para, trabajo, que, las, por, anos, los, al, del
4 systems, northrop, grumman, eeo, qualifications, engineering, title, diverse, positions, system, complete, requisition, hiring, id, workforce
5 robert, half, apply, applicants, please, applying, openings, authorized, compensation, candidates, time, office, opportunity, companies, contact
6 process, related, ensure, procedures, project, support, including, activities, technical, provide, processes, documentation, perform, reports, assurance
7 status, engineering, gender, protected, n

In [38]:
stemmer = PorterStemmer()

print(stemmer.stem('lies'))
print(stemmer.stem('lying'))
print(stemmer.stem('systematic'))
print(stemmer.stem('running'))

lie
lie
systemat
run


In [39]:
processed_bag_of_words, processed_features = create_bag_of_words(processed_corpus, stem = True, stop_words=biskut_stopwords)

processed_keywords, processed_doctopics = create_topics(processed_bag_of_words, processed_features, N_TOPICS=10, N_TOP_WORDS=15)



0 manag, requir, qualiti, complianc, experi, process, abil, review, ensur, product, skill, document, report, plan, respons
1 sale, busi, market, partner, custom, microsoft, manag, product, skill, develop, career, opportun, provid, commun, respons
2 en, la, con, el, experiencia, prueba, para, ano, lo, trabajo, que, al, conocimiento, por, empresa
3 statu, experi, employ, system, engin, protect, gender, nation, race, religion, disabl, origin, develop, requir, color
4 legal, appli, compani, thi, half, robert, state, applic, experi, provid, opportun, law, author, open, offic
5 develop, softwar, experi, engin, design, technolog, product, autom, skill, system, c, comput, tool, use, data
6 posit, requir, perform, assist, duti, time, thi, abil, applic, may, depart, skill, student, research, includ
7 busi, manag, experi, legal, credit, client, servic, compani, abil, financi, profession, includ, us, account, law
8 medic, patient, provid, servic, care, health, clinic, requir, laboratori, healthcar

In [43]:
processed_bag_of_words, processed_features = create_bag_of_words(processed_corpus, NGRAM_RANGE=(1,2), stop_words=biskut_stopwords, stem = True, USE_IDF=True)

In [44]:
processed_word_counts = get_word_count(processed_bag_of_words, processed_features)
processed_word_counts

OrderedDict([('experi', 7311.341695219634),
             ('requir', 6500.896622374151),
             ('develop', 6488.091439132843),
             ('skill', 5593.645320231913),
             ('engin', 5455.956868258732),
             ('manag', 5401.733751635277),
             ('softwar', 5263.674920537076),
             ('system', 5223.212599421202),
             ('provid', 5179.3108430063685),
             ('thi', 5025.063768177935),
             ('servic', 4995.365763968441),
             ('abil', 4949.301292103332),
             ('includ', 4947.70827018894),
             ('applic', 4895.582017666042),
             ('posit', 4797.897371564051),
             ('product', 4666.3367421137855),
             ('busi', 4664.872244113505),
             ('perform', 4626.062694094865),
             ('employ', 4591.426951549739),
             ('support', 4565.668388020691),
             ('year', 4520.517145934068),
             ('opportun', 4509.715783060004),
             ('respons', 4509.5246722

In [46]:
processed_keywords, processed_doctopics = create_topics(processed_bag_of_words, processed_features, N_TOPICS=10, N_TOP_WORDS=15)



0 legal, attorney, law, counsel, litig, law firm, matter, draft, firm, legal depart, corpor, negoti, licens, agreement, bar
1 softwar, engin, develop, design, autom, technolog, experi, tool, softwar engin, softwar develop, c, product, system, comput, qualiti
2 en, la, con, experiencia, el, ano, prueba, trabajo, para, al, por, conocimiento, que, empresa, lo
3 grumman, northrop, northrop grumman, eeo, system, engin, u citizenship, clearanc, aa, softwar, divers workforc, citizenship requir, eeo aa, commit hire, protect class
4 offer, career, benefit, health, servic, www, healthcar, reward, opportun, medic, provid, competit, employe, life, profession
5 manag, busi, complianc, abil, risk, project, intern, knowledg, ensur, respons, skill, develop, product, experi, review
6 robert, robert half, half, open author, applic appli, appli, author, compens, open, f disabl, admir, admir compani, magazin, world admir, staf
7 laboratori, patient, duti, clinic, assist, medic, hour, lab, requir, posit, h

In [66]:
ls_topic_id = [np.argsort(processed_doctopics[comment_id])[::-1][0] for comment_id in range(len(corpus))]
df['topic_id'] = ls_topic_id

In [68]:
topic_num = 0
print(processed_keywords[topic_num])
df[df.topic_id == topic_num].head(100)

legal, attorney, law, counsel, litig, law firm, matter, draft, firm, legal depart, corpor, negoti, licens, agreement, bar


Unnamed: 0,normalizedTitle_onetName,normalizedTitle_onetCode,jobDescription,title,topic_id
89,Lawyers,23-1011.00,* Advance your Legal career at Liberty Mutual ...,Field Counsel,0
90,Lawyers,23-1011.00,* Advance your Legal career at Liberty Mutual ...,Field Counsel,0
91,Lawyers,23-1011.00,* Advance your Legal career at Liberty Mutual ...,Field Counsel,0
92,Lawyers,23-1011.00,* Advance your Legal career at Liberty Mutual ...,Field Counsel,0
100,Lawyers,23-1011.00,* Advance your Legal career at Liberty Mutual ...,Field Counsel,0
105,Lawyers,23-1011.00,* Advance your Legal career at Liberty Mutual ...,Field Counsel,0
106,Lawyers,23-1011.00,* Advance your Legal career at Liberty Mutual ...,Field Counsel,0
107,Lawyers,23-1011.00,* Advance your Legal career at Liberty Mutual ...,Field Counsel,0
111,Lawyers,23-1011.00,* Advance your Legal career at Liberty Mutual ...,"Civil Litigation Attorney, Field Counsel",0
112,Lawyers,23-1011.00,* Advance your Legal career at Liberty Mutual ...,Field Counsel,0


In [71]:
df_train = pd.read_csv("train_corpus_document_tagging.csv")
df_test = pd.read_csv("test_corpus_document_tagging.csv")

In [72]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,normalizedTitle_onetName,jobDescription
0,96011,Credit Analysts,/Join a financial group thats as committed to ...
1,355594,Credit Analysts,"for fastest consideration on this position, pl..."
2,118598,Credit Analysts,As a Credit Analyst youll be responsible for c...
3,107794,Credit Analysts,We are a specialty retailer offering the very ...
4,40010,Credit Analysts,Title: KYC Remediation Team Analyst II Locatio...


In [74]:
df_train['normalizedTitle_onetName'].unique()

array(['Credit Analysts', 'Financial Examiners'], dtype=object)

In [75]:
df_test['normalizedTitle_onetName'].unique()

array(['Credit Analysts', 'Financial Examiners'], dtype=object)

In [78]:
train_labels = df_train.normalizedTitle_onetName.values
train_corpus = np.array([re.sub(REGEX_THAT, ' ', comment).lower() for comment in df_train.jobDescription.values])
test_labels = df_test.normalizedTitle_onetName.values
test_corpus = np.array([re.sub(REGEX_THAT, ' ', comment).lower() for comment in df_test.jobDescription.values])

array(['Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analysts', 'Credit Analysts',
       'Credit Analysts', 'Credit Analys