In [43]:
import pandas as pd
df = pd.read_csv('outputs/data_science_jobs.csv')

In [44]:
df = df.drop(df.columns[0], axis = 1) 

In [45]:
#Extracting Job Descriptions
desc_df = df['Job Description'].to_frame()

In [46]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/olohireme/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/olohireme/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [47]:
import contractions

In [48]:
desc_df['Job Description'] = desc_df['Job Description'].apply(lambda x: [contractions.fix(word) for word in x.split()])


In [49]:
desc_df['Job Description'] = [' '.join(map(str, l)) for l in desc_df['Job Description']]
desc_df['Job Description'] = desc_df['Job Description'].str.lower()



In [50]:
desc_df['tokenized_desc'] = desc_df['Job Description'].apply(word_tokenize)


In [51]:
# initializing Stop words libraries
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/olohireme/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
#remove stopwords 
desc_df['tokenized_desc'] = desc_df['tokenized_desc'].apply(lambda x: [word for word in x if word not in stop_words])

In [53]:
desc_df.sample(10, random_state = 5)

Unnamed: 0,Job Description,tokenized_desc
852,for over 25 years we have been helping our com...,"[25, years, helping, community, essential, oil..."
207,"about the job individually we are people, but ...","[job, individually, people, ,, together, aviva..."
627,job type: permanent primary location: vancouve...,"[job, type, :, permanent, primary, location, :..."
671,our subsidiary is in proptech and real estate ...,"[subsidiary, proptech, real, estate, analytics..."
661,who needs insurance? everybody. that keeps us ...,"[needs, insurance, ?, everybody, ., keeps, us,..."
489,"avant le sras-cov-2, un vaccin qui était dével...","[avant, le, sras-cov-2, ,, un, vaccin, qui, ét..."
434,responsibilities: develop state-of-the-art com...,"[responsibilities, :, develop, state-of-the-ar..."
448,qui sommes-nous : buspatrouille est une entrep...,"[qui, sommes-nous, :, buspatrouille, est, une,..."
634,"for 70 years, charles river employees have wor...","[70, years, ,, charles, river, employees, work..."
748,we are transforming the cyber security industr...,"[transforming, cyber, security, industry, us, ..."


In [54]:
def pos_series(keyword):
    '''categorizes after tokenizing words with POS tags'''
    #tokens = nltk.word_tokenize(keyword)
    tagged = nltk.pos_tag(keyword)
    return tagged

In [55]:
%%time
pos_tagged_arrs = desc_df['tokenized_desc'].apply(pos_series)


CPU times: user 11.1 s, sys: 82.1 ms, total: 11.2 s
Wall time: 11.2 s


In [56]:
# unloads the tuples from the tree object for easier manipulation
pos_tagged = []
for row in pos_tagged_arrs.values:
    for element in row:
        pos_tagged.append(element)

In [57]:
# dataframe contains all of the words with their corresponding pos tag;
pos_df = pd.DataFrame(pos_tagged, columns = ('word','POS'))
# special chars were removed due to irrelevance as a tag but will be included in regex
char_removal = [',', '.', ':', '#', '$', '\'\'', '``', '(', ')']
drop_indices = (pos_df.loc[pos_df.POS.isin(char_removal)].index)
pos_df.drop(drop_indices, inplace = True)

In [58]:
grammar1 = ('''Noun Phrases: {<DT>?<JJ>*<NN|NNS|NNP>+}''')
chunkParser = nltk.RegexpParser(grammar1)
tree1 = chunkParser.parse(pos_tagged)

In [59]:
# typical noun phrase pattern to be pickled for later analyses
g1_chunks = []
for subtree in tree1.subtrees(filter=lambda t: t.label() == 'Noun Phrases'):
    # print(subtree)
    g1_chunks.append(subtree)

In [60]:
g1_chunks[:20]

[Tree('Noun Phrases', [('metricsflow', 'NN')]),
 Tree('Noun Phrases', [('work', 'NN'), ('shape', 'NN')]),
 Tree('Noun Phrases', [('future', 'JJ'), ('data', 'NNS'), ('attribution', 'NN')]),
 Tree('Noun Phrases', [('new', 'JJ'), ('kind', 'NN'), ('relationship', 'NN'), ('customers', 'NNS'), ('companies', 'NNS')]),
 Tree('Noun Phrases', [('privacy-first', 'JJ'), ('practices', 'NNS'), ('smarter', 'NN')]),
 Tree('Noun Phrases', [('non-invasive', 'JJ'), ('customer', 'NN'), ('learning', 'NN')]),
 Tree('Noun Phrases', [('innovates', 'NNS')]),
 Tree('Noun Phrases', [('machine', 'NN')]),
 Tree('Noun Phrases', [('ai', 'JJ'), ('technology', 'NN'), ('equip', 'NN'), ('businesses', 'NNS')]),
 Tree('Noun Phrases', [('right', 'JJ'), ('tools', 'NNS')]),
 Tree('Noun Phrases', [('passionate', 'JJ'), ('internet', 'NN'), ('data', 'NNS')]),
 Tree('Noun Phrases', [('right', 'JJ'), ('place', 'NN')]),
 Tree('Noun Phrases', [('exceptional', 'JJ'), ('individual', 'JJ'), ('serve', 'NN'), ('data', 'NNS'), ('scientis

In [61]:
#saving the scaler to apply it on the test dataset
import pickle
with open('pickles/chunks_1.pickle', 'wb') as fp1:
    pickle.dump(g1_chunks, fp1)

In [62]:
#Noun phrase variation 
# preposition maybe, any number of adjective or nouns, any plural nouns or singular nouns
grammar2 = ('''NP2: {<IN>?<JJ|NN>*<NNS|NN>} ''')
chunkParser = nltk.RegexpParser(grammar2)
tree2 = chunkParser.parse(pos_tagged)

In [63]:
# variation of a noun phrase pattern to be pickled for later analyses
g2_chunks = []
for subtree in tree2.subtrees(filter=lambda t: t.label() == 'NP2'):
    # print(subtree)
    g2_chunks.append(subtree)

In [64]:
g2_chunks[:20]

[Tree('NP2', [('metricsflow', 'NN')]),
 Tree('NP2', [('work', 'NN'), ('shape', 'NN'), ('future', 'JJ'), ('data', 'NNS')]),
 Tree('NP2', [('attribution', 'NN'), ('new', 'JJ'), ('kind', 'NN'), ('relationship', 'NN'), ('customers', 'NNS')]),
 Tree('NP2', [('companies', 'NNS')]),
 Tree('NP2', [('privacy-first', 'JJ'), ('practices', 'NNS')]),
 Tree('NP2', [('smarter', 'NN')]),
 Tree('NP2', [('non-invasive', 'JJ'), ('customer', 'NN'), ('learning', 'NN')]),
 Tree('NP2', [('innovates', 'NNS')]),
 Tree('NP2', [('machine', 'NN')]),
 Tree('NP2', [('ai', 'JJ'), ('technology', 'NN'), ('equip', 'NN'), ('businesses', 'NNS')]),
 Tree('NP2', [('right', 'JJ'), ('tools', 'NNS')]),
 Tree('NP2', [('passionate', 'JJ'), ('internet', 'NN'), ('data', 'NNS')]),
 Tree('NP2', [('right', 'JJ'), ('place', 'NN')]),
 Tree('NP2', [('exceptional', 'JJ'), ('individual', 'JJ'), ('serve', 'NN'), ('data', 'NNS')]),
 Tree('NP2', [('scientist', 'NN')]),
 Tree('NP2', [('successful', 'JJ'), ('candidate', 'NN'), ('work', 'NN')]

In [65]:
with open('pickles/chunks_2.pickle', 'wb') as fp2:
    pickle.dump(g2_chunks , fp2)

In [66]:
# any sort of verb followed by any number of nouns
grammar3 = ('''
    VS: {<VBG|VBZ|VBP|VBD|VB|VBN><NNS|NN>*}
    ''')
chunkParser = nltk.RegexpParser(grammar3)
tree3 = chunkParser.parse(pos_tagged)

In [67]:
# verb-noun pattern to be pickled for later analyses
g3_chunks = []
for subtree in tree3.subtrees(filter=lambda t: t.label() == 'VS'):
    # print(subtree)
    g3_chunks.append(subtree)

In [68]:
g3_chunks[:20]

[Tree('VS', [('advocating', 'VBG')]),
 Tree('VS', [('metricsflow', 'VB'), ('innovates', 'NNS')]),
 Tree('VS', [('using', 'VBG'), ('machine', 'NN')]),
 Tree('VS', [('learning', 'VBG')]),
 Tree('VS', [('treated', 'VBD')]),
 Tree('VS', [('found', 'VBD')]),
 Tree('VS', [('metricsflow', 'VB')]),
 Tree('VS', [('seeking', 'VBG')]),
 Tree('VS', [('ensure', 'VB')]),
 Tree('VS', [('structured', 'VBD')]),
 Tree('VS', [('develop', 'VBP')]),
 Tree('VS', [('uncover', 'VBP')]),
 Tree('VS', [('drive', 'VBP'), ('projects', 'NNS')]),
 Tree('VS', [('end', 'VBP'), ('end', 'NN')]),
 Tree('VS', [('including', 'VBG')]),
 Tree('VS', [('building', 'VBG'), ('data', 'NNS'), ('pipelines', 'NNS')]),
 Tree('VS', [('integrating', 'VBG'), ('engineering', 'NN'), ('systems', 'NNS'), ('research', 'NN'), ('industry', 'NN')]),
 Tree('VS', [('implemented', 'VBD')]),
 Tree('VS', [('focus', 'VB'), ('data', 'NNS'), ('science', 'NN'), ('features', 'NNS')]),
 Tree('VS', [('including', 'VBG'), ('data', 'NNS'), ('preparation', 'N

In [69]:
with open('pickles/chunks_3.pickle', 'wb') as fp3:
    pickle.dump(g3_chunks, fp3)

In [70]:
# any number of a singular or plural noun followed by a comma followed by the same noun, noun, noun pattern
grammar4 = ('''
    Commas: {<NN|NNS>*<,><NN|NNS>*<,><NN|NNS>*} 
    ''')
chunkParser = nltk.RegexpParser(grammar4)
tree4 = chunkParser.parse(pos_tagged)

In [71]:
# common pattern of listing skills to be pickled for later analyses
g4_chunks = []
for subtree in tree4.subtrees(filter=lambda t: t.label() == 'Commas'):
    # print(subtree)
    g4_chunks.append(subtree)

In [72]:
g4_chunks[:20]

[Tree('Commas', [('models', 'NNS'), (',', ','), ('metrics', 'NNS'), (',', ','), ('reports', 'NNS')]),
 Tree('Commas', [('data', 'NNS'), ('preparation', 'NN'), (',', ','), ('machine', 'NN'), ('learning', 'NN'), (',', ',')]),
 Tree('Commas', [('networks', 'NNS'), (',', ','), ('time-series', 'NNS'), ('analysis', 'NN'), (',', ',')]),
 Tree('Commas', [('quality', 'NN'), (',', ','), ('safety', 'NN'), (',', ','), ('reliability', 'NN')]),
 Tree('Commas', [(',', ','), ('deploy', 'NN'), (',', ','), ('maintain', 'NN')]),
 Tree('Commas', [('production', 'NN'), (',', ','), ('maintenance', 'NN'), (',', ',')]),
 Tree('Commas', [('years', 'NNS'), ('software', 'NN'), ('development', 'NN'), ('experience', 'NN'), ('experience', 'NN'), ('keras', 'NNS'), (',', ','), ('tensorflow', 'NN'), (',', ','), ('sklearn', 'NN')]),
 Tree('Commas', [('priorities', 'NNS'), (',', ','), ('requirements', 'NNS'), (',', ','), ('schedules', 'NNS')]),
 Tree('Commas', [('data', 'NNS'), ('operations', 'NNS'), ('team', 'NN'), (',

In [73]:
with open('pickles/chunks_4.pickle', 'wb') as fp4:
    pickle.dump(g4_chunks, fp4)

In [74]:
# loaded at the bottom of this page to inspect as needed
chunks1 = pickle.load( open('pickles/chunks_1.pickle', "rb" ) )
chunks2 = pickle.load( open('pickles/chunks_2.pickle', "rb" ) )
chunks3 = pickle.load( open('pickles/chunks_3.pickle', "rb" ) )
chunks4 = pickle.load( open('pickles/chunks_4.pickle', "rb" ) )

In [75]:
print('Length:', len(chunks1), 'Sample Size:', len(chunks1) * .10)
print('Length:', len(chunks2), 'Sample Size:', len(chunks2) * .10) 
print('Length:', len(chunks3), 'Sample Size:', len(chunks3) * .10)
print('Length:', len(chunks4), 'Sample Size:', len(chunks4) * .10)

Length: 125438 Sample Size: 12543.800000000001
Length: 136372 Sample Size: 13637.2
Length: 59342 Sample Size: 5934.200000000001
Length: 8099 Sample Size: 809.9000000000001
