In [1]:
import string
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
"""
We use an n-gram vectorizer to collect significant words and phrases
from the job responsibilities section.

See scikit-learn.org/stable/modules/feature_extraction.html and 
scikit-learn.org/stable/modules/classes.html#text-feature-extraction-ref 
for TfidfVectorizer parameters"""

data = pd.read_csv('job_skill_short.csv', dtype = str)
# clean 15 empty lines, replace null entries with space
corpus = data['Responsibilities'].fillna(" ")

#replacing punctuation
corpus = corpus.str.replace('\'',' ')
corpus = corpus.str.replace("-",' ')
corpus = corpus.str.replace(",",'')
corpus = corpus.str.replace("\\n",'. ')
corpus = corpus.str.replace('[^\w\s]','.')
corpus = corpus.str.replace("\..",'.')

st = PorterStemmer()
corpus = corpus.apply(lambda x: " ".join([st.stem(word) for word in x.split()]))#stem words

""" tokenize all words 3 letters or longer.  
    Ignore 3-grams that occur in more than 60% 
    or in less than 1% of descriptions
"""
vectorizer = CountVectorizer(stop_words = 'english', strip_accents=ascii, ngram_range=(1, 3), analyzer = 'word',max_df=0.6, min_df=.01)
bow = np.array(vectorizer.fit_transform(corpus).toarray())
#print("extracted " + str(len(vectorizer.get_feature_names())) + " features from " + str(len(corpus)) + " documents\n")
#print(vectorizer.get_feature_names())
#print(bow)

In [3]:
#print(bow)

#print(type(bow))
#print(bow.shape)
#print(len(vectorizer.get_feature_names()))

dtm = pd.DataFrame(bow, columns=vectorizer.get_feature_names())
new_data = pd.concat([data, dtm], axis=1)
#print(new_data)
new_data.to_csv("dtm_jobs.csv", header= True)

In [4]:
corpus_null = data['Minimum.Qualifications']
corpus= corpus_null.fillna(" ") # clean 15 empty lines, replace null entries with space

#replacing punctuation
corpus = corpus.str.replace('\'',' ')
corpus = corpus.str.replace("-",' ')
corpus = corpus.str.replace(",",'')
corpus = corpus.str.replace("\\n",'. ')
corpus = corpus.str.replace('[^\w\s]','.')
corpus = corpus.str.replace("\..",'.')


#corpus = corpus.str.replace('.','')

st = PorterStemmer()
corpus = corpus.apply(lambda x: " ".join([st.stem(word) for word in x.split()]))#stem words

# tokenize all words 3 letters or longer.  Ignore 3-grams that occur in more than 727 or in less than 3 descriptions
vectorizer = CountVectorizer(stop_words = 'english', strip_accents=ascii, ngram_range=(3,3), analyzer = 'word',max_df=0.6, min_df=.005)
bow = np.array(vectorizer.fit_transform(corpus).toarray())
#print("extracted " + str(len(vectorizer.get_feature_names())) + " features from " + str(len(corpus)) + " documents\n")
#print(vectorizer.get_feature_names())

extracted 608 features from 1227 documents

['10 week month', '10 week program', '10 year experi', '10 year relev', '11 12 week', '11 week program', '12 week program', '12 year experi', '12 year relev', '2018 2019 2020', '2018 2019 pleas', '2018 abil speak', '2018 author work', '2018 decemb 2018', '2018 juli 2019', '2018 june 2019', '2019 2020 current', '2019 2020 design', '2019 abl complet', '2019 author work', '2019 avail intern', '2019 pleas includ', '2019 return educ', '2020 current enrol', '2020 design portfolio', '40 time requir', 'aas infrastructur servic', 'aas markets experi', 'aas platform servic', 'abil commit minimum', 'abil speak write', 'abil travel 40', 'abil travel 50', 'abl complet 11', 'abl complet minimum', 'abl speak write', 'account equival practic', 'administr ba program', 'advertis sale market', 'agenc corpor setting', 'agenc hous corpor', 'agil partner mid', 'aipei offic septemb', 'akarta offic septemb', 'anagement sale relationship', 'ani major univers', 'anthr

In [5]:
corpus_null = data['Preferred.Qualifications']
corpus= corpus_null.fillna(" ") # clean 15 empty lines, replace null entries with space

#replacing punctuation
corpus = corpus.str.replace('\'',' ')
corpus = corpus.str.replace("-",' ')
corpus = corpus.str.replace(",",'')
corpus = corpus.str.replace("\\n",'. ')
corpus = corpus.str.replace('[^\w\s]','.')
corpus = corpus.str.replace("\..",'.')


#corpus = corpus.str.replace('.','')

st = PorterStemmer()
corpus = corpus.apply(lambda x: " ".join([st.stem(word) for word in x.split()]))#stem words

# tokenize all words 3 letters or longer.  Ignore 3-grams that occur in more than 727 or in less than 3 descriptions
vectorizer = CountVectorizer(stop_words = 'english', strip_accents=ascii, ngram_range=(3,3), analyzer = 'word',max_df=0.6, min_df=.005)
bow = np.array(vectorizer.fit_transform(corpus).toarray())
#print("extracted " + str(len(vectorizer.get_feature_names())) + " features from " + str(len(corpus)) + " documents\n")
#print(vectorizer.get_feature_names())

extracted 1383 features from 1227 documents

['10 year experi', '10 year partner', '10 year relev', '2018 2019 2020', '2019 2020 demonstr', '2019 2020 relev', '2020 demonstr profici', '2020 relev work', 'aa iaas work', 'aas aas experi', 'aas aas technologies', 'aas experi work', 'aas infrastructur servic', 'aas technologies experi', 'abil adapt messag', 'abil analyz complex', 'abil appli brand', 'abil build influenti', 'abil build relationship', 'abil build strong', 'abil collabor build', 'abil comfort interact', 'abil commun complex', 'abil commun effect', 'abil craft compel', 'abil deal ambigu', 'abil deleg motiv', 'abil deliv high', 'abil drive implement', 'abil effect convers', 'abil effect influenc', 'abil effect oper', 'abil engag develop', 'abil engag senior', 'abil handl custom', 'abil handl highli', 'abil initi build', 'abil lead handl', 'abil lead manag', 'abil manag multipl', 'abil manag numer', 'abil meet exceed', 'abil multi task', 'abil navig ambigu', 'abil oper fast', 'a

In [6]:
corpus_null = data['Title']
corpus= corpus_null.fillna(" ") # clean 15 empty lines, replace null entries with space

#replacing punctuation
corpus = corpus.str.replace('\'',' ')
corpus = corpus.str.replace("-",' ')
corpus = corpus.str.replace(",",'')
corpus = corpus.str.replace("\\n",'. ')
corpus = corpus.str.replace('[^\w\s]','.')
corpus = corpus.str.replace("\..",'.')


#corpus = corpus.str.replace('.','')

st = PorterStemmer()
corpus = corpus.apply(lambda x: " ".join([st.stem(word) for word in x.split()]))#stem words

# tokenize all words 3 letters or longer.  Ignore 2-grams that occur in more than 727 or in less than 3 descriptions
vectorizer = CountVectorizer(stop_words = 'english', strip_accents=ascii, ngram_range=(2,2), analyzer = 'word',max_df=0.6, min_df=.0025)
bow = np.array(vectorizer.fit_transform(corpus).toarray())
#print("extracted " + str(len(vectorizer.get_feature_names())) + " features from " + str(len(corpus)) + " documents\n")
#print(vectorizer.get_feature_names())

extracted 273 features from 1227 documents

['2018 argentina', '2018 brazil', '2018 colombia', '2018 mexico', 'account execut', 'account lead', 'account manag', 'account strategist', 'ach integr', 'ad commerc', 'ad googl', 'administr busi', 'analyst consum', 'analyst data', 'analyst googl', 'analyst market', 'analyst product', 'analyst trust', 'analyt lead', 'analyt manag', 'android partner', 'architect googl', 'architect healthcar', 'art time', 'associ account', 'associ legal', 'attribut specialist', 'bilingual 2018', 'bold intern', 'brand studio', 'busi analyst', 'busi develop', 'busi intern', 'busi manag', 'busi partner', 'busi strategi', 'buy solut', 'chain manag', 'channel sale', 'channel specialist', 'cloud ach', 'cloud engin', 'cloud mea', 'cloud nglish', 'cloud platform', 'cloud profession', 'cloud support', 'cluster manag', 'commun googl', 'commun manag', 'commun program', 'consult ad', 'consult android', 'consult googl', 'consum hardwar', 'counsel commerci', 'custom develop',

In [7]:
# extract minimum degree qualifications
phd = []
babs = []
mstr = []
dct = []
mba = []
asc = []
for a in data['Minimum.Qualifications']:
    if a is None:
        a = ""
    if (str(a).lower().find("phd") != -1):
        phd.append(1)
    else:
        phd.append(0)
    if (str(a).lower().find("ba/bs") != -1 or str(a).lower().find("bachelor") != -1):
        babs.append(1)
    else:
        babs.append(0)
    if (str(a).lower().find("ms") != -1 or str(a).lower().find("master") != -1):
        mstr.append(1)
    else:
        mstr.append(0)
    if (str(a).lower().find("juris") != -1):
        dct.append(1)
    else:
        dct.append(0)
    if (str(a).lower().find("mba") != -1):
        mba.append(1)
    else:
        mba.append(0)
    if (str(a).lower().find("aa") != -1 or str(a).lower().find("as") != -1):
        asc.append(1)
    else:
        asc.append(0)

# print("PhDs = " + str(sum(phd[1:])))
# print("BA/BSs = " + str(sum(babs[1:])))
# print("Masters = " + str(sum(mstr[1:])))
# print("Lawyers = " + str(sum(dct[1:])))
# print("MBAs = " + str(sum(mba[1:])))
# print("Associates = " + str(sum(asc[1:])))

PhDs = 8
BA/BSs = 978
Masters = 308
Lawyers = 6
MBAs = 71
Associates = 440


In [8]:
# append minimum degree qualifications to data set
minquals = {'min phd' : pd.Series(phd),
              'min ba/bs' : pd.Series(babs),
              'min master' : pd.Series(mstr),
              'min juris' : pd.Series(dct),
              'min mba' : pd.Series(mba),
              'min associate' : pd.Series(asc)}
dtm = pd.DataFrame(minquals)
new_data = pd.concat([new_data, dtm], axis=1)

In [9]:
# extract preferred degree qualifications
phd = []
babs = []
mstr = []
dct = []
mba = []
asc = []
for a in data['Preferred.Qualifications']:
    if a is None:
        a = ""
    if (str(a).lower().find("phd") != -1):
        phd.append(1)
    else:
        phd.append(0)
    if (str(a).lower().find("ba/bs") != -1 or str(a).lower().find("bachelor") != -1):
        babs.append(1)
    else:
        babs.append(0)
    if (str(a).lower().find("ms") != -1 or str(a).lower().find("master") != -1):
        mstr.append(1)
    else:
        mstr.append(0)
    if (str(a).lower().find("juris") != -1):
        dct.append(1)
    else:
        dct.append(0)
    if (str(a).lower().find("mba") != -1):
        mba.append(1)
    else:
        mba.append(0)
    if (str(a).lower().find("aa") != -1 or str(a).lower().find("as") != -1):
        asc.append(1)
    else:
        asc.append(0)

# print("pref PhDs = " + str(sum(phd[1:])))
# print("pref BA/BSs = " + str(sum(babs[1:])))
# print("pref Masters = " + str(sum(mstr[1:])))
# print("pref Lawyers = " + str(sum(dct[1:])))
# print("pref MBAs = " + str(sum(mba[1:])))
# print("pref Associates = " + str(sum(asc[1:])))

pref PhDs = 66
pref BA/BSs = 22
pref Masters = 671
pref Lawyers = 11
pref MBAs = 199
pref Associates = 872


In [10]:
# append preferred degree qualifications to data set
prefquals = {'pref phd' : pd.Series(phd),
              'pref ba/bs' : pd.Series(babs),
              'pref master' : pd.Series(mstr),
              'pref juris' : pd.Series(dct),
              'pref mba' : pd.Series(mba),
              'pref associate' : pd.Series(asc)}
dtm = pd.DataFrame(prefquals)
new_data = pd.concat([new_data, dtm], axis=1)
new_data.to_csv("dtm_jobs_v2.csv", header= True)