In [1]:
import string
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Build a bag-of-n-grams representation of the job responsibilities text.
#
# An n-gram CountVectorizer collects frequent words and phrases from the
# 'Responsibilities' column.
#
# See scikit-learn.org/stable/modules/feature_extraction.html and
# scikit-learn.org/stable/modules/classes.html#text-feature-extraction-ref
# for the CountVectorizer parameters.

data = pd.read_csv('job_skill_short.csv', dtype=str)
# Replace null entries with a space so every document is a string
# (the original author noted 15 empty rows in this file).
corpus = data['Responsibilities'].fillna(" ")

# Normalise punctuation. `regex` is passed explicitly on every call because the
# default of Series.str.replace differs between pandas versions; relying on
# the default silently turns the character-class pattern into a literal.
corpus = corpus.str.replace("'", " ", regex=False)
corpus = corpus.str.replace("-", " ", regex=False)
corpus = corpus.str.replace(",", "", regex=False)
corpus = corpus.str.replace("\n", ". ", regex=False)        # line breaks end a sentence
corpus = corpus.str.replace(r"[^\w\s]", ".", regex=True)    # any other punctuation -> period
# Collapse runs of periods into one stand-alone period token. The previous
# pattern ("\..") also swallowed the character following each period, which
# deleted letters/digits and glued neighbouring words together.
corpus = corpus.str.replace(r"\.+", " . ", regex=True)

# Stem every whitespace-separated token. Periods are separate tokens at this
# point, so words at sentence boundaries are stemmed like any other word.
st = PorterStemmer()
corpus = corpus.apply(lambda text: " ".join(st.stem(word) for word in text.split()))

# Count 1- to 3-grams of word tokens (scikit-learn's default token pattern keeps
# tokens of two or more alphanumeric characters), dropping English stop words.
# n-grams that occur in more than 60% or in fewer than 1% of the descriptions
# are ignored.
# strip_accents must be the *string* 'ascii'; the bare name `ascii` is Python's
# builtin function, which would repr-quote and escape each document instead of
# folding accented characters.
vectorizer = CountVectorizer(
    stop_words='english',
    strip_accents='ascii',
    ngram_range=(1, 3),
    analyzer='word',
    max_df=0.6,
    min_df=.01,
)
# Dense (n_documents, n_features) count matrix; toarray() already returns an ndarray.
bow = vectorizer.fit_transform(corpus).toarray()

In [3]:
# Turn the count matrix into a labelled document-term matrix, attach it to the
# original rows and export the result.

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

dtm = pd.DataFrame(bow, columns=feature_names)
# Column-wise concat aligns on the index; both frames use the default
# positional index here (read_csv was called without index_col).
new_data = pd.concat([data, dtm], axis=1)
new_data.to_csv("dtm_jobs.csv", header= True)

In [4]:
# Extract minimum degree qualifications.
#
# Each name below ends up as a list of 0/1 ints, one entry per row of `data`,
# telling whether the 'Minimum.Qualifications' text mentions that degree.
#
# Two-letter abbreviations (MS, AA, AS) are matched as whole, upper-case words.
# A lower-cased substring search flags almost every row, because "ms" occurs
# inside "systems"/"programs" and "as" inside "has"/"such as"/"database".
# Longer keywords keep the case-insensitive substring match.
min_quals = data['Minimum.Qualifications'].fillna("").astype(str)  # missing text -> no match

phd = min_quals.str.contains("phd", case=False, regex=False).astype(int).tolist()
babs = (
    min_quals.str.contains("ba/bs", case=False, regex=False)
    | min_quals.str.contains("bachelor", case=False, regex=False)
).astype(int).tolist()
mstr = (
    min_quals.str.contains(r"\bMS\b", regex=True)
    | min_quals.str.contains("master", case=False, regex=False)
).astype(int).tolist()
dct = min_quals.str.contains("juris", case=False, regex=False).astype(int).tolist()
mba = min_quals.str.contains("mba", case=False, regex=False).astype(int).tolist()
asc = min_quals.str.contains(r"\b(?:AA|AS)\b", regex=True).astype(int).tolist()

In [5]:
# Attach the minimum-qualification indicator columns to the combined data set.
minquals = {
    column: pd.Series(flags)
    for column, flags in (
        ('min phd', phd),
        ('min ba/bs', babs),
        ('min master', mstr),
        ('min juris', dct),
        ('min mba', mba),
        ('min associate', asc),
    )
}
dtm = pd.DataFrame(minquals)
# Side-by-side join: the new indicator columns go to the right of the existing ones.
new_data = pd.concat([new_data, dtm], axis="columns")

In [6]:
# Extract preferred degree qualifications.
#
# Each name below ends up as a list of 0/1 ints, one entry per row of `data`,
# telling whether the 'Preferred.Qualifications' text mentions that degree.
#
# Two-letter abbreviations (MS, AA, AS) are matched as whole, upper-case words.
# A lower-cased substring search flags almost every row, because "ms" occurs
# inside "systems"/"programs" and "as" inside "has"/"such as"/"database".
# Longer keywords keep the case-insensitive substring match.
pref_quals = data['Preferred.Qualifications'].fillna("").astype(str)  # missing text -> no match

phd = pref_quals.str.contains("phd", case=False, regex=False).astype(int).tolist()
babs = (
    pref_quals.str.contains("ba/bs", case=False, regex=False)
    | pref_quals.str.contains("bachelor", case=False, regex=False)
).astype(int).tolist()
mstr = (
    pref_quals.str.contains(r"\bMS\b", regex=True)
    | pref_quals.str.contains("master", case=False, regex=False)
).astype(int).tolist()
dct = pref_quals.str.contains("juris", case=False, regex=False).astype(int).tolist()
mba = pref_quals.str.contains("mba", case=False, regex=False).astype(int).tolist()
asc = pref_quals.str.contains(r"\b(?:AA|AS)\b", regex=True).astype(int).tolist()

In [7]:
# Attach the preferred-qualification indicator columns and write the final table.
prefquals = {
    column: pd.Series(flags)
    for column, flags in (
        ('pref phd', phd),
        ('pref ba/bs', babs),
        ('pref master', mstr),
        ('pref juris', dct),
        ('pref mba', mba),
        ('pref associate', asc),
    )
}
dtm = pd.DataFrame(prefquals)
# Side-by-side join: the new indicator columns go to the right of the existing ones.
new_data = pd.concat([new_data, dtm], axis="columns")
new_data.to_csv("dtm_jobs_v2.csv", header=True)