In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.phrases import Phrases
from gensim.models.phrases import Phraser
from nltk import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import matplotlib.pyplot as plt
%matplotlib inline

# A function to extract the city name from the string giving the location
def get_job_city(job_loc):
    """Return the part of a location string that precedes the first comma.

    The argument is converted with ``str`` first, so non-string values are
    handled via their string form. When there is no comma, the whole string
    is returned.
    """
    city, _, _ = str(job_loc).partition(",")
    return city

# A function to extract the state from the string giving the location
def get_job_state(job_loc):
    """Return the first whitespace-delimited word after the first comma.

    The argument is converted with ``str`` and split on commas; the second
    piece is then split on whitespace and its first word is returned
    (e.g. ``"San Francisco, CA 94105"`` -> ``"CA"``).

    Returns an empty string when the location has no comma, or when nothing
    but whitespace follows the first comma (e.g. ``"Austin,"``) -- the latter
    previously raised ``IndexError``.
    """
    parts = str(job_loc).split(",")
    if len(parts) < 2:
        return ""
    # The piece after the comma may be empty or blank; guard before indexing.
    words = parts[1].split()
    return words[0] if words else ""
    
# A class to fit a first-order phrase model to a series of job titles
class PhraseBigram(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that learns bigram phrases from text.

    Every document is tokenized with NLTK's ``word_tokenize``; tokens found in
    ``punct_list`` or ``stop_list`` are dropped (membership is tested on the
    raw token, so it is case-sensitive) and the remaining tokens are stemmed
    with the Lancaster stemmer. ``fit`` trains a gensim phrase model on those
    token lists; ``transform`` applies it and joins each document's resulting
    tokens with "-".

    Parameters
    ----------
    punct_list : container of str
        Tokens (typically punctuation) to discard before stemming.
    stop_list : container of str
        Further tokens to discard before stemming.

    Attributes
    ----------
    bigram : Phraser
        The fitted phrase model; only available after ``fit``.
    """
    def __init__(self, punct_list, stop_list):
        self.punct_list = punct_list
        self.stop_list = stop_list

    def _stem_documents(self, X):
        """Tokenize, filter and stem each document in X; return a list of token lists."""
        from nltk.stem.lancaster import LancasterStemmer
        from nltk import word_tokenize
        lancaster_stemmer = LancasterStemmer()
        # Set lists of characters/words to exclude
        punct_list = self.punct_list
        stop_list = self.stop_list
        # Lower-casing lives in this single helper so fit and transform cannot
        # drift apart (fit used to lower-case, transform did not).
        # NOTE(review): NLTK's LancasterStemmer appears to lower-case its input
        # itself, in which case this changes no output -- confirm for the
        # installed NLTK version.
        return [
            [lancaster_stemmer.stem(token.lower())
             for token in word_tokenize(doc)
             if token not in punct_list and token not in stop_list]
            for doc in X
        ]

    def fit(self, X, y = None):
        """Train the phrase model on the documents in X and store it as ``self.bigram``.

        ``y`` is accepted for scikit-learn compatibility and ignored.
        Returns ``self``.
        """
        # Based on code I saw here: https://www.reddit.com/r/learnmachinelearning/comments/5onknw/python_nlp_need_advice_about_gensim_phrasesphraser/
        from gensim.models.phrases import Phrases
        from gensim.models.phrases import Phraser
        # Get sentence stream from titles
        bigram_stream = self._stem_documents(X)
        # NOTE(review): a bytes delimiter matches older gensim releases; newer
        # gensim versions may expect a str here -- confirm against the
        # installed version.
        self.bigram = Phraser(Phrases(bigram_stream, min_count=3, threshold=3, delimiter=b' '))
        return self

    def transform(self, X):
        """Apply the fitted phrase model to each document in X.

        Returns a ``pd.Series`` (default integer index) with one string per
        document: the document's stemmed tokens/phrases joined by "-".
        Requires ``fit`` to have been called (reads ``self.bigram``).
        """
        bigram = self.bigram
        # NOTE(review): tokens that themselves contain "-" become
        # indistinguishable from token boundaries once joined this way.
        x_list = ["-".join(bigram[doc]) for doc in self._stem_documents(X)]
        return pd.Series(x_list)

# Function to use as custom tokenizer for results of PhraseBigram.transform
def dash_tokenizer(sent):
    """Split a string on "-" and return the resulting list of pieces."""
    tokens = sent.split("-")
    return tokens

# Load the two CSV files produced by the earlier Indeed scrape into dataframes:
# one holding the job-ad metadata, the other the job descriptions.
job_metadata = pd.read_csv(filepath_or_buffer="job_ad_metadata_v2.csv")
job_descriptions = pd.read_csv(filepath_or_buffer="job_ad_descriptions_v2.csv")

In [2]:
# Build the (still unfitted) text-feature pipeline for job titles:
# phrase detection first, then token counts over the dash-joined output.
title_phrase_model = PhraseBigram(
    punct_list=[".", "-", "_", "!", "?", "[", "]", "(", ")", "%", "$", "&", ",", "/", ":", "–", " "],
    stop_list=["the", "of", "a", "CA"],
)
job_title_count_vec = CountVectorizer(
    tokenizer=dash_tokenizer,
    stop_words='english',
    min_df=5,
    max_df=0.95,
)
title_pipeline = Pipeline(steps=[
    ('phrase_model', title_phrase_model),
    ('count_vec', job_title_count_vec),
])

In [3]:
# Fit the title pipeline on every job title in the metadata table
title_pipe_fit = title_pipeline.fit(job_metadata["job_title"])

# Keep direct handles to the two fitted steps of the pipeline
title_phrase_model, title_count_model = (
    title_pipe_fit.named_steps[step_name]
    for step_name in ("phrase_model", "count_vec")
)

In [4]:
# Keep only the rows of the description table that have no missing values
job_descriptions_omit = job_descriptions.dropna()

# Initialize models for job descriptions
descr_phrase_model = PhraseBigram([".", "-", "_", "!", "?", "[", "]", "(", ")", "%", "$", "&", ",", "/", ":", "–", " ", ">", "#", "@"], ["the", "of", "a", "CA"])
descr_count_vec = CountVectorizer(max_df = 0.95, min_df = 20, stop_words = 'english', tokenizer = dash_tokenizer)
descr_pipeline = Pipeline([('phrase_model', descr_phrase_model), ('count_vec', descr_count_vec)])

# Fit the pipeline once and keep the document-term counts from that same pass
# (previously the pipeline was fitted and then both steps were fitted again).
descr_transform_dense = descr_pipeline.fit_transform(job_descriptions_omit.description).todense()
# Pipeline.fit returned the pipeline itself, so the fitted pipeline is the same object
descr_pipe_fit = descr_pipeline
descr_phrase_model = descr_pipe_fit.named_steps["phrase_model"]
descr_count_model = descr_pipe_fit.named_steps["count_vec"]

# Newer scikit-learn releases only offer get_feature_names_out(); fall back to
# get_feature_names() on releases that predate it.
if hasattr(descr_count_model, "get_feature_names_out"):
    descr_transform_dense_names = list(descr_count_model.get_feature_names_out())
else:
    descr_transform_dense_names = descr_count_model.get_feature_names()

# One row per description (default integer index), one column per token
descr_count_dense = pd.DataFrame(descr_transform_dense)
descr_count_dense.columns = descr_transform_dense_names

In [5]:
# Document-term counts for every job title, using the title models that were
# already fitted on this same data (no need to fit them a second time).
title_transform_dense = title_count_model.transform(title_phrase_model.transform(job_metadata.job_title)).todense()

# Feature (token) names of the fitted count model. Newer scikit-learn releases
# only offer get_feature_names_out(); fall back on releases that predate it.
if hasattr(title_count_model, "get_feature_names_out"):
    title_transform_dense_names = list(title_count_model.get_feature_names_out())
else:
    title_transform_dense_names = title_count_model.get_feature_names()

# Apply (without refitting) the title models to the titles that also have descriptions
title_descr_dense = title_count_model.transform(title_phrase_model.transform(job_descriptions_omit.job_title)).todense()
# One row per described job (default integer index), one column per title token
title_descr_dense = pd.DataFrame(title_descr_dense)
title_descr_dense.columns = title_transform_dense_names

In [6]:
# Cross-tabulate the count of the token "stat" in the job title (rows) against
# whether the count of the token "sas" in the description equals zero (columns).
# Note the column labels: True means "sas" does NOT occur in the description.
stat_sas_table = pd.crosstab(
    index=title_descr_dense["stat"],
    columns=descr_count_dense["sas"].eq(0),
)
stat_sas_table

sas,False,True
stat,Unnamed: 1_level_1,Unnamed: 2_level_1
0,37,790
1,12,15


In [7]:
# Fisher's exact test on the contingency table; the call returns a pair and
# element 1 is the (two-sided) p-value.
sp.stats.fisher_exact(stat_sas_table, alternative="two-sided")[1]

2.9536023505546214e-09