# Salary estimator from listings

The `city_state.json` file was adapted from the GitHub repository [agalea91 - city_to_state_dictionary](https://github.com/agalea91/city_to_state_dictionary/blob/master/city_to_state.py).

The `state_abbr.json` file was adapted from the GitHub gist [JeffPaine - us_state_abbreviations.py](https://gist.github.com/JeffPaine/3083347).

The job posting dataset can be found on Kaggle: [LinkedIn Job Postings (2023 - 2024)](https://www.kaggle.com/datasets/arshkon/linkedin-job-postings).

## Setup

In [83]:
%%capture
%pip install pandas xgboost scikit-learn plotly gensim #swifter
print('')

First we must import our packages to manage the dataset. Then we can import the data.

### Imports
Import the packages and project modules used throughout the notebook.

In [84]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from importlib import reload
from IPython.display import HTML, display
from data import DataManager
from wordmod import Job2Vec
from catword import Categorizer

def load_scripts():
    """Reload the project modules and return their refreshed classes.

    ``importlib.reload`` only accepts module objects; handing it the imported
    classes themselves raises ``TypeError``. Each class's defining module is
    therefore looked up through ``cls.__module__`` in ``sys.modules``, reloaded,
    and the class is then read back from the reloaded module.

    Returns:
        tuple: ``(DataManager, Job2Vec, Categorizer)`` taken from the freshly
        reloaded modules. Rebind the notebook names to this tuple so that later
        cells use the reloaded definitions.
    """
    import sys

    refreshed = []
    for cls in (DataManager, Job2Vec, Categorizer):
        module = reload(sys.modules[cls.__module__])
        # Fetch the class by name from the new module object; the old class
        # object still points at the pre-reload code.
        refreshed.append(getattr(module, cls.__name__))
    return tuple(refreshed)

# (DataManager, Job2Vec, Categorizer) = load_scripts()

Extract the job posting data from the CSV and clean it.

In [85]:
# Build the data manager and work on a copy of its postings frame, so edits to
# `df` in later cells do not reach the object returned by the manager.
dm = DataManager()
df = dm.get_postings().copy()

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line after the report.
df.info()

def shorten_long_cols(row, cols=('description', 'skills_desc'), max_len=150):
    """Return a copy of ``row`` with long text fields truncated for display.

    Args:
        row: Record supporting ``copy()`` and item access by column name, such
            as the ``pandas.Series`` passed by ``DataFrame.apply(..., axis=1)``.
        cols: Names of the text fields to shorten.
        max_len: Maximum number of characters kept from each field.

    Returns:
        A copy of ``row`` in which every string in ``cols`` longer than
        ``max_len`` is cut to ``max_len`` characters followed by ``'...'``.
        Shorter strings and non-string values (e.g. NaN) are returned unchanged,
        so the ellipsis only appears where text was actually removed.
    """
    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by DataFrame.apply.
    shortened = row.copy()
    for name in cols:
        value = shortened[name]
        if isinstance(value, str) and len(value) > max_len:
            shortened[name] = value[:max_len] + '...'
    return shortened

# Render the first three postings as an HTML table, with the long text columns
# passed through shorten_long_cols row by row.
preview = df.head(3).apply(shorten_long_cols, axis=1)
display(HTML(preview.to_html()))


Retrieving an existing dataset at c:\dev\job-estimator/archive/clean_postings.bin
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 17 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      123849 non-null  int64  
 1   company_name                122130 non-null  object 
 2   title                       123849 non-null  object 
 3   description                 123842 non-null  object 
 4   max_salary                  29417 non-null   float64
 5   pay_period                  36073 non-null   object 
 6   location                    123849 non-null  object 
 7   company_id                  122132 non-null  float64
 8   med_salary                  6199 non-null    float64
 9   min_salary                  29369 non-null   float64
 10  formatted_work_type         123849 non-null  object 
 11  formatted_experience_level  94440 non-null   obj

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,med_salary,min_salary,formatted_work_type,formatted_experience_level,skills_desc,work_type,currency,state,avg_salary
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in New Jersey is seeking an administrative Marketing Coordinator with some experience in graphic design. You...,38798.991928,HOURLY,"Princeton, NJ",2774458.0,,32979.143139,Full-time,,"Requirements: \n\nWe are seeking a College or Graduate Student (can also be completed with school) with a focus in Planning, Architecture, Real Estate D...",FULL_TIME,USD,NJ,35889.067533
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committed to serving clients with best practices to help them with change, improvements and better quality of l...",96997.47982,HOURLY,"Fort Collins, CO",,,58198.487892,Full-time,,,FULL_TIME,USD,CO,77597.983856
2,10998357,The National Exemplar,Assitant Restaurant Manager,"The National Exemplar is accepting applications for an Assistant Restaurant Manager.\nWe offer highly competitive wages, healthcare, paid time off, com...",65000.0,YEARLY,"Cincinnati, OH",64896719.0,,45000.0,Full-time,,We are currently accepting resumes for FOH - Asisstant Restaurant Management with a strong focus on delivering high quality customer service. Prefer 1...,FULL_TIME,USD,OH,55000.0


### Create a statistical summary of the data.

In [86]:
# Salary columns summarised for each pay period, in the order listed here.
pay_cols = ['max_salary', 'med_salary', 'min_salary']
pay_period_types = ['YEARLY', 'MONTHLY', 'BIWEEKLY', 'WEEKLY', 'HOURLY']

for pay_period in pay_period_types:
    # Keep only the postings advertised with this pay period.
    in_period = df['pay_period'].eq(pay_period)
    pay_period_df = df.loc[in_period, pay_cols]

    print(pay_period)
    summary = pay_period_df.describe()
    styled = summary.style.format(precision=0, thousands=",")
    display(HTML(styled.to_html()))


YEARLY


Unnamed: 0,max_salary,med_salary,min_salary
count,18736,1451,18701
mean,139481,84890,98622
std,83751,46189,50106
min,12000,10000,10000
25%,85000,54000,65000
50%,120000,72000,89250
75%,170000,105400,120000
max,1500000,300500,750000


MONTHLY


Unnamed: 0,max_salary,med_salary,min_salary
count,285,224,277
mean,103996,36286,73636
std,107141,26630,77116
min,12000,12000,12000
25%,66996,25398,45864
50%,89364,28716,68208
75%,119184,36000,81120
max,1320000,300000,1080000


BIWEEKLY


Unnamed: 0,max_salary,med_salary,min_salary
count,9,0.0,9
mean,74550,,60370
std,14119,,6909
min,53482,,53482
25%,66976,,54009
50%,81434,,58370
75%,87516,,64519
max,89965,,71323


WEEKLY


Unnamed: 0,max_salary,med_salary,min_salary
count,177,0.0,177
mean,111668,,111452
std,21408,,21370
min,50804,,50804
25%,100360,,99684
50%,109148,,109148
75%,119658,,119658
max,210548,,210548


HOURLY


Unnamed: 0,max_salary,med_salary,min_salary
count,10210,4524,10205
mean,74286,49051,59137
std,47162,32504,38183
min,10335,10335,10100
25%,42679,31621,34919
50%,58354,38838,46559
75%,96997,54319,77598
max,581985,288994,484987


### Display a bar graph of average salaries by state.

In [87]:
# Per-state mean of avg_salary plus the number of postings behind each mean.
df = dm.get_postings_with_pay()[['state','avg_salary']].copy()

groups = df.groupby('state')
group_count = groups.count()
df = groups.mean()
# group_count is a one-column frame; pick the column explicitly so a Series is
# assigned to 'count'.
df['count'] = group_count['avg_salary']
# Drop incomplete *rows* (states). axis=1 would discard a whole column as soon
# as one state had a missing value, and sort_values would then fail on it.
df = df.dropna(axis=0).sort_values(by='avg_salary')

# Bars show the average salary per state (left axis); the line shows how many
# postings each average is based on (right axis, log scale).
states = df.index.values
salary_bars = go.Bar(x=states, y=df['avg_salary'], name="Average Salary")
sample_line = go.Scatter(x=states, y=df['count'], name="Sample Size")

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(salary_bars, secondary_y=False)
fig.add_trace(sample_line, secondary_y=True)

fig.update_xaxes(title_text="State", tickangle=90)

# One title per y-axis: primary for salary, secondary for the sample size.
fig.update_yaxes(title_text="Dollars per year", secondary_y=False)
fig.update_yaxes(title_text="Job Listings (log)", secondary_y=True, type="log")

fig.show()

Dropping rows where every pay column is empty.


### Load the Job2Vec word vectors used to build the dataset

In [88]:
# Announce the load, then build the Job2Vec wrapper and fetch its model.
# `job2vec` and `j2v` are both used by later cells.
print("Loading j2v word vectors.")

job2vec = Job2Vec()
j2v = job2vec.get_model()

Loading j2v word vectors.
Retrieving an existing model from c:\dev\job-estimator/assets/w2v/w2v.model


### Create a model to generate entity embeddings for XGBoost

In [104]:
# Start again from the full postings table.
df = dm.get_postings().copy()

# Feature columns fed to the model and the regression target.
x_cols = [
    'state',
    'pay_period',
    'formatted_work_type',
    'formatted_experience_level',
    'title',
]
y_col = 'avg_salary'

# Keep rows that have a title, a state and a strictly positive target value.
required_cols = ['title', 'state', y_col]
has_required = df[required_cols].notna().all(axis=1)
has_positive_pay = df[y_col].gt(0)
mask = has_required & has_positive_pay

# reset_index() keeps the previous row labels in an 'index' column.
df = df.loc[mask, x_cols + [y_col]].copy().reset_index()

In [106]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score
import xgboost as xgb

# Vector length reported by the Job2Vec wrapper.
# NOTE(review): not referenced again in the visible cells (title_to_vec asks
# job2vec for the length itself) — confirm before relying on or removing it.
vector_length = job2vec.get_vector_length()

def title_to_vec(titles: pd.DataFrame):
    """Convert a frame of job titles into a frame of Job2Vec vector components.

    Each row of ``titles.values`` is handed to ``job2vec.vectorize``; the
    resulting vectors become the rows of the returned frame.

    Args:
        titles: Frame holding the title column selected by the column
            transformer.

    Returns:
        pd.DataFrame: One row per input row, with columns ``title0`` …
        ``title{n-1}`` where ``n`` is ``job2vec.get_vector_length()``.
    """
    width = job2vec.get_vector_length()
    component_names = [f'title{i}' for i in range(width)]
    vectors = list(map(job2vec.vectorize, titles.values))
    return pd.DataFrame(vectors, columns=component_names)

# Categorical features are one-hot encoded; categories unseen at fit time are
# ignored at transform time instead of raising.
non_title_cols = ['state', 'pay_period', 'formatted_work_type', 'formatted_experience_level']
cat_pipe = Pipeline(steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))])

# The title column is expanded into word-vector components by title_to_vec.
title_pipe = Pipeline(steps=[("to_vec", FunctionTransformer(title_to_vec))])

col_transformer = ColumnTransformer(
    transformers=[
        ("cat", cat_pipe, non_title_cols),
        ("title", title_pipe, ['title']),
    ]
)

# Single-step pipeline for now; a DMatrix conversion step was tried and left
# disabled: ('to_dmx', FunctionTransformer(xgb.DMatrix))
preprocessor = Pipeline(steps=[('col_trfm', col_transformer)])

x = df[x_cols]
y = df[y_col]

preprocessor = preprocessor.fit(x, y)

In [135]:
# Gradient-boosted regressor; MAE on the evaluation set drives early stopping.
xgb_reg: xgb.XGBRegressor = xgb.XGBRegressor(
    booster='dart',
    random_state=1,
    n_estimators=40,
    max_depth=6,
    eta=0.1, 
    subsample=0.7, 
    colsample_bytree=0.8, 
    objective='reg:squarederror',
    eval_metric='mae',
    early_stopping_rounds=10,
    )

training_pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ('reg', xgb_reg)
])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=1)

# training_pipe.fit() below refits `preprocessor` on x_train. The evaluation set
# bypasses the pipeline, so it must be encoded by a preprocessor fitted on the
# same rows; otherwise a category present only in the held-out split gives the
# training and evaluation matrices different one-hot layouts.
preprocessor.fit(x_train, y_train)
x_test = preprocessor.transform(x_test)

xgb_pipe = training_pipe.fit(x_train, y_train, 
                             reg__eval_set=[(x_test, y_test)],
                             )

print(xgb_reg.best_score)

[0]	validation_0-mae:42431.55911
[1]	validation_0-mae:40882.95235
[2]	validation_0-mae:39672.90364
[3]	validation_0-mae:38467.20075
[4]	validation_0-mae:37379.19312
[5]	validation_0-mae:36599.35112
[6]	validation_0-mae:35789.77547
[7]	validation_0-mae:35138.62161
[8]	validation_0-mae:34536.86287
[9]	validation_0-mae:34065.23017
[10]	validation_0-mae:33682.70845
[11]	validation_0-mae:33366.54325
[12]	validation_0-mae:33073.70173
[13]	validation_0-mae:32789.25170
[14]	validation_0-mae:32574.29099
[15]	validation_0-mae:32373.91274
[16]	validation_0-mae:32203.14828
[17]	validation_0-mae:32109.11060
[18]	validation_0-mae:31989.28685
[19]	validation_0-mae:31859.99447
[20]	validation_0-mae:31754.10971
[21]	validation_0-mae:31672.50186
[22]	validation_0-mae:31578.64976
[23]	validation_0-mae:31513.58019
[24]	validation_0-mae:31443.76191
[25]	validation_0-mae:31399.70420
[26]	validation_0-mae:31368.92921
[27]	validation_0-mae:31339.52294
[28]	validation_0-mae:31310.77158
[29]	validation_0-mae:31

In [130]:
# NOTE(review): absolute, machine-specific path — consider deriving it from the
# project root the same way the data/model loaders do.
xgb_reg.save_model('c:/dev/job-estimator/assets/XGBReggressor.ubj')

# Compare the model's estimates with the known salaries for the first ten rows.
test = df.head(10).copy()
res = xgb_pipe.predict(test[x_cols])
# These are model outputs; the actual value is `avg_salary`, so the column is
# labelled 'predicted' rather than 'actual'.
test['predicted'] = res
display(HTML(test.style.format(precision=2,thousands=",").to_html())) 

Unnamed: 0,index,state,pay_period,formatted_work_type,formatted_experience_level,title,avg_salary,actual
0,0,NJ,HOURLY,Full-time,,Marketing Coordinator,35889.07,54070.73
1,1,CO,HOURLY,Full-time,,Mental Health Therapist/Counselor,77597.98,54070.73
2,2,OH,YEARLY,Full-time,,Assitant Restaurant Manager,55000.0,100457.7
3,3,NY,YEARLY,Full-time,,Senior Elder Law / Trusts and Estates Associate Attorney,157500.0,118873.19
4,4,IA,YEARLY,Full-time,,Service Technician,70000.0,95204.81
5,5,NC,HOURLY,Internship,,Economic Development and Planning Intern,32979.14,48394.88
6,7,CA,YEARLY,Full-time,,Building Engineer,105000.0,131343.16
7,16,OH,HOURLY,Full-time,,Administrative Coordinator,48498.74,52789.23
8,17,RI,HOURLY,Part-time,,Customer Service / Reservationist,17381.02,26294.75
9,28,PA,HOURLY,Full-time,,General Laborer,40738.94,54070.73


In [None]:
# Drop incomplete *rows*. dropna(axis=1) removes every column that contains a
# missing value, which can delete 'state' or 'avg_salary' outright and make the
# lookups below fail with a KeyError.
df = df[['title','state','avg_salary']].copy().dropna(axis=0)

fig = go.Figure(data=[go.Scatter3d(x=df['state'], y=df['title'], z=df['avg_salary'], mode='markers')])

# Scatter3d is drawn in a 3D scene; its axis titles live under layout.scene, and
# update_xaxes/update_yaxes only affect 2D cartesian axes.
fig.update_layout(
    scene=dict(
        xaxis_title="State",
        yaxis_title="Position",
        zaxis_title="Average salary",
    )
)

fig.show()

KeyError: 'state'

In [None]:

# Build the categorizer from the model's word vectors and the Job2Vec
# tokenizer, then prepare its category vectors before using it.
word_vectors = j2v.wv
categorizer = Categorizer(word_vectors, job2vec.tokenize)
categorizer.create_category_vectors()

# Let the data manager label job titles with the categorizer's lookup.
df = dm.categorize_job_titles(categorizer.get_similar_categories)