# Salary estimator from listings

The city_state.json file was modified from this github repo [agalea91 - city_to_state_dictionary](https://github.com/agalea91/city_to_state_dictionary/blob/master/city_to_state.py).

The state_abbr.json file was modified from this github repo [JeffPaine - us_state_abbreviations.py](https://gist.github.com/JeffPaine/3083347).

The job posting dataset can be found on Kaggle [LinkedIn Job Postings (2023 - 2024)](https://www.kaggle.com/datasets/arshkon/linkedin-job-postings)

## Setup

In [1]:
%%capture
%pip install pandas xgboost scikit-learn plotly gensim #swifter
print('')

First we must import our packages to manage the dataset. Then we can import the data.

## Setup
Import the many packages

In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from importlib import reload
from IPython.display import HTML, display
from data import DataManager
from wordmod import Job2Vec
from catword import Categorizer

def load_scripts():
    reload(DataManager)
    reload(Job2Vec)
    reload(Categorizer)
    return (DataManager, Job2Vec, Categorizer)

# (DataManager, Job2Vec, Categorizer) = load_scripts()

Extract the job posting data from the CSV and clean it.

In [3]:
dm = DataManager()
df = dm.get_postings().copy()

def shorten_long_cols(row):
    for name in ['description','skills_desc']:
        if isinstance(row[name], str):
            row[name] = row[name][:150] + '...' 
    return row

display(HTML(df.head(5).apply(shorten_long_cols, axis=1).to_html()))

Retrieving an existing dataset at c:\dev\job-estimator/archive/clean_postings.bin


Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,med_salary,min_salary,formatted_work_type,formatted_experience_level,skills_desc,work_type,currency,state,avg_salary
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in New Jersey is seeking an administrative Marketing Coordinator with some experience in graphic design. You...,38798.991928,HOURLY,"Princeton, NJ",2774458.0,,32979.143139,Full-time,,"Requirements: \n\nWe are seeking a College or Graduate Student (can also be completed with school) with a focus in Planning, Architecture, Real Estate D...",FULL_TIME,USD,NJ,35889.067533
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committed to serving clients with best practices to help them with change, improvements and better quality of l...",96997.47982,HOURLY,"Fort Collins, CO",,,58198.487892,Full-time,,,FULL_TIME,USD,CO,77597.983856
2,10998357,The National Exemplar,Assitant Restaurant Manager,"The National Exemplar is accepting applications for an Assistant Restaurant Manager.\nWe offer highly competitive wages, healthcare, paid time off, com...",65000.0,YEARLY,"Cincinnati, OH",64896719.0,,45000.0,Full-time,,We are currently accepting resumes for FOH - Asisstant Restaurant Management with a strong focus on delivering high quality customer service. Prefer 1...,FULL_TIME,USD,OH,55000.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associate Attorney,"Senior Associate Attorney - Elder Law / Trusts and Estates Our legal team is committed to providing each client with quality counsel, innovative solu...",175000.0,YEARLY,"New Hyde Park, NY",766262.0,,140000.0,Full-time,,"This position requires a baseline understanding of online marketing including Search Engine Marketing, Search Engine Optimization, and campaign analyt...",FULL_TIME,USD,NY,157500.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience in commerical and industrial equipment. Minimum 5 yrs. on the job with mechanical license. Winger is a f...,80000.0,YEARLY,"Burlington, IA",,,60000.0,Full-time,,,FULL_TIME,USD,IA,70000.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 17 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      123849 non-null  int64  
 1   company_name                122130 non-null  object 
 2   title                       123849 non-null  object 
 3   description                 123842 non-null  object 
 4   max_salary                  29417 non-null   float64
 5   pay_period                  36073 non-null   object 
 6   location                    123849 non-null  object 
 7   company_id                  122132 non-null  float64
 8   med_salary                  6199 non-null    float64
 9   min_salary                  29369 non-null   float64
 10  formatted_work_type         123849 non-null  object 
 11  formatted_experience_level  94440 non-null   object 
 12  skills_desc                 2439 non-null    object 
 13  work_type     

### Create a statistical summary of the data.

In [5]:
def print_pay_summary(x: pd.DataFrame):
    pay_df = x[pay_cols]
    summary_df = pd.DataFrame([], columns=['pay_col','min','Q1','median','Q3','max','mean','mode'])
    for c in pay_cols:
        qs = [c]+pay_df[c].dropna().quantile([0,.25, .5, .75, 1]).tolist()+[pay_df[c].mean(), pay_df[c].mode().tolist()]
        summary_df.loc[-1] = qs
        summary_df.index+=1
    return summary_df

pay_period_types = ['YEARLY','MONTHLY', 'BIWEEKLY', 'WEEKLY', 'HOURLY']
    
pay_cols = ['max_salary','med_salary','min_salary']
for pay_period in pay_period_types:
    pay_period_df = df.loc[df['pay_period']==pay_period, pay_cols]
    print(pay_period)
    summary = print_pay_summary(pay_period_df)
    
    display(HTML(summary.style.format(precision=0,thousands=",").to_html()))    


YEARLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,12000,85000,120000,170000,1500000,139481,[150000.0]
1,med_salary,10000,54000,72000,105400,300500,84890,[60000.0]
0,min_salary,10000,65000,89250,120000,750000,98622,[100000.0]


MONTHLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,12000,66996,89364,119184,1320000,103996,[85368.0]
1,med_salary,12000,25398,28716,36000,300000,36286,[30000.0]
0,min_salary,12000,45864,68208,81120,1080000,73636,[68208.0]


BIWEEKLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,53482.0,66976.0,81434.0,87516.0,89965.0,74550.0,[66976.0]
1,med_salary,,,,,,,[]
0,min_salary,53482.0,54009.0,58370.0,64519.0,71323.0,60370.0,[54009.0]


WEEKLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,50804.0,100360.0,109148.0,119658.0,210548.0,111668.0,"[95726.8, 114884.64000000001, 117303.16, 120931.2]"
1,med_salary,,,,,,,[]
0,min_salary,50804.0,99684.0,109148.0,119658.0,210548.0,111452.0,"[95726.8, 114884.64000000001, 117303.16, 120931.2]"


HOURLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,10335,42679,58354,96997,581985,74286,[48498.73990982143]
1,med_salary,10335,31621,38838,54319,288994,49051,[38798.99192785715]
0,min_salary,10100,34919,46559,77598,484987,59137,[38798.99192785715]


### Display a bar graph of average salaries by state.

In [6]:
df = dm.get_postings_with_pay()[['state','avg_salary']].copy()

groups = df.groupby('state')
group_count = groups.count()
df = groups.mean()
df['count'] = group_count
df = df.dropna(axis=1).sort_values(by='avg_salary')

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(
        x = df.index.values, 
        y=df['avg_salary'],
        name="Average Salary",
    ), 
    secondary_y=False)

fig.add_trace(
    go.Scatter(
        x = df.index.values,
        y = df['count'],
        name="Sample Size"
    ),
    secondary_y=True
)

fig.update_xaxes(title_text="State",tickangle=90)

# Set y-axes titles
fig.update_yaxes(title_text="Dollars per year", secondary_y=False)
fig.update_yaxes(title_text="Job Listings (log)", secondary_y=True, type="log")

fig.show()

Dropping rows where every pay column is empty.


Create a dataset

In [7]:
print('Loading j2v word vectors.')
job2vec = Job2Vec()
j2v = job2vec.get_model()
word_vectors = j2v.wv

categorizer = Categorizer(word_vectors, job2vec.tokenize)
categorizer.create_categories()

df = dm.categorize_job_titles(categorizer.get_similar_categories)
df.info()

Loading j2v word vectors.
Retrieving an existing model from c:\dev\job-estimator/assets/w2v/w2v.model
Retrieving category vectors from c:\dev\job-estimator/assets/w2v/vectorized_categories.bin
Retrieving an existing data at c:\dev\job-estimator/archive/categorized_job_titles.bin
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 23 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      123849 non-null  int64  
 1   company_name                122130 non-null  object 
 2   title                       123849 non-null  object 
 3   description                 123842 non-null  object 
 4   max_salary                  29417 non-null   float64
 5   pay_period                  36073 non-null   object 
 6   location                    123849 non-null  object 
 7   company_id                  122132 non-null  float64
 8   med_salary              

### Create a model to genereate entity embeddings for XGBoost

In [44]:

from sklearn.model_selection import train_test_split

df = df.loc[df['avg_salary'] > 0].copy()
categorical_cols=['state','pay_period','formatted_work_type','formatted_experience_level','cat0','cat1','cat2']
x = df[['state','pay_period','formatted_work_type','formatted_experience_level','cat0','cat1','cat2']]
y = df['avg_salary']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=1)

train = x_train.to_numpy().flatten()
test = x_test.to_numpy().flatten()

sizes = {col: df[col].nunique() for col in categorical_cols}
embedding_sizes = {col: sizes[col]//2+1 for col in categorical_cols}

from keras.layers import Dense, Dropout, Embedding, Input, Reshape, Concatenate
from keras.models import Model


     job_id            company_name  \
0    921716   Corcoran Sawyer Smith   
1   1829192                     NaN   
2  10998357  The National Exemplar    
3  23221523  Abrams Fensterman, LLP   
4  35982263                     NaN   

                                               title  \
0                              Marketing Coordinator   
1                  Mental Health Therapist/Counselor   
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
4                                 Service Technician   

                                         description     max_salary  \
0  Job descriptionA leading real estate firm in N...   38798.991928   
1  At Aspen Therapy and Wellness , we are committ...   96997.479820   
2  The National Exemplar is accepting application...   65000.000000   
3  Senior Associate Attorney - Elder Law / Trusts...  175000.000000   
4  Looking for HVAC service tech with experience ...   80000.000000   

 

RuntimeError: No active exception to reraise

### Do OneHotEncoding on categorical variables

In [42]:
from sklearn.preprocessing import OneHotEncoder 

categorical_cols=['state','pay_period','formatted_work_type','formatted_experience_level','cat0']#,'cat1','cat2']
encoder = OneHotEncoder(sparse_output=False).set_output(transform='pandas')
encoded_df = df[['avg_salary','cat0_score','cat1_score','cat2_score']].copy()
for col in categorical_cols:
    enc = encoder.fit_transform(df[[col]])
    encoded_df = pd.concat([encoded_df, enc], axis=1)
print(encoded_df.value_counts().index)

MultiIndex([(           76788.0, 0.36521732807159424, ...),
            (    40738.94152425,  0.4685388207435608, ...),
            (           90000.0, 0.46339064836502075, ...),
            ( 17794.40205528738,  0.3309284746646881, ...),
            (15295.293846889046,  0.8483765125274658, ...),
            (174595.46367535717,  0.6316975951194763, ...),
            (103787.30340701787,  0.6392040252685547, ...),
            ( 37247.03225074286,    0.37578284740448, ...),
            ( 71778.13506653572,  0.8624359965324402, ...),
            (116872.26343468769,    0.77132248878479, ...),
            ...
            (           60420.0, 0.45728716254234314, ...),
            ( 60410.03043167358,  0.5415735244750977, ...),
            ( 60410.03043167358,  0.4781200587749481, ...),
            (60380.931187727685,  0.4882458448410034, ...),
            (60380.931187727685, 0.46584776043891907, ...),
            (           60320.0,  0.4030599594116211, ...),
            (           

In [None]:
df = df[['title','state','avg_salary']].copy().dropna(axis=1)

fig = go.Figure(data=[go.Scatter3d(x=df['state'], y=df['title'], z=df['avg_salary'], mode='markers')])

fig.update_xaxes(title_text="State")
fig.update_yaxes(title_text="Position")

fig.show()

KeyError: 'state'