# Salary estimator from listings

The city_state.json file was modified from this github repo [agalea91 - city_to_state_dictionary](https://github.com/agalea91/city_to_state_dictionary/blob/master/city_to_state.py).

The state_abbr.json file was modified from this github repo [JeffPaine - us_state_abbreviations.py](https://gist.github.com/JeffPaine/3083347).

The job posting dataset can be found on Kaggle [LinkedIn Job Postings (2023 - 2024)](https://www.kaggle.com/datasets/arshkon/linkedin-job-postings)

## Setup

In [1]:
%%capture
%pip install pandas xgboost scikit-learn plotly gensim #swifter
print('')

First we must import our packages to manage the dataset. Then we can import the data.

## Setup
Import the many packages

In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

global JPM
global J2V
global CTR

def load_scripts():
    %load_ext autoreload
    %aimport JobPostingManager
    %aimport Job2Vec
    %aimport categorizer
    %aimport settings
    %autoreload 1
    
    global JPM
    JPM = JobPostingManager.JobPostingManager
    global J2V
    J2V = Job2Vec.Job2Vec
    global CTR
    CTR = categorizer.Categorizer
    return (JPM, J2V, CTR)

load_scripts()

(JobPostingManager.JobPostingManager, Job2Vec.Job2Vec, categorizer.Categorizer)

Extract the job posting data from the CSV and clean it.

In [3]:
jpm = JPM()
df = jpm.postings.copy()

Average vacation days: 13.445833333333333
Retrieving an existing dataset at c:\dev\job-estimator/archive/clean_postings.bin


In [4]:

from IPython.display import HTML, display
def print_pay_summary(x: pd.DataFrame):
    pay_df = x[pay_cols]
    summary_df = pd.DataFrame([], columns=['pay_col','min','Q1','median','Q3','max','mean','mode'])
    for c in pay_cols:
        qs = [c]+pay_df[c].dropna().quantile([0,.25, .5, .75, 1]).tolist()+[pay_df[c].mean(), pay_df[c].mode().tolist()]
        summary_df.loc[-1] = qs
        summary_df.index+=1
    return summary_df

pay_period_types = ['YEARLY','MONTHLY', 'BIWEEKLY', 'WEEKLY', 'HOURLY']
    
pay_cols = ['max_salary','med_salary','min_salary']
for pay_period in pay_period_types:
    pay_period_df = df.loc[df['pay_period']==pay_period, pay_cols]
    print(pay_period)
    summary = print_pay_summary(pay_period_df)
    
    display(HTML(summary.style.format(precision=0,thousands=",").to_html()))    


YEARLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,12000,85000,120000,170000,1500000,139481,[150000.0]
1,med_salary,10000,54000,72000,105400,300500,84890,[60000.0]
0,min_salary,10000,65000,89250,120000,750000,98622,[100000.0]


MONTHLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,12000,66996,89364,119184,1320000,103996,[85368.0]
1,med_salary,12000,25398,28716,36000,300000,36286,[30000.0]
0,min_salary,12000,45864,68208,81120,1080000,73636,[68208.0]


BIWEEKLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,53482.0,66976.0,81434.0,87516.0,89965.0,74550.0,[66976.0]
1,med_salary,,,,,,,[]
0,min_salary,53482.0,54009.0,58370.0,64519.0,71323.0,60370.0,[54009.0]


WEEKLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,50804.0,100360.0,109148.0,119658.0,210548.0,111668.0,"[95726.8, 114884.64000000001, 117303.16, 120931.2]"
1,med_salary,,,,,,,[]
0,min_salary,50804.0,99684.0,109148.0,119658.0,210548.0,111452.0,"[95726.8, 114884.64000000001, 117303.16, 120931.2]"


HOURLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,10335,42679,58354,96997,581985,74286,[48498.73990982143]
1,med_salary,10335,31621,38838,54319,288994,49051,[38798.99192785715]
0,min_salary,10100,34919,46559,77598,484987,59137,[38798.99192785715]


In [5]:
df = jpm.postings_with_pay[['state','avg_salary']].copy()

groups = df.groupby('state')
group_count = groups.count()
df = groups.mean()
df['count'] = group_count
df = df.dropna(axis=1).sort_values(by='avg_salary')

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(
        x = df.index.values, 
        y=df['avg_salary'],
        name="Average Salary",
    ), 
    secondary_y=False)

fig.add_trace(
    go.Scatter(
        x = df.index.values,
        y = df['count'],
        name="Sample Size"
    ),
    secondary_y=True
)

fig.update_xaxes(title_text="State",tickangle=90)

# Set y-axes titles
fig.update_yaxes(title_text="Dollars per year", secondary_y=False)
fig.update_yaxes(title_text="Job Listings (log)", secondary_y=True, type="log")

fig.show()

Dropping rows where every pay column is empty.


In [6]:


#tokenized_df = job2vec.dataset
#print(tokenized_df.head())


Create a dataset

In [7]:
import json, pickle
from datasets import

bls_jobs = json.load(open(settings.REPO_PATH +'/assets/bls_gov_jobs.json'))
for i,x in enumerate(bls_jobs):
    joined = ' '.join(x)
    bls_jobs[i] = pd.NA if len(joined) < 4 else joined
bls_jobs = pd.Series(bls_jobs).dropna()


jpm = JPM()
df = jpm.postings.copy() 
 
print("Combining the the bls.gov job list, LinkedIn job title, description and skills, columns to create a single array. The model does not need them separated.")
ser = pd.concat([bls_jobs, df['title'], df['description'], df['skills_desc']], ignore_index=True)


SyntaxError: invalid syntax (2756065597.py, line 2)

In [None]:
'''
import json, pickle
import torch
import torch.nn.functional as filter
from transformers import AutoTokenizer, AutoModel

(JPM, J2V, CTR) = load_scripts()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

name_of_bert = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(name_of_bert)

def tokenize(sentence: str):
    x = sentence
    if(isinstance(x, str)):
        x = tokenizer(x,padding="max_length", truncation=True, return_tensors="pt").to(device)
    else:
        x = []
    return x

print("Cleaning and tokenizing each row with a helper method from Gensim. This usually takes less than 2 minutes.")
ser = ser.apply(tokenize)

print("Dropping empty rows.")
ser.dropna(inplace=True)

tokenized_data_path = settings.REPO_PATH +'/archive/bert_tokenized_jobs.bin'
    
print("Saving the cleaned data set.")
ser.to_pickle(tokenized_data_path)

model = AutoModel.from_pretrained(name_of_bert)
'''

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Average vacation days: 13.445833333333333
Retrieving an existing dataset at c:\dev\job-estimator/archive/clean_postings.bin
Combining the the bls.gov job list, LinkedIn job title, description and skills, columns to create a single array. The model does not need them separated.
Cleaning and tokenizing each row with a helper method from Gensim. This usually takes less than 2 minutes.


KeyboardInterrupt: 

In [17]:

import json, pickle, os
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
(JPM, J2V, CTR) = load_scripts()

jpm = JPM()

print('Loading j2v word vectors.')
job2vec = J2V()
j2v = job2vec.get_model()
wv = j2v.wv


category_vectors = settings.REPO_PATH + '/assets/w2v/vectorized_categories.bin'
categorizer = CTR(wv, job2vec.tokenize)

if os.path.isfile(category_vectors):
    print("Retrieving an category vectors from "+category_vectors)
    categorizer.replace_vectors(KeyedVectors.load(category_vectors))
else:
    print('Creating categories.')
    categories = json.load(open(settings.REPO_PATH + '/assets/bls_gov_jobs.json'))
    groups: dict[str, list[str]] = {}
    for x in categories:
        category = x[0]
        title = None
        if len(x) > 1:
            category = x[1]
            title = x[0]
        if not category in groups:
             groups[category] = []
        if title:
            groups[category].append(title)

    categories = [[k] for k,v in list(groups.items())]

    print('Creating KeyedVectors from the category names.')
    categorizer = CTR(wv, job2vec.tokenize)
    categorizer.add_categories(categories)

    print('Saving the KeyedVectors.')
    categorizer.kv.save(category_vectors)

raise

df = jpm.postings.copy()

cats_to_save = 2

for i in range(cats_to_save):
    df[f'cat{i}'] = None
    df[f'cat{i}_score'] = 0

def apply_categories(row):
    try:
        categories = categorizer.get_similar_categories(row['title'], cats_to_save)
    except:
        categories = [('',0),('',0),('',0)]
    for i in range(cats_to_save):
        category = categories[i]
        if isinstance(category, tuple):
            row[f'cat{i}'] = str(category)
        row[f'cat{i}'] = categories[i][0]
        row[f'cat{i}_score'] = categories[i][1]
    return row

df: pd.DataFrame = df.apply(apply_categories, axis=1)

print(df.head())

df.to_pickle(settings.REPO_PATH + '/archive/categorized_job_titles.bin')

raise 
top_postings: pd.DataFrame = jpm.postings[['title','description']].dropna().head(100)
print(top_postings.head())
cleaned = top_postings['title'].str.cat(top_postings['description'], sep=' ')
categorized = categorizer.categorize_list_top(top_postings['title'])
categorized = pd.DataFrame(categorized, columns=['title', 'cat0','cat1','cat2'])

categorized.to_csv(settings.REPO_PATH + '/assets/categorized_titles.csv')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Average vacation days: 13.445833333333333
Loading j2v word vectors.
Retrieving an existing model from c:\dev\job-estimator/assets/w2v/w2v.model
Creating categories.
Creating KeyedVectors from the category names.
Saving the KeyedVectors.
Retrieving an existing dataset at c:\dev\job-estimator/archive/clean_postings.bin
     job_id            company_name  \
0    921716   Corcoran Sawyer Smith   
1   1829192                     NaN   
2  10998357  The National Exemplar    
3  23221523  Abrams Fensterman, LLP   
4  35982263                     NaN   

                                               title  \
0                              Marketing Coordinator   
1                  Mental Health Therapist/Counselor   
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
4                                 Service Technician   

                     

RuntimeError: No active exception to reraise

In [None]:
print(j2v)

NameError: name 'j2v' is not defined

In [None]:
#import os
#import pickle
#from sklearn.cluster import MiniBatchKMeans

#kmeans_path = settings.REPO_PATH +'/assets/kmeans/model.bin'   
#print('Training....')
#model = MiniBatchKMeans(n_clusters=200, max_iter=1000, random_state=1, batch_size=(257 * os.cpu_count()), max_no_improvement=100, verbose=1).fit(X=j2v.wv.vectors)

#print('Dumping the model...')
#pickle.dump(model, open(kmeans_path,'wb'))

#categories = model.cluster_centers_
#for i in range(10):
#    vectors = j2v.wv.similar_by_vector(model.cluster_centers_[i], topn=10, restrict_vocab=None)
#    print(vectors)

Training....
Dumping the model...


In [None]:
df = jpm.clean[['state','title','job_id']].copy()

df = df.groupby(['state','title']).size().to_frame(name = 'count').reset_index().dropna(axis=1)

fig = go.Figure(data=[go.Scatter3d(x=df['state'], y=df['title'], z=df['count'], mode='markers')])

fig.update_xaxes(title_text="State")
fig.update_yaxes(title_text="Position")

fig.show()

AttributeError: 'JobPostingManager' object has no attribute 'clean'