# Salary estimator from listings

The city_state.json file was modified from this github repo [agalea91 - city_to_state_dictionary](https://github.com/agalea91/city_to_state_dictionary/blob/master/city_to_state.py).

The state_abbr.json file was modified from this github repo [JeffPaine - us_state_abbreviations.py](https://gist.github.com/JeffPaine/3083347).

The job posting dataset can be found on Kaggle [LinkedIn Job Postings (2023 - 2024)](https://www.kaggle.com/datasets/arshkon/linkedin-job-postings)

## Setup

In [9]:
%%capture
%pip install pandas xgboost scikit-learn plotly gensim #swifter
print('')

First we must import our packages to manage the dataset. Then we can import the data.

## Setup
Import the many packages

In [10]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

%load_ext autoreload
%aimport JobPostingManager
%aimport Job2Vec
%aimport categorizer
%aimport settings
%autoreload 1

JPM = JobPostingManager.JobPostingManager
J2V = Job2Vec.Job2Vec

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Extract the job posting data from the CSV and clean it.

In [11]:
jpm = JPM()
df = jpm.postings.copy()

Average vacation days: 13.445833333333333
Retrieving an existing dataset at c:\dev\job-estimator/archive/clean_postings.bin


In [12]:

from IPython.display import HTML, display
def print_pay_summary(x: pd.DataFrame):
    pay_df = x[pay_cols]
    summary_df = pd.DataFrame([], columns=['pay_col','min','Q1','median','Q3','max','mean','mode'])
    for c in pay_cols:
        qs = [c]+pay_df[c].dropna().quantile([0,.25, .5, .75, 1]).tolist()+[pay_df[c].mean(), pay_df[c].mode().tolist()]
        summary_df.loc[-1] = qs
        summary_df.index+=1
    return summary_df

pay_period_types = ['YEARLY','MONTHLY', 'BIWEEKLY', 'WEEKLY', 'HOURLY']
    
pay_cols = ['max_salary','med_salary','min_salary']
for pay_period in pay_period_types:
    pay_period_df = df.loc[df['pay_period']==pay_period, pay_cols]
    print(pay_period)
    summary = print_pay_summary(pay_period_df)
    
    display(HTML(summary.style.format(precision=0,thousands=",").to_html()))    


YEARLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,12000,85000,120000,170000,1500000,139481,[150000.0]
1,med_salary,10000,54000,72000,105400,300500,84890,[60000.0]
0,min_salary,10000,65000,89250,120000,750000,98622,[100000.0]


MONTHLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,12000,66996,89364,119184,1320000,103996,[85368.0]
1,med_salary,12000,25398,28716,36000,300000,36286,[30000.0]
0,min_salary,12000,45864,68208,81120,1080000,73636,[68208.0]


BIWEEKLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,53482.0,66976.0,81434.0,87516.0,89965.0,74550.0,[66976.0]
1,med_salary,,,,,,,[]
0,min_salary,53482.0,54009.0,58370.0,64519.0,71323.0,60370.0,[54009.0]


WEEKLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,50804.0,100360.0,109148.0,119658.0,210548.0,111668.0,"[95726.8, 114884.64000000001, 117303.16, 120931.2]"
1,med_salary,,,,,,,[]
0,min_salary,50804.0,99684.0,109148.0,119658.0,210548.0,111452.0,"[95726.8, 114884.64000000001, 117303.16, 120931.2]"


HOURLY


Unnamed: 0,pay_col,min,Q1,median,Q3,max,mean,mode
2,max_salary,10335,42679,58354,96997,581985,74286,[48498.73990982143]
1,med_salary,10335,31621,38838,54319,288994,49051,[38798.99192785715]
0,min_salary,10100,34919,46559,77598,484987,59137,[38798.99192785715]


In [13]:
df = jpm.postings_with_pay[['state','avg_salary']].copy()

groups = df.groupby('state')
group_count = groups.count()
df = groups.mean()
df['count'] = group_count
df = df.dropna(axis=1).sort_values(by='avg_salary')

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(
        x = df.index.values, 
        y=df['avg_salary'],
        name="Average Salary",
    ), 
    secondary_y=False)

fig.add_trace(
    go.Scatter(
        x = df.index.values,
        y = df['count'],
        name="Sample Size"
    ),
    secondary_y=True
)

fig.update_xaxes(title_text="State",tickangle=90)

# Set y-axes titles
fig.update_yaxes(title_text="Dollars per year", secondary_y=False)
fig.update_yaxes(title_text="Job Listings (log)", secondary_y=True, type="log")

fig.show()

Dropping rows where every pay column is empty.


In [14]:


#tokenized_df = job2vec.dataset
#print(tokenized_df.head())


In [15]:
import json

from gensim.models import KeyedVectors
job2vec = J2V(jpm.postings)
j2v = job2vec.get_model()
wv = j2v.wv
categories = json.load(open(settings.REPO_PATH + '/assets/bls_gov_jobs.json'))

changed = []
for x in categories:
    if len(x) > 1:
        changed.append([x[1],x[2]])
    else:
        changed.append(x)

print(changed[:50])

tkn = job2vec.tokenize(sentence)
'''
#df.to_pickle(settings.REPO_PATH + '/assets/vectorized_categories.bin')





# # inspired by https://github.com/piskvorky/gensim/blob/develop/gensim/models/keyedvectors.py#L655
# best = matutils.argsort(distances(), topn=topn + len(all_keys), reverse=True)
top_postings: pd.DataFrame = jpm.postings.head(100)
print(top_postings['title'])
categorized = []

for post in top_postings['title']:
    if not post:
        continue
    category = categorize(post)
    if isinstance(category, tuple):
        categorized.append([post,category[0]])

categorized = pd.DataFrame(categorized)

categorized.to_csv(settings.REPO_PATH + '/assets/categorized_titles.csv')
'''

Retrieving an existing model from c:\dev\job-estimator/assets/w2v/w2v.model


IndexError: list index out of range

In [None]:
print(j2v)

Word2Vec<vocab=74766, vector_size=300, alpha=0.025>


In [None]:
#import os
#import pickle
#from sklearn.cluster import MiniBatchKMeans

#kmeans_path = settings.REPO_PATH +'/assets/kmeans/model.bin'   
#print('Training....')
#model = MiniBatchKMeans(n_clusters=200, max_iter=1000, random_state=1, batch_size=(257 * os.cpu_count()), max_no_improvement=100, verbose=1).fit(X=j2v.wv.vectors)

#print('Dumping the model...')
#pickle.dump(model, open(kmeans_path,'wb'))

#categories = model.cluster_centers_
#for i in range(10):
#    vectors = j2v.wv.similar_by_vector(model.cluster_centers_[i], topn=10, restrict_vocab=None)
#    print(vectors)

Training....
Dumping the model...


In [None]:
df = jpm.clean[['state','title','job_id']].copy()

df = df.groupby(['state','title']).size().to_frame(name = 'count').reset_index().dropna(axis=1)

fig = go.Figure(data=[go.Scatter3d(x=df['state'], y=df['title'], z=df['count'], mode='markers')])

fig.update_xaxes(title_text="State")
fig.update_yaxes(title_text="Position")

fig.show()

AttributeError: 'JobPostingManager' object has no attribute 'clean'