# Salary estimator from listings

The `city_state.json` file was adapted from the GitHub repository [agalea91 - city_to_state_dictionary](https://github.com/agalea91/city_to_state_dictionary/blob/master/city_to_state.py).

The `state_abbr.json` file was adapted from the GitHub gist [JeffPaine - us_state_abbreviations.py](https://gist.github.com/JeffPaine/3083347).

## Setup

In [3]:
%pip install pandas xgboost scikit-learn plotly gensim

Note: you may need to restart the kernel to use updated packages.


First we import the packages used to manage the dataset; then we load the data itself.

## Get the data

Now we can load the dataset.

In [10]:
import ast
import json
import os
import re

#import gensim.downloader as api
import gensim
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import xgboost as xgb
from plotly.subplots import make_subplots
from scipy import stats
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV

In [14]:
# Every data file is resolved relative to the directory the notebook runs in.
repo_path = os.path.abspath('')

# `with` closes each lookup file as soon as it has been parsed, instead of
# leaving the handle open for the lifetime of the kernel.
with open(os.path.join(repo_path, 'state_abbr.json'), encoding='utf-8') as state_abbr_file:
    state_abbr = dict(json.load(state_abbr_file))
with open(os.path.join(repo_path, 'city_state.json'), encoding='utf-8') as city_state_file:
    city_state = dict(json.load(city_state_file))

csv_df = pd.read_csv(os.path.join(repo_path, 'archive', 'postings.csv'))
csv_df.head()

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,expiry,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,1715990000000.0,,,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,1715450000000.0,,,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,1715870000000.0,,,We are currently accepting resumes for FOH - A...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,1715488000000.0,,,This position requires a baseline understandin...,1712896000000.0,,0,FULL_TIME,USD,BASE_SALARY
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,1716044000000.0,,,,1713452000000.0,,0,FULL_TIME,USD,BASE_SALARY


In [15]:
# Columns removed before the analysis starts.
columns_to_drop = [
    'views', 'applies', 'original_listed_time', 'remote_allowed',
    'job_posting_url', 'application_url', 'application_type',
    'expiry', 'closed_time', 'listed_time', 'posting_domain',
    'sponsored', 'compensation_type',
]

# `drop` returns a new frame, so `csv_df` stays untouched and the cell can be
# re-run safely. errors='ignore' skips labels that are not present instead of
# relying on a single sentinel column to decide whether to drop anything.
df = csv_df.drop(columns=columns_to_drop, errors='ignore')

filtered_df = df

In [16]:
# Work on a copy of the filtered postings, with an empty-string placeholder
# in a `state` column that the row-wise cleaner fills in afterwards.
df = filtered_df.copy().assign(state='')

def get_by_regex(dict_map: dict, search: str):
    """Return the value of the first key in ``dict_map`` that matches ``search``.

    Each key is used as a regular-expression pattern and matched
    case-insensitively anywhere inside ``search``. Keys are tried in the
    dictionary's iteration order, so the first matching key wins.

    Args:
        dict_map: Mapping of regex pattern -> value to return on a match.
        search: Text to scan.

    Returns:
        The value stored under the first matching key, or ``''`` when no key
        matches.
    """
    for pattern, value in dict_map.items():
        # Read the value from the mapping that was passed in, not from a
        # module-level dictionary, so the helper works for any lookup table.
        if re.search(pattern, search, flags=re.I) is not None:
            return value
    return ''

def clean_state(row):
    """Fill ``row['state']`` with a two-character code derived from ``row['location']``.

    The text after the first comma of the location is tried first
    (upper-cased and stripped). If that is not exactly two characters long,
    the whole location string is matched against the keys of ``state_abbr``
    via ``get_by_regex``.

    Args:
        row: A row-like mapping with a ``'location'`` entry. It is modified
            in place.

    Returns:
        The same ``row``, with ``row['state']`` set to the two-character code,
        or to ``None`` when no code could be determined (including when the
        location is missing or is not a string).
    """
    raw_location = row['location']

    # Missing locations arrive as NaN (a float). Anything that is not text has
    # no state to extract, so mark it the same way as an unresolved location
    # rather than leaving whatever placeholder the column held before.
    if not isinstance(raw_location, str):
        row['state'] = None
        return row

    parts = raw_location.strip().split(',')

    # "City, ST" style locations carry the code right after the first comma.
    state = parts[1].strip().upper() if len(parts) > 1 else ''

    # Otherwise fall back to searching the full text for a known state name.
    if len(state) != 2:
        state = get_by_regex(state_abbr, raw_location)

    # NOTE(review): a two-character token is accepted without checking that it
    # is a known state code - confirm whether that is intended.
    row['state'] = state if state is not None and len(state) == 2 else None
    return row

# Run the state cleaner once per posting; 'broadcast' keeps the frame's
# original index and columns for the result.
df = df.apply(
    clean_state,
    axis='columns',
    result_type='broadcast',
)

def get_abnormal(ser):
    """Return the distinct values of ``ser`` whose string length is not 2."""
    lengths = ser.str.len()
    wrong_length = lengths.ne(2)
    return ser.loc[wrong_length].unique()

# Sanity check: list every distinct value that is not a two-character code.
print(get_abnormal(df.state))

# Snapshot the cleaned frame for the cells that follow.
state_df = df.copy(deep=True)

[None]


In [18]:
pay_cols = ['max_salary', 'med_salary', 'min_salary']

# Keep only postings that report at least one salary figure. `dropna` returns
# a new frame, so `state_df` is left untouched and the cell can be re-run.
df = state_df.dropna(thresh=1, subset=pay_cols)

df = df[pay_cols + ['company_name', 'title', 'state', 'pay_period']]

# Hourly postings are excluded from the salary figures.
# NOTE(review): every other pay period is kept as-is and later averaged
# together - confirm that no non-yearly periods need annualising.
df = df.loc[df['pay_period'] != 'HOURLY'].copy()

# Blank out values more than two standard deviations away from the column
# mean. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
for name in pay_cols:
    mask = np.abs(stats.zscore(df[name].astype(float), nan_policy='omit')) > 2
    df[name] = df[name].mask(mask, np.nan)

# Row-wise mean of whichever salary figures remain (NaN values are skipped).
df['avg_salary'] = df[pay_cols].mean(axis=1)

salary_df = df

In [19]:
# Mean salary and number of salaried listings for each state.
df = (
    salary_df.groupby('state')['avg_salary']
    .agg(avg_salary='mean', count='count')
    # Drop states (rows) that have no usable mean. Dropping along axis=1
    # would remove the whole `avg_salary` column as soon as one state's mean
    # is NaN, and the sort below would then fail.
    .dropna(subset=['avg_salary'])
    .sort_values(by='avg_salary')
)

# Bars use the left axis, the sample-size line uses the right axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(
        x=df.index.values,
        y=df['avg_salary'],
        name="Average Salary",
    ),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=df.index.values,
        y=df['count'],
        name="Sample Size",
    ),
    secondary_y=True,
)

fig.update_layout(title_text="Average listed salary by state")
fig.update_xaxes(title_text="State", tickangle=90)

# Set y-axes titles
fig.update_yaxes(title_text="Dollars per year", secondary_y=False)
fig.update_yaxes(title_text="Job Listings (log)", secondary_y=True, type="log")

fig.show()

In [37]:
tokenized_csv = repo_path + '/archive/tokenized_jobs.csv'
text_cols = ['title', 'description']


def simple_preprocess(row, keys):
    """Tokenize the text fields of one row in place.

    Each ``row[key]`` that is a string is replaced by the token list produced
    by ``gensim.utils.simple_preprocess``; any other value becomes ``None``.
    Returns the same ``row``.
    """
    for key in keys:
        if isinstance(row[key], str):
            row[key] = gensim.utils.simple_preprocess(row[key])
        else:
            row[key] = None
    return row


def _parse_tokens(cell):
    """Turn a cached CSV cell back into a token list (empty cell -> None)."""
    # `to_csv` stores each list as its Python repr, e.g. "['sales', 'lead']".
    return ast.literal_eval(cell) if cell else None


if os.path.isfile(tokenized_csv):
    # Cache hit: reuse the tokens written by an earlier run.
    # NOTE(review): assumes the file was written by the branch below - confirm
    # before reusing a cache produced some other way.
    df = pd.read_csv(
        tokenized_csv,
        index_col=0,
        converters={col: _parse_tokens for col in text_cols},
    )
else:
    # Cache miss: tokenize every posting and store the result for next time.
    df = state_df[text_cols].copy()
    df = df.apply(lambda x: simple_preprocess(x, text_cols), axis=1, result_type='expand')
    df.to_csv(tokenized_csv)

tokenized_df = df
tokenized_df.head()

In [None]:
# Number of postings for every (state, title) pair.
df = (
    state_df.groupby(['state', 'title'])
    .size()
    .to_frame(name='count')
    .reset_index()
)

fig = go.Figure(data=[go.Scatter3d(x=df['state'], y=df['title'], z=df['count'], mode='markers')])

# A 3D trace is drawn inside a scene, so its axis titles are set on
# `layout.scene`; `update_xaxes` / `update_yaxes` only reach 2D axes.
fig.update_layout(
    scene=dict(
        xaxis_title="State",
        yaxis_title="Position",
        zaxis_title="Listings",
    )
)

fig.show()

  state                                            title  count
0    AK                            2 Way Radio Installer      1
1    AK  9187-Department Manager-Eielson Shopping Center      1
2    AK              Analytics Engagement Specialist III      1
3    AK                      Analytics Grants Specialist      1
4    AK                                       Area Coach      1
