# Salary estimator from listings

## Setup

In [74]:
%pip install pandas xgboost scikit-learn plotly # gensim

Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: '#'


First we install and import the packages needed to manage the dataset. Then we can load the data.

## Get the data

Now we can load the dataset.

In [75]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [76]:
# Resolve the data file relative to the directory the notebook runs from.
# os.path.abspath('') returns the current working directory; os.path.join
# builds the path with the platform's separator instead of a hard-coded '/'.
repo_path = os.path.abspath('')
postings_path = os.path.join(repo_path, 'archive', 'postings.csv')
df = pd.read_csv(postings_path)
df.head()

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,expiry,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,1715990000000.0,,,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,1715450000000.0,,,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,1715870000000.0,,,We are currently accepting resumes for FOH - A...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,1715488000000.0,,,This position requires a baseline understandin...,1712896000000.0,,0,FULL_TIME,USD,BASE_SALARY
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,1716044000000.0,,,,1713452000000.0,,0,FULL_TIME,USD,BASE_SALARY


In [77]:
# Columns removed from the postings frame before the salary analysis.
# ('sponsored' was listed twice in the original list; each label appears once here.)
columns_to_drop = [
    'views', 'applies', 'original_listed_time', 'remote_allowed',
    'job_posting_url', 'application_url', 'application_type',
    'expiry', 'closed_time', 'listed_time', 'posting_domain',
    'sponsored', 'compensation_type',
]

# errors='ignore' makes the cell safe to re-run: labels that are already gone
# are skipped, instead of relying on the presence of 'views' as a proxy for
# "nothing has been dropped yet" (which raises KeyError on a partially
# dropped frame).
df = df.drop(columns=columns_to_drop, errors='ignore')
print(list(df.columns))

['job_id', 'company_name', 'title', 'description', 'max_salary', 'pay_period', 'location', 'company_id', 'med_salary', 'min_salary', 'formatted_work_type', 'formatted_experience_level', 'skills_desc', 'work_type', 'currency']


In [78]:
# The three salary columns used throughout the rest of the notebook.
pay_cols = ['max_salary', 'med_salary', 'min_salary']

# Discard postings where every salary column is missing, i.e. keep a row as
# soon as at least one of the pay columns holds a value.
df.dropna(subset=pay_cols, how='all', inplace=True)
len(df)

36073

In [79]:
# Work on a copy restricted to the pay columns plus the descriptive columns.
graph_cols = [*pay_cols, 'company_name', 'title', 'location', 'pay_period']
pay_df = df[graph_cols].copy()

# 'state' is the second comma-separated piece of 'location', trimmed and
# upper-cased; locations without a comma end up as NaN.
location_parts = pay_df['location'].str.split(',')
pay_df['state'] = location_parts.str[1].str.strip().str.upper()

# Keep every posting whose pay period is not 'HOURLY'.
# NOTE(review): this also keeps any other pay period present in the data (and
# rows with a missing pay_period), so their amounts are mixed with yearly
# figures further down -- confirm this is intended.
not_hourly = pay_df['pay_period'] != 'HOURLY'
salary_df = pay_df.loc[not_hourly]
salary_state_df = salary_df[['state', *pay_cols]].copy()

# Collects every non-NaN value that normalize() rejects as out of range.
excluded = []

def normalize(val, med, std):
    """Return ``val`` if it lies strictly within ``med`` +/- 3 * ``std``.

    Values outside that open interval yield ``None``. A rejected value that is
    not NaN is also appended to the module-level ``excluded`` list; NaN inputs
    return ``None`` without being recorded.
    """
    spread = 3 * std
    lower = med - spread
    upper = med + spread
    if not (lower < val < upper):
        # NaN is the only value that is not equal to itself, so this records
        # genuine outliers and skips missing values.
        if val == val:
            excluded.append(val)
        return None
    return val

# Blank out, column by column, every salary further than 3 standard deviations
# from that column's median (see normalize()).
for name in pay_cols:
    column = salary_state_df[name]
    med = column.median()
    std = column.std()
    salary_state_df[name] = column.apply(normalize, args=(med, std))

# Row-wise mean over whichever pay columns still hold a value.
salary_state_df['avg_salary'] = salary_state_df[pay_cols].mean(axis=1)

def get_abnormal(arr: "pd.Series | np.ndarray"):
    """Return the distinct values of ``arr`` that are not 2 characters long.

    Parameters
    ----------
    arr : pandas.Series or array-like of str
        Values to inspect. The original annotation was ``np.array``, but the
        body called ``arr.unique()``, which only exists on a Series; wrapping
        the input in ``pd.Series`` makes both input kinds work.

    Returns
    -------
    pandas.Series
        The unique values whose string length differs from 2, indexed by their
        position among the unique values. Missing values have no length, so
        they compare unequal to 2 and are included in the result.
    """
    ser = pd.Series(pd.Series(arr).unique())
    result = ser[ser.str.len() != 2]
    return result

# List the raw 'state' values that are not already 2-letter codes.
abnormal_states = get_abnormal(salary_state_df['state'])
print(abnormal_states)

3                                  NaN
19                       UNITED STATES
21              OHIO METROPOLITAN AREA
31             TEXAS METROPOLITAN AREA
32                          CALIFORNIA
35    SOUTH CAROLINA METROPOLITAN AREA
38            OREGON METROPOLITAN AREA
44                        ALABAMA AREA
45            KANSAS METROPOLITAN AREA
46     MASSACHUSETTS METROPOLITAN AREA
56          NEBRASKA METROPOLITAN AREA
57                            NEW YORK
59                             MI AREA
60                            ILLINOIS
61         LOUISIANA METROPOLITAN AREA
62          NEW YORK METROPOLITAN AREA
66                              HAWAII
67                                OHIO
68                            DELAWARE
70    NORTH CAROLINA METROPOLITAN AREA
71                            COLORADO
72                            NEBRASKA
73                          WASHINGTON
74                       MASSACHUSETTS
75                               TEXAS
76                       

In [80]:
# Full upper-case U.S. state name -> 2-letter postal abbreviation.
# Entries are ordered by abbreviation.
state_map = {
    'ALASKA': 'AK',
    'ALABAMA': 'AL',
    'ARKANSAS': 'AR',
    'ARIZONA': 'AZ',
    'CALIFORNIA': 'CA',
    'COLORADO': 'CO',
    'CONNECTICUT': 'CT',
    'DELAWARE': 'DE',
    'FLORIDA': 'FL',
    'GEORGIA': 'GA',
    'HAWAII': 'HI',
    'IOWA': 'IA',
    'IDAHO': 'ID',
    'ILLINOIS': 'IL',
    'INDIANA': 'IN',
    'KANSAS': 'KS',
    'KENTUCKY': 'KY',
    'LOUISIANA': 'LA',
    'MASSACHUSETTS': 'MA',
    'MARYLAND': 'MD',
    'MAINE': 'ME',
    'MICHIGAN': 'MI',
    'MINNESOTA': 'MN',
    'MISSOURI': 'MO',
    'MISSISSIPPI': 'MS',
    'MONTANA': 'MT',
    'NORTH CAROLINA': 'NC',
    'NORTH DAKOTA': 'ND',
    'NEBRASKA': 'NE',
    'NEW HAMPSHIRE': 'NH',
    'NEW JERSEY': 'NJ',
    'NEW MEXICO': 'NM',
    'NEVADA': 'NV',
    'NEW YORK': 'NY',
    'OHIO': 'OH',
    'OKLAHOMA': 'OK',
    'OREGON': 'OR',
    'PENNSYLVANIA': 'PA',
    'RHODE ISLAND': 'RI',
    'SOUTH CAROLINA': 'SC',
    'SOUTH DAKOTA': 'SD',
    'TENNESSEE': 'TN',
    'TEXAS': 'TX',
    'UTAH': 'UT',
    'VIRGINIA': 'VA',
    'VERMONT': 'VT',
    'WASHINGTON': 'WA',
    'WISCONSIN': 'WI',
    'WEST VIRGINIA': 'WV',
    'WYOMING': 'WY',
}

def clean_state(state: str):
    """Reduce a raw 'state' value to a 2-letter code where possible.

    - NaN is returned unchanged.
    - 'UNITED STATES' becomes ``None``.
    - A value that is already 2 characters long is returned as is.
    - Otherwise the ' AREA' and ' METROPOLITAN' fragments are removed and the
      remainder is looked up in ``state_map``; when there is no match the
      stripped text itself is returned.
    """
    # NaN is the only value that differs from itself.
    if state != state:
        return state
    if state == 'UNITED STATES':
        return None
    if len(state) == 2:
        return state
    stripped = state.replace(' AREA', '').replace(' METROPOLITAN', '')
    return state_map.get(stripped, stripped)

# Build a cleaned copy (salary_state_df itself is left untouched) whose
# 'state' column has been passed through clean_state().
cleaned_states = salary_state_df['state'].apply(clean_state)
cleaned_salary_state_df = salary_state_df.assign(state=cleaned_states)

# Anything still listed here could not be reduced to a 2-letter code.
print(get_abnormal(cleaned_salary_state_df['state']))

3      NaN
19    None
dtype: object


In [81]:
# Mean of avg_salary per state, sorted ascending for the bar chart.
# dropna must act on ROWS here: the original dropna(axis=1) drops *columns*
# containing NaN, so a single state whose mean is NaN would remove
# 'avg_salary' entirely and make the following sort_values raise KeyError.
fig_data = (
    cleaned_salary_state_df[['state', 'avg_salary']]
    .groupby('state')
    .mean()
    .dropna(subset=['avg_salary'])
    .sort_values(by='avg_salary')
)

fig = px.bar(
    fig_data,
    title="Average non-hourly salary by state",
    labels={
        "state": "State",
        "value": "Average Salary",
    },
)
fig.show()