In [1]:
# Selenium related imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# BeautifulSoup for web scraping
from bs4 import BeautifulSoup

# Standard Python libraries
import os
import re
import time
import pickle
import glob
from datetime import datetime

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Machine Learning and data processing
from scipy.cluster.hierarchy import linkage, leaves_list

# Geolocation services
from geopy.geocoders import Nominatim
from geopy.distance import geodesic

# Other specific libraries
from dotenv import load_dotenv
import openai
from tqdm import tqdm

# Custom module imports
from scrapifurs import utils
from scrapifurs.GPTinstructions import GPTinstructions


In [107]:
import numpy as np
import re

def parse_salary_info(in_list):
    in_list = ["" if pd.isna(item) else item for item in in_list]

    lower_salaries = []
    upper_salaries = []
    units = []

    for item in in_list:
        # Regular expression to find salary ranges and units
        matches = re.findall(r'\$([0-9]+(?:K)?)\s*\/\s*(yr|hr)', item)

        if matches:
            # Convert salary string to numerical value
            lower_salary = float(matches[0][0].replace('K', '000'))
            if len(matches) > 1:
                upper_salary = float(matches[1][0].replace('K', '000'))
            else:
                upper_salary = lower_salary

            lower_salaries.append(lower_salary)
            upper_salaries.append(upper_salary)
            units.append(matches[0][1])
        else:
            lower_salaries.append(np.nan)
            upper_salaries.append(np.nan)
            units.append('NA')

    return lower_salaries, upper_salaries, units



# Example usage
in_list = np.asarray(['$200K/yr - $350K/yr · 401(k), +3 benefits',
                      '$170K/yr · 4 benefits', 
                      '', 
                      '4 benefits',
                      'Medical benefit', 
                      '$185K/yr - $225K/yr · 401(k), +1 benefit',
                      '4 benefits', 
                      '$45/hr - $240K/yr · 8 benefits',
                      '4 benefits', 
                      '$45/hr· 8 benefits',
                      '$140K/yr - $200K/yr', 
                      ''], dtype=object)

lower_salaries, upper_salaries, units = parse_salary_info(in_list)
print(lower_salaries)
print(upper_salaries)
print(units)




# def parse_salary_info(in_list):
#     in_list = ["" if pd.isna(item) else item for item in in_list]
    
#     pay_min = []
#     pay_max = []
#     pay_time_unit = []
#     for x in in_list:
#         print(x)
#         if '$' in x:
#             pay_all = []
#             pay_time_unit_tmp = []
#             pay_split_all = [k.lower() for k in x.split('$') if len(k)>0]
    
#             for pay_split in pay_split_all:
#                 pay_split_each = pay_split.split('/')
#                 pay_time_unit_tmp.append(pay_split_each[1].split(' ')[0])
#                 if pay_split_each[0][-1] == 'k':
#                     pay_all.append(1000*int(pay_split_each[0][:-1]))
#                 else:
#                     pay_all.append(int(re.findall('\d+', pay_split_each[0])[0]))
    
#             pay_min.append(pay_all[0])
#             pay_max.append(pay_all[-1])
#             pay_time_unit.append(pay_time_unit_tmp[0][:2])
#         else:
#             pay_min.append(np.nan)
#             pay_max.append(np.nan)
#             pay_time_unit.append('NA')
    
#     return pay_min, pay_max, pay_time_unit

[200000.0, 170000.0, nan, nan, nan, 185000.0, nan, 45.0, nan, 45.0, 140000.0, nan]
[350000.0, 170000.0, nan, nan, nan, 225000.0, nan, 240000.0, nan, 45.0, 200000.0, nan]
['yr', 'yr', 'NA', 'NA', 'NA', 'yr', 'NA', 'hr', 'NA', 'hr', 'yr', 'NA']


In [162]:
# hr_index_multiply = 1+np.asarray(job_data['units']=='hr')*(hour_multiply_by-1)




In [161]:
job_data[job_data['units']=='hr']

Unnamed: 0,job_ids,is_promoted,job_title,company_name,Location,pay,job_link,time_added,about_the_job,0_new__1_applied__2_skipped,lower_salaries,upper_salaries,units,yearly_lower_salaries,yearly_upper_salaries
105,3733577556,True,Neurodiagnostic Tech III PD,Cottage Health,"Santa Barbara, CA (On-site)",$57/hr,https://www.linkedin.com/jobs/view/3733577556/...,20231113234021,Job Description\n\nThe Neurodiagnostic Tech II...,-1,57.0,57.0,hr,102600.0,102600.0
136,3584740618,True,Sleep EEG Supervisor,Children's Hospital Colorado,"Aurora, CO (On-site)","$32/hr - $48.20/hr · Vision, +7 benefits",https://www.linkedin.com/jobs/view/3584740618/...,20231113234124,"Why Work at Children's.... \n\nHere, it’s diff...",-1,32.0,32.0,hr,57600.0,57600.0
143,3763034816,True,Neurodiagnostic Technologist I - Sleep Lab,UNC Health,"Lenoir, NC (On-site)",$18/hr - $22.60/hr,https://www.linkedin.com/jobs/view/3763034816/...,20231113234133,Description\n\nBecome part of an inclusive org...,-1,18.0,18.0,hr,32400.0,32400.0
155,3760190358,True,Research Fellow- Spine (2024-2025),Gillette Children's,"St Paul, MN (On-site)",$22/hr,https://www.linkedin.com/jobs/view/3760190358/...,20231113234215,Research fellowship opportunity for current me...,-1,22.0,22.0,hr,39600.0,39600.0


In [170]:
import pandas as pd

# Load the provided Excel file
file_path = '/Users/phil/Library/CloudStorage/Dropbox/GITHUB/scrapifurs/scrapifurs/data/data_by_date/all_searches_job_data_NEUROPHYSIOLOGY_2023_11_13_23-35.xlsx'
job_data = pd.read_excel(file_path)

# job_data['about_the_job'] = job_data['about_the_job'].str.lower()


lower_salaries, upper_salaries, units = parse_salary_info(job_data['pay'])
job_data['lower_salaries'] = lower_salaries
job_data['upper_salaries'] = upper_salaries
job_data['units'] = units


job_data['yearly_lower_salaries'] = lower_salaries
job_data['yearly_upper_salaries'] = upper_salaries

def convert_hr_to_yr(x):
    return x*1700
    
c = job_data['units']=='hr'
job_data.loc[c, 'yearly_lower_salaries'] = convert_hr_to_yr(job_data.loc[c, 'lower_salaries'])
job_data.loc[c, 'yearly_upper_salaries'] = convert_hr_to_yr(job_data.loc[c, 'upper_salaries'])



# Check the first few rows of the dataframe to understand its structure
job_data.head()


Unnamed: 0,job_ids,is_promoted,job_title,company_name,Location,pay,job_link,time_added,about_the_job,0_new__1_applied__2_skipped,lower_salaries,upper_salaries,units,yearly_lower_salaries,yearly_upper_salaries
0,3694415132,True,Neurologist,NYC Traumatic Brain Injury Center,"Queens, NY (On-site)","$200K/yr - $350K/yr · 401(k), +3 benefits",https://www.linkedin.com/jobs/view/3694415132/...,20231113233618,We are a Traumatic Brain Injury Center with me...,-1,200000.0,350000.0,yr,200000.0,350000.0
1,3728656705,True,Statistical Scientist,CARGO Therapeutics,San Francisco Bay Area (On-site),$124K/yr - $170K/yr · 4 benefits,https://www.linkedin.com/jobs/view/3728656705/...,20231113233620,Description\nThe Statistical Scientist will be...,-1,124000.0,170000.0,yr,124000.0,170000.0
2,3683071683,True,Postdoctoral Scholar,Oregon Health & Science University,"Portland, OR (On-site)",,https://www.linkedin.com/jobs/view/3683071683/...,20231113233622,Department Overview\n\nThis is a postdoctoral ...,-1,,,,,
3,3718205961,True,MRI Scientist,neuro42,"San Francisco, CA (On-site)",4 benefits,https://www.linkedin.com/jobs/view/3718205961/...,20231113233623,The next evolution of MRI is here!\n\nneuro42 ...,-1,,,,,
4,3674560098,False,RESEARCH FELLOW,University of Michigan,"Ann Arbor, MI (On-site)",Medical benefit,https://www.linkedin.com/jobs/view/3674560098/...,20231113233625,How to Apply\n\nPlease send your queries or ap...,-1,,,,,


In [167]:
filtered_data = job_data[
(job_data['yearly_lower_salaries']<= 140000) & 
(job_data['yearly_lower_salaries']>= 80000) &
(job_data['about_the_job'].contains('neuro'))
]

In [168]:
filtered_data

Unnamed: 0,job_ids,is_promoted,job_title,company_name,Location,pay,job_link,time_added,about_the_job,0_new__1_applied__2_skipped,lower_salaries,upper_salaries,units,yearly_lower_salaries,yearly_upper_salaries
1,3728656705,True,Statistical Scientist,CARGO Therapeutics,San Francisco Bay Area (On-site),$124K/yr - $170K/yr · 4 benefits,https://www.linkedin.com/jobs/view/3728656705/...,20231113233620,Description\nThe Statistical Scientist will be...,-1,124000.0,170000.0,yr,124000.0,170000.0
8,3747068806,True,Senior Machine Learning Scientist - (Medical I...,Lawrence Harvey,San Francisco Bay Area (Remote),$140K/yr - $200K/yr,https://www.linkedin.com/jobs/view/3747068806/...,20231113233630,Well funded series B medical imaging company w...,-1,140000.0,200000.0,yr,140000.0,200000.0
15,3742651043,True,"Applied Scientist, Sponsored Products",Amazon,"Santa Monica, CA",$136K/yr - $223K/yr,https://www.linkedin.com/jobs/view/3742651043/...,20231113233638,Description\n\nAmazon is investing heavily in ...,-1,136000.0,223000.0,yr,136000.0,223000.0
16,3737719174,False,Development Engineer,Abbott,"Plano, TX","$71K/yr - $143K/yr · 401(k), +1 benefit",https://www.linkedin.com/jobs/view/3737719174/...,20231113233640,Abbott is a global healthcare leader that help...,-1,71000.0,143000.0,yr,71000.0,143000.0
17,3759416561,True,Surgical Neurophysiologist,Accurate Neuromonitoring,New York City Metropolitan Area (On-site),"$80K/yr - $125K/yr + Bonus, Overtime · 7 benefits",https://www.linkedin.com/jobs/view/3759416561/...,20231113233640,Accurate Neuromonitoring is seeking fully trai...,-1,80000.0,125000.0,yr,80000.0,125000.0
18,3725708843,True,"Research Scientist, Neural Interfaces (PhD)",Oculus VR,"Burlingame, CA",$116K/yr - $168K/yr,https://www.linkedin.com/jobs/view/3725708843/...,20231113233642,Reality Labs at Meta is seeking Research Scien...,-1,116000.0,168000.0,yr,116000.0,168000.0
25,3712796731,True,Applied Scientist,Amazon,"Los Angeles, CA",$136K/yr - $223K/yr,https://www.linkedin.com/jobs/view/3712796731/...,20231113233719,Description\n\nThe Alexa team is looking for a...,-1,136000.0,223000.0,yr,136000.0,223000.0
27,3763234441,True,"Research Scientist, Neural Interfaces (PhD)",Oculus VR,"Burlingame, CA",$116K/yr - $168K/yr,https://www.linkedin.com/jobs/view/3763234441/...,20231113233721,Reality Labs at Meta is seeking Research Scien...,-1,116000.0,168000.0,yr,116000.0,168000.0
36,3724809400,True,Senior Research Scientist-Medical Imaging,Philips,"Orange, OH",$107K/yr - $199K/yr,https://www.linkedin.com/jobs/view/3724809400/...,20231113233732,Job Title\nSenior Research Scientist-Medical I...,-1,107000.0,199000.0,yr,107000.0,199000.0
40,3751175227,True,Analytic Science - Scientist II,FICO,San Diego Metropolitan Area,$102K/yr - $160K/yr,https://www.linkedin.com/jobs/view/3751175227/...,20231113233737,FICO (NYSE: FICO) is a leading global analytic...,-1,102000.0,160000.0,yr,102000.0,160000.0


In [128]:
# job_data.iloc([job_data['lower_salaries']<=140000 & job_data['units']=='yr'])

# Corrected filtering
filtered_data = job_data[
(job_data['yearly_lower_salaries'] <= 140000) & (job_data['units'] == 'yr') | 
(job_data['upper_salaries'] >= 60) & (job_data['units'] == 'hr')
] 

# Display the filtered DataFrame
len(filtered_data)
np.shape(filtered_data)

(47, 13)

In [149]:
job_data[job_data['units'] == 'hr']

Unnamed: 0,job_ids,is_promoted,job_title,company_name,Location,pay,job_link,time_added,about_the_job,0_new__1_applied__2_skipped,lower_salaries,upper_salaries,units,yearly_lower_salaries,yearly_upper_salaries
105,3733577556,True,Neurodiagnostic Tech III PD,Cottage Health,"Santa Barbara, CA (On-site)",$57/hr,https://www.linkedin.com/jobs/view/3733577556/...,20231113234021,Job Description\n\nThe Neurodiagnostic Tech II...,-1,57.0,57.0,hr,3990.0,3990.0
136,3584740618,True,Sleep EEG Supervisor,Children's Hospital Colorado,"Aurora, CO (On-site)","$32/hr - $48.20/hr · Vision, +7 benefits",https://www.linkedin.com/jobs/view/3584740618/...,20231113234124,"Why Work at Children's.... \n\nHere, it’s diff...",-1,32.0,32.0,hr,2240.0,2240.0
143,3763034816,True,Neurodiagnostic Technologist I - Sleep Lab,UNC Health,"Lenoir, NC (On-site)",$18/hr - $22.60/hr,https://www.linkedin.com/jobs/view/3763034816/...,20231113234133,Description\n\nBecome part of an inclusive org...,-1,18.0,18.0,hr,1260.0,1260.0
155,3760190358,True,Research Fellow- Spine (2024-2025),Gillette Children's,"St Paul, MN (On-site)",$22/hr,https://www.linkedin.com/jobs/view/3760190358/...,20231113234215,Research fellowship opportunity for current me...,-1,22.0,22.0,hr,1540.0,1540.0


In [22]:
np.asarray(job_data['pay'][:10])

array(['$200K/yr - $350K/yr · 401(k), +3 benefits',
       '$124K/yr - $170K/yr · 4 benefits', nan, '4 benefits',
       'Medical benefit', '$185K/yr - $225K/yr · 401(k), +1 benefit',
       '4 benefits', '$175K/yr - $240K/yr · 8 benefits',
       '$140K/yr - $200K/yr', nan], dtype=object)

In [25]:
in_list = np.asarray(['$200K/yr - $350K/yr · 401(k), +3 benefits',
       '$124K/yr - $170K/yr · 4 benefits', np.nan, '4 benefits',
       'Medical benefit', '$185K/yr - $225K/yr · 401(k), +1 benefit',
       '4 benefits', '$175K/yr - $240K/yr · 8 benefits',
       '$140K/yr - $200K/yr', np.nan], dtype=object)


In [95]:


for item in in_list:
    matches = re.findall(r'\$([0-9]+(?:k)?)\s*\/\s*(yr|hr)', item.lower())
    print(matches)
    # asdf





[('200k', 'yr'), ('350k', 'yr')]
[('170k', 'yr')]
[]
[]
[]
[('185k', 'yr'), ('225k', 'yr')]
[]
[('45', 'hr'), ('240k', 'yr')]
[]
[('45', 'hr')]
[('140k', 'yr'), ('200k', 'yr')]
[]


In [None]:
matches[0][0].replace('K', '000'))

In [92]:
for k in matches:
    print(k)

('200k', 'yr')
('350k', 'yr')


[200000, 170000, nan, nan, nan, 185000, nan, 45, nan, 45, 140000, nan]
[350000, 170000, nan, nan, nan, 225000, nan, 240000, nan, 45, 200000, nan]
['yr', 'yr', 'NA', 'NA', 'NA', 'yr', 'NA', 'hr', 'NA', 'hr', 'yr', 'NA']


In [81]:

pay_min, pay_max, pay_time_unit
# pay_time_unit_tmp[0][:1]
# int(re.findall('\d+', pay_split_each[0]))

# re.findall('\d+', pay_split_each[0])[0]
# pay_time_unit_tmp


((nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan),
 (nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan),
 ('NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA'))

In [38]:
None if np.isnan(x) else x


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [27]:
import numpy as np
import re

in_list = np.asarray(['$200K/yr - $350K/yr · 401(k), +3 benefits',
       '$170K/yr · 4 benefits', 
                      np.nan, 
                      '4 benefits',
       'Medical benefit', '$185K/yr - $225K/yr · 401(k), +1 benefit',
       '4 benefits', '$45/hr - $240K/yr · 8 benefits',
                      '4 benefits', '$45/hr· 8 benefits',
       '$140K/yr - $200K/yr', 
                      np.nan], dtype=object)

min_pay = []
max_pay = []
pay_type = []

for x in in_list:
    if isinstance(x, str) and '$' in x:
        pay_values = re.findall(r'\$([0-9]+)K', x)
        pay_values = [int(val) * 1000 for val in pay_values]  # Convert K to actual number
        pay_type_value = re.search(r'/yr', x)

        if len(pay_values) == 1:
            min_pay.append(pay_values[0])
            max_pay.append(pay_values[0])
        elif len(pay_values) > 1:
            min_pay.append(pay_values[0])
            max_pay.append(pay_values[1])
        else:
            min_pay.append(None)
            max_pay.append(None)

        pay_type.append(pay_type_value.group() if pay_type_value else None)
    else:
        min_pay.append(None)
        max_pay.append(None)
        pay_type.append(None)

# Results
print("Min Pay:", min_pay)
print("Max Pay:", max_pay)
print("Pay Type:", pay_type)


Min Pay: [200000, 170000, None, None, None, 185000, None, 240000, None, None, 140000, None]
Max Pay: [350000, 170000, None, None, None, 225000, None, 240000, None, None, 200000, None]
Pay Type: ['/yr', '/yr', None, None, None, '/yr', None, '/yr', None, None, '/yr', None]
