# Inputs & outputs
- **Inputs:** O*NET task ratings (`Task Ratings.xlsx`), derived task vectors, task-level programming scores, and BLS/IPUMS salary tables from `../final_data/s8_salary_data/`.
- **Outputs:** aggregated salary impact simulations (`random_salary_dict_10000`, written back to the same directory) and summary stats feeding Supplementary Figure S17.

In [ ]:
import pandas as pd
import pickle
import numpy as np
from collections import defaultdict
from collections import Counter
import json
from tqdm import tqdm

In [None]:

# Directory holding the input and output artefacts of this notebook. The
# trailing slash matters: file names are appended by plain string concatenation.
data_path = '../final_data/s8_salary_data/'
# Task-rating table; the columns 'O*NET-SOC Code', 'Task ID', 'Scale ID',
# 'Category' and 'Standard Error' are the ones read further down.
df_task_rating = pd.read_excel(data_path + 'Task Ratings.xlsx')

def save_obj(obj, name, data_path_save = 'obj/'):
    """Pickle ``obj`` to the file ``data_path_save + name + '.pkl'``.

    The directory prefix is concatenated verbatim, so it has to end with a
    path separator already. The highest available pickle protocol is used.
    """
    target = data_path_save + name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
        

def load_obj(name, data_path_load = 'obj/'):
    """Return the object unpickled from ``data_path_load + name + '.pkl'``.

    The directory prefix is concatenated verbatim, so it has to end with a
    path separator already.
    """
    source = data_path_load + name + '.pkl'
    with open(source, 'rb') as handle:
        payload = pickle.load(handle)
    return payload


# Category labels '1'..'7' -> 0-based position inside a 7-slot vector.
category_dict = {str(slot + 1): slot for slot in range(7)}

# Per (SOC code, task id) containers for the 'Standard Error' column, one per
# rating scale. job_task_bool remembers which FT vectors were initialised.
job_task_bool = defaultdict(bool)
job_error_ft = {}
job_error_im = defaultdict(float)
job_error_rt = defaultdict(float)
non_count = 0  # number of FT rows whose standard error is missing

rating_rows = zip(
    df_task_rating['O*NET-SOC Code'],
    df_task_rating['Task ID'],
    df_task_rating['Scale ID'],
    df_task_rating['Category'],
    df_task_rating['Standard Error'],
)
for soc_id, task_id, scale_id, category, data_value in rating_rows:
    pair_key = (soc_id, str(task_id))

    if scale_id == 'FT':
        # First FT row of this pair: start from an all-zero 7-slot vector.
        if not job_task_bool[pair_key]:
            job_task_bool[pair_key] = True
            job_error_ft[pair_key] = [0] * 7

        slot_index = category_dict[str(int(category))]
        is_missing = pd.isna(data_value)
        if is_missing:
            non_count += 1
        # A missing standard error is stored as 0.
        job_error_ft[pair_key][slot_index] = 0 if is_missing else data_value
    elif scale_id == 'IM':
        job_error_im[pair_key] = data_value
    elif scale_id == 'RT':
        job_error_rt[pair_key] = data_value

print(len(job_error_ft), len(job_error_im), len(job_error_rt))
print(non_count)

In [None]:
# Pickled inputs stored under data_path.
job_ftlist_temp = load_obj('job_ftvector_list', data_path)
job_tasklist_ft = load_obj('job_tasklist_ft', data_path)
missing_pairs = load_obj('missing_pairs', data_path)

# For every job, gather the 7-slot standard-error vectors of its tasks in
# task-list order. A (job, task) pair without FT ratings raises KeyError.
job_ftlist_error = defaultdict(list)
job_task_pairs = [
    (job, task)
    for job, tasklist in job_tasklist_ft.items()
    for task in tasklist
]
for job, task in job_task_pairs:
    job_ftlist_error[job].append(job_error_ft[(job, task)])

In [None]:

# Fix the global NumPy seed so the random draws below are reproducible.
np.random.seed(43)

# Three alternative task-level programming-score sets. Each comes with a
# lookup table that turns a task's score label into the numeric weight used
# when the scores are combined with the working-time shares.
job_task_programming_score1 = load_obj("job_task_programming_score_gemini_0_5", data_path)
ps_dict1 = {
    '1': 0,
    '2': 0.125,
    '3': 0.375,
    '4': 0.625,
    '5': 0.875,
}

job_task_programming_score2 = load_obj("job_task_programming_score_gemini_0_5_adjusted", data_path)
ps_dict2 = {
    '0': 0,
    '1': 0.055,
    '2': 0.18,
    '3': 0.38,
    '4': 0.63,
    '5': 0.88,
}

# Third set: every distinct label v that occurs is mapped to int(v) / 100.
job_task_programming_score3 = load_obj('job_task_programming_score_0_100_from_gemini', data_path)
ps_temp = [ts for tsl in job_task_programming_score3.values() for ts in tsl.values()]
ps_dict3 = {v: int(v)/100 for v in set(ps_temp)}

# (score set, lookup table) pairs iterated by the simulation loop.
jplist = [
    (job_task_programming_score1, ps_dict1),
    (job_task_programming_score2, ps_dict2),
    (job_task_programming_score3, ps_dict3),
]

# ---------------------------------------------------------------------------
# Monte Carlo simulation of the four aggregate salary estimates.
#
# Every draw perturbs each task-frequency vector with normal noise whose scale
# is the stored standard error, rebuilds the two working-time-share variants,
# and evaluates four salary totals for each (score set, lookup table) pair in
# jplist. The result is random_salary_dict[draw] = three rows
# [salary_1, salary_2, salary_3, salary_4].
#
# Everything that does not depend on the random draw (salary tables, the IPUMS
# occupation list, the IPUMS -> O*NET mapping, the helper functions) is built
# once up front. The original code rebuilt all of it inside the innermost
# loop: the CSV was re-read on every call of calculate_programming_score_ipums
# (2 calls x 3 score sets x 10000 draws) and four pickles were re-loaded per
# inner iteration.
#
# NOTE(review): the time-share dicts used to be pickled to data_path as
# 'soc_working_time_vector_example_1_ft_normalized' and
# 'soc_working_time_vector_example_2_hour_vector' and immediately re-loaded on
# every inner iteration. They are now passed in memory, so this cell no longer
# overwrites those two files (previously they ended up holding the last random
# draw). Confirm nothing downstream relied on that overwrite.
# ---------------------------------------------------------------------------

# Weights applied to the 7 frequency slots in the two time-share variants.
# NOTE(review): presumably hours per 8-hour day and hours per year for the
# O*NET frequency categories -- confirm against the methods section.
hour_vector_example_1 = [0, 0.02*8, 0.05*8, 0.08*8, 0.10*8, 0.25*8, 0.5*8]
hour_vector_example_2 = [0.5, 1, 4, 48, 240, 480, 1920]

# Loop-invariant salary / employment tables.
annual_salary_bls_adjust = load_obj('annual_salary_bls_adjust', data_path)
# Loaded by the original cell as well; not used in the computation below.
hour_salary_bls_adjust = load_obj('hour_salary_bls_adjust', data_path)
employment_count_bls = load_obj('employment_count_bls_adjust', data_path)
ipums_salaryXemp = load_obj('ipums_salaryXemp_dict', data_path)

# NOTE(review): magic multipliers carried over unchanged; presumably a
# total-compensation factor (1.449) and an inflation adjustment (1.019518) --
# confirm.
empcomponent_rate = 1.449
fixed_rate = 1.019518 * 1.449

# Codes that are replaced by one combined code (the plain mean of the listed
# members) before matching against the BLS tables. Member order is kept as in
# the original arithmetic so that floating-point results stay identical.
_BLS_MERGED_CODES = (
    ('13-1020', ('13-1023', '13-1021', '13-1022')),
    ('13-2020', ('13-2023',)),
    ('29-2010', ('29-2011', '29-2012')),
    ('31-1120', ('31-1121', '31-1122')),
    ('39-7010', ('39-7011', '39-7012')),
    ('47-4090', ('47-4091', '47-4099')),
    ('51-2020', ('51-2021', '51-2022', '51-2023')),
    ('51-2090', ('51-2092',)),
)


def _time_shares_ft_normalized(job_ftlist):
    """Variant 1: column-normalise each job's frequency matrix, then weight.

    Returns {soc_id: vector with one working-time share per task}.
    """
    weights = np.array(hour_vector_example_1)
    shares = {}
    for soc_id, ft_matrix in job_ftlist.items():
        ftm = np.array(ft_matrix)
        column_totals = np.sum(ftm, axis=0)
        # Avoid dividing by zero when a frequency slot is empty for all tasks.
        column_totals[column_totals == 0] += 0.00000001
        task_hours = (ftm / column_totals).dot(weights)
        share = task_hours / 8
        # Re-normalise whenever the shares are off from 1 by more than 0.01.
        if np.abs(np.sum(share) - 1) > 0.01:
            share = task_hours / np.sum(task_hours)
        shares[soc_id] = share
    return shares


def _time_shares_hour_vector(job_ftlist):
    """Variant 2: weight the raw frequency matrix and normalise to sum 1.

    Returns {soc_id: vector with one working-time share per task}.
    """
    weights = np.array(hour_vector_example_2)
    shares = {}
    for soc_id, ft_matrix in job_ftlist.items():
        task_hours = np.array(ft_matrix).dot(weights)
        shares[soc_id] = task_hours / np.sum(task_hours)
    return shares


def _programming_hours_6digit(soc_working_time_vector, task_scores, score_map):
    """Programming share of working time, averaged per ``soc_id[:7]`` code.

    For every job in job_tasklist_ft the task time shares are dotted with the
    numeric programming scores of its tasks; jobs sharing the same first seven
    characters of their code are then averaged.
    """
    grouped = defaultdict(list)
    for soc_id, tasklist in job_tasklist_ft.items():
        score_vector = np.array([score_map[task_scores[soc_id][t]] for t in tasklist])
        grouped[soc_id[:7]].append(soc_working_time_vector[soc_id].dot(score_vector))
    return {soc6: np.mean(hs) for soc6, hs in grouped.items()}


def _merge_bls_codes(hours_6digit):
    """Return a copy in which the _BLS_MERGED_CODES members are combined.

    Raises KeyError if a listed member code is absent, as the original did.
    """
    merged = dict(hours_6digit)
    for target, members in _BLS_MERGED_CODES:
        merged[target] = sum([merged[m] for m in members]) / len(members)
        for m in members:
            del merged[m]
    return merged


def calculate_programming_score(soc_working_time_vector):
    """BLS variant: per-code programming share with merged codes.

    Uses the module-level ``job_task_programming_score`` and ``ps_dict``
    selected by the simulation loop at call time.
    """
    return _merge_bls_codes(
        _programming_hours_6digit(soc_working_time_vector, job_task_programming_score, ps_dict)
    )


def _load_ipums_job_list():
    """Read the IPUMS table once and format OCCSOC values as 'NN-NNNN'."""
    df = pd.read_csv(data_path+'ipums_clean_all_filter_emp_x_salary.csv')
    return [str(t)[:2] + '-' + str(t)[2:6] for t in df.OCCSOC]


def _wildcard_suffix_length(oi, ipums_codes):
    """Number of trailing characters of IPUMS code ``oi`` acting as wildcard.

    Returns None when ``oi`` is not an aggregate code. 'XX' / 'XXX' codes are
    skipped when a sibling 'YY' / 'YYY' code exists; those pairs are resolved
    by the manual table in _build_ipums_to_task_rating.
    """
    if oi[-3:] == '000':
        return 3
    if oi[-1] == '0':
        return 1
    if oi[-1] == 'X' and oi[-2] != 'X':
        return 1
    if oi[-2:] == 'XX' and oi[-3] != 'X' and oi[:-2] + 'YY' not in ipums_codes:
        return 2
    if oi[-3:] == 'XXX' and oi[:-3] + 'YYY' not in ipums_codes:
        return 3
    return None


def _build_ipums_to_task_rating(task_rating_job_list, ipums_job_list):
    """Map every IPUMS code to the list of task-rating codes it covers.

    Each task-rating code is claimed at most once during the automatic
    passes: exact matches first, then aggregate IPUMS codes in list order,
    then the manual XX/YY table. Four entries are finally overwritten with
    fixed lists.
    """
    ipums_to_task_rating = {s: [] for s in ipums_job_list}
    task_rating_job_bool = {s: False for s in task_rating_job_list}

    # Pass 1: identical codes. Dict membership replaces the list scan.
    for o in task_rating_job_list:
        if o in ipums_to_task_rating:
            ipums_to_task_rating[o].append(o)
            task_rating_job_bool[o] = True

    # Pass 2: aggregate IPUMS codes claim the still-unassigned codes that
    # share their prefix.
    for oi in ipums_job_list:
        width = _wildcard_suffix_length(oi, ipums_to_task_rating)
        if width is None:
            continue
        for ob in task_rating_job_list:
            if ob[:-width] == oi[:-width] and not task_rating_job_bool[ob]:
                ipums_to_task_rating[oi].append(ob)
                task_rating_job_bool[ob] = True

    # Pass 3: manual assignments for the XX / YY sibling codes.
    ipums_to_task_rating_not_assigned_dict_temp = {
        '17-21XX':['17-2151', '17-2171'],
        '17-21YY':['17-2161', '17-2199'],
        '43-4XXX':['43-4021', '43-4151'],
        '43-4YYY':['43-4011', '43-4199'],
        '47-50YY':['47-5010', '47-5071'],
        '47-50XX':['47-5061', '47-5081']
    }
    for jipums, bs in ipums_to_task_rating_not_assigned_dict_temp.items():
        for b in bs:
            if b in task_rating_job_bool and not task_rating_job_bool[b]:
                ipums_to_task_rating[jipums].append(b)
                task_rating_job_bool[b] = True

    # Pass 4: hard overrides.
    ipums_to_task_rating['47-50YY'] = ['47-5011', '47-5012','47-5013', '47-5071']
    ipums_to_task_rating['47-50XX'] = ['47-5051', '47-5081']
    ipums_to_task_rating['19-40XX'] = ['19-4042', '19-4043', '19-4051', '19-4061', '19-4071']
    ipums_to_task_rating['19-40YY'] = ['19-4092', '19-4099']

    return ipums_to_task_rating


# The mapping depends only on the job codes in job_tasklist_ft and on the
# IPUMS table, never on the random draw, so it is built exactly once.
ipums_job_list = _load_ipums_job_list()
ipums_to_task_rating = _build_ipums_to_task_rating(
    list(dict.fromkeys(soc_id[:7] for soc_id in job_tasklist_ft)),
    ipums_job_list,
)


def _average_over_ipums_codes(hours_6digit):
    """Mean programming share per IPUMS code; unmapped codes are omitted."""
    return {
        j: sum([hours_6digit[t] for t in v]) / len(v)
        for j, v in ipums_to_task_rating.items()
        if len(v) > 0
    }


def calculate_programming_score_ipums(soc_working_time_vector):
    """IPUMS variant: returns (programming share per IPUMS code, mapping).

    Uses the module-level ``job_task_programming_score`` and ``ps_dict``
    selected by the simulation loop at call time. The mapping returned is the
    shared, precomputed ``ipums_to_task_rating`` dict.
    """
    hours_6digit = _programming_hours_6digit(
        soc_working_time_vector, job_task_programming_score, ps_dict
    )
    return _average_over_ipums_codes(hours_6digit), ipums_to_task_rating


random_salary_dict = {}

for rdi in tqdm(range(10000)):
    random_salary_dict[rdi] = []

    # One noisy copy of every task-frequency vector: normal noise around the
    # stored values, clipped to stay positive, rescaled to sum to 100.
    job_ftlist = {}
    for job in job_ftlist_temp.keys():
        job_ftlist[job] = []
        for ftm, fte in zip(job_ftlist_temp[job], job_ftlist_error[job]):
            temp1 = np.random.normal(loc=ftm, scale=fte)
            temp = np.clip(temp1, a_min=0.0000001, a_max=None)
            job_ftlist[job].append(list(temp/np.sum(temp) * 100))

    # Both time-share variants depend only on the draw, not on the score set,
    # so they are computed once per draw instead of once per score set.
    soc_working_time_vector_per_day = _time_shares_ft_normalized(job_ftlist)
    soc_working_time_vector = _time_shares_hour_vector(job_ftlist)

    for jppair in jplist:
        job_task_programming_score = jppair[0]
        ps_dict = jppair[1]

        # Shared by the BLS and IPUMS estimates of the same variant.
        hours_example_1 = _programming_hours_6digit(
            soc_working_time_vector_per_day, job_task_programming_score, ps_dict
        )
        hours_example_2 = _programming_hours_6digit(
            soc_working_time_vector, job_task_programming_score, ps_dict
        )

        # --- BLS salary -----------------------------------------------------
        job_programming_hour_6digit1 = _merge_bls_codes(hours_example_1)
        job_programming_hour_6digit2 = _merge_bls_codes(hours_example_2)

        # First column: BLS, variant 1. Codes missing from either BLS table
        # are skipped.
        salary_1 = sum([empcomponent_rate *pwh*annual_salary_bls_adjust[soc]*employment_count_bls[soc] for soc, pwh in job_programming_hour_6digit1.items() if soc in annual_salary_bls_adjust and soc in employment_count_bls])

        # Third column: BLS, variant 2.
        salary_3 = sum([empcomponent_rate *pwh*annual_salary_bls_adjust[soc]*employment_count_bls[soc] for soc, pwh in job_programming_hour_6digit2.items() if soc in annual_salary_bls_adjust and soc in employment_count_bls])

        # --- IPUMS salary ---------------------------------------------------
        job_programming_hour_6digit1 = _average_over_ipums_codes(hours_example_1)
        job_programming_hour_6digit2 = _average_over_ipums_codes(hours_example_2)

        # Second column: IPUMS, variant 1. Raises KeyError for a code that is
        # missing from ipums_salaryXemp, as the original did.
        salary_2 = sum([fixed_rate * v * ipums_salaryXemp[j] for j, v in job_programming_hour_6digit1.items()])

        # Fourth column: IPUMS, variant 2.
        salary_4 = sum([fixed_rate * v * ipums_salaryXemp[j] for j, v in job_programming_hour_6digit2.items()])

        random_salary_dict[rdi].append([salary_1, salary_2, salary_3, salary_4])

save_obj(random_salary_dict, 'random_salary_dict_10000', data_path)

In [None]:
# Reload the persisted simulation results and stack them into a single array
# with the simulation draw as the leading axis.
random_salary_dict = load_obj('random_salary_dict_10000', data_path)
data = np.array(list(random_salary_dict.values()))

In [None]:

# Define the dimensions of the data.
# The number of simulated matrices is read from the loaded array rather than
# hard-coded, so the standard error stays correct if the number of draws in
# the simulation changes.
n_matrices = data.shape[0]
rows = 3  # one row per score set (kept for reference; not used below)
cols = 4  # one column per salary estimate (kept for reference; not used below)

confidence_level = 0.95
z_score = 1.96  # two-sided normal quantile matching the 95% level above

# Per-cell mean and (population, ddof=0) standard deviation across draws.
means = np.mean(data, axis=0)
stds = np.std(data, axis=0)

# Standard error of the Monte Carlo mean and the corresponding margin.
sem = stds / np.sqrt(n_matrices)
# NOTE(review): the division by 1e9 presumably reports the margin in billions
# of the salary unit -- confirm.
margin_of_error = z_score * sem / 1000000000


print(margin_of_error)

In [None]:
# Display the per-cell means taken over the simulation axis.
means