## Compute the Average Gradients of the Healthcare Tasks


In [1]:
# encoding=utf8
import os
import numpy as np
import pandas as pd
import getpass
import pickle
import string
import matplotlib.pyplot as plt
from matplotlib import rc
rc('font',**{'family': 'serif', 'serif':"Computer Modern Roman"})
rc('text', usetex=True)
%matplotlib inline


In [2]:
## # Input DATA set of observed Occupations and Tasks
datasets = '/home/'+ getpass.getuser() +'/Datasets/'
d = os.path.join(datasets, 'FoHealthcare/Final_dataset_06112018.xlsx')

excel_doc = pd.ExcelFile(d)
dataset = excel_doc.parse(sheet_name='ExpandedTasks')
dataset = dataset[dataset['clinical'] != 1]   # # REMOVE CLINICAL TASKS: 
print("All Task-Occu pairs: %s" % str(dataset.shape[0]))


task_dataset = excel_doc.parse(sheet_name='Tasks')
task_dataset = task_dataset[task_dataset['clinical'] != 1]
print("unique clerical tasks dataset:", task_dataset.shape)

All Task-Occu pairs: 264
unique clerical tasks dataset: (108, 23)


In [3]:
keep_hc_cols = ['Observed Occupation', 'Task', 'DWA Task', 'DWA ID', 'Task Weight', 'Automation Scores',
       'Weighted Average Automation Score']
HC_data = dataset[keep_hc_cols]
task_dataset = task_dataset[keep_hc_cols]
print("Tasks dataset: %s" % len(task_dataset["Task"]))
print("HC data: %s" % len(HC_data["Task"]))
unique_occupations = dataset["Observed Occupation"].unique()
HC_data.head()

Tasks dataset: 108
HC data: 264


Unnamed: 0,Observed Occupation,Task,DWA Task,DWA ID,Task Weight,Automation Scores,Weighted Average Automation Score
0,Administrator,Address problems that arise with building,"Notify others of emergencies, problems, or haz...",4.A.4.a.2.I08.D07; 4.A.4.a.2.I08.D04; 4.A.4.a....,0.20; 0.20; 0.20; 0.20; 0.20;,2.92; 3.08; 3.52; 3.29; 2.72;,3.105316
1,Administrator,Answer phone,Answer telephones to direct calls or provide i...,4.A.4.a.3.I03.D11,1.00;,3.36;,3.361812
2,Administrator,Checking for errors in paperwork,Check data for recording errors.,4.A.2.a.2.I01.D08,1.00;,3.33;,3.329314
3,Administrator,Cleaning up information in the patients electr...,Process healthcare paperwork.,4.A.4.c.1.I01.D03,1.00;,2.78;,2.782653
4,Administrator,Connecting human resources/making introduction...,Relay information between personnel.,4.A.4.a.2.I03.D11,1.00;,3.27;,3.268373


In [4]:
def risk_cat(score):
    return int(np.floor(score))

# Add the risk category as a new column
task_dataset["cat_n"] = task_dataset["Weighted Average Automation Score"].apply(lambda col: risk_cat(col))
low = task_dataset[task_dataset["cat_n"] == 1]
med = task_dataset[task_dataset["cat_n"] == 2]
high = task_dataset[task_dataset["cat_n"] == 3]
print("These are our sets to calcluate gradients over: ", low.shape, med.shape, high.shape)
low

These are our sets to calcluate gradients over:  (6, 8) (55, 8) (47, 8)


Unnamed: 0,Observed Occupation,Task,DWA Task,DWA ID,Task Weight,Automation Scores,Weighted Average Automation Score,cat_n
48,Healthcare Assistant,Print out lab test labels in ICE,Apply identification labels or tags.,4.A.1.b.1.I01.D01,1.00;,1.76;,1.76017,1
61,Phlebotomist,Label blood vials,Apply identification labels or tags.,4.A.1.b.1.I01.D01,1.00;,1.76;,1.76017,1
68,Practice Manager,Manage finances for the practice including pay...,Manage organizational or program finances.; Ma...,4.A.4.b.4.I09.D08; 4.A.4.b.4.I09.D06,0.50; 0.50;,1.44; 2.23;,1.832718,1
72,Practice Manager,Staff recruitment,Recruit personnel.,4.A.4.c.2.I01.D05,1.00;,1.82;,1.824356,1
75,Practice Manager,Writing and updating policies,Evaluate effectiveness of personnel policies o...,4.A.2.a.1.I02.D04,1.00;,1.96;,1.958048,1
80,Practice Manager,Having practice staff take online training,Coordinate training activities.; Direct employ...,4.A.4.b.4.I07.D01; 4.A.4.b.4.I07.D03,0.50; 0.50;,1.16; 2.66;,1.907981,1


In [5]:
infile = open('aies_data/average_gradients.pkl','rb')
average_gradients = pickle.load(infile)
infile.close()

infile = open('aies_data/dwa_order.pkl','rb')
dwa_order = pickle.load(infile)
infile.close()

infile = open('aies_data/all_pointwise_gradients.pkl','rb')
pointwise_gradients = pickle.load(infile)
infile.close()

infile = open('aies_data/feature_order.pkl','rb')
feature_order = pickle.load(infile)
infile.close()

# Load all the gradient information from AIES repo ... 
print(average_gradients.shape, pointwise_gradients.shape, feature_order[:10], dwa_order[:5])

dwa_grads = pd.DataFrame(data = pointwise_gradients, columns=feature_order)
dwa_grads["DWA ID"] = pd.Series([i[0] for i in dwa_order])

(120,) (2067, 120) ['Active Learning', 'Active Listening', 'Complex Problem Solving', 'Coordination', 'Critical Thinking', 'Equipment Maintenance', 'Equipment Selection', 'Installation', 'Instructing', 'Judgment and Decision Making'] [['4.A.1.a.1.I01.D01']
 ['4.A.1.a.1.I01.D04']
 ['4.A.1.a.1.I02.D08']
 ['4.A.1.a.1.I02.D09']
 ['4.A.1.a.1.I02.D10']]


In [6]:
with open("aies_data/ibcc_train_test_data.pkl", 'rb') as f:
    data = pickle.load(f)
f.close()

X, y = data[0].iloc[:, 1:-2].values, data[2].iloc[:, 0].values
X_test = data[1].iloc[:, 1:].values
y = np.round(y * 2.) / 2.
features = list(data[0].iloc[:, 1:-2].columns)

X_all = np.vstack([X, X_test])
dwas = np.vstack([data[0].iloc[:, 0].values.reshape(-1, 1), data[1].iloc[:, 0].values.reshape(-1, 1)])

dwa_feats = pd.DataFrame(data=X_all, columns=feature_order)
dwa_feats["DWA ID"] = pd.Series([i[0] for i in dwa_order])
dwa_feats.head()

Unnamed: 0,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,Judgment and Decision Making,...,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression,DWA ID
0,3.206031,3.798952,3.308485,3.255014,3.465971,1.352048,1.586345,1.043563,2.842514,3.411688,...,1.153341,1.217814,2.52091,1.451737,3.352828,3.520535,1.413655,3.788355,3.342607,4.A.1.a.1.I01.D01
1,3.241681,3.920024,3.189726,3.530566,3.621624,1.091567,1.192623,1.053746,3.022365,3.459405,...,1.839739,1.700592,2.768509,1.993082,2.454878,2.957088,1.224493,3.841295,3.43758,4.A.1.a.1.I01.D04
2,3.15153,3.621651,3.387262,3.101375,3.626099,1.573449,1.762136,1.336109,2.818544,3.308758,...,1.301533,1.415211,2.663005,1.708003,2.827332,3.327724,1.693029,3.825821,3.568996,4.A.1.a.1.I02.D08
3,2.692799,3.073483,2.878213,3.096213,3.190303,2.117181,2.339092,1.657604,2.499585,2.945194,...,2.835105,3.150562,2.62212,3.293398,2.718646,3.235995,2.032858,2.93504,2.543448,4.A.1.a.1.I02.D09
4,2.791359,3.322016,3.040584,3.132688,3.338834,2.130404,1.735284,1.215225,2.635423,3.108596,...,2.295489,2.602196,2.747473,2.55115,2.789201,2.908992,2.024323,3.251757,3.05246,4.A.1.a.1.I02.D10


In [7]:
low.head()

Unnamed: 0,Observed Occupation,Task,DWA Task,DWA ID,Task Weight,Automation Scores,Weighted Average Automation Score,cat_n
48,Healthcare Assistant,Print out lab test labels in ICE,Apply identification labels or tags.,4.A.1.b.1.I01.D01,1.00;,1.76;,1.76017,1
61,Phlebotomist,Label blood vials,Apply identification labels or tags.,4.A.1.b.1.I01.D01,1.00;,1.76;,1.76017,1
68,Practice Manager,Manage finances for the practice including pay...,Manage organizational or program finances.; Ma...,4.A.4.b.4.I09.D08; 4.A.4.b.4.I09.D06,0.50; 0.50;,1.44; 2.23;,1.832718,1
72,Practice Manager,Staff recruitment,Recruit personnel.,4.A.4.c.2.I01.D05,1.00;,1.82;,1.824356,1
75,Practice Manager,Writing and updating policies,Evaluate effectiveness of personnel policies o...,4.A.2.a.1.I02.D04,1.00;,1.96;,1.958048,1


In [8]:
dwa_feats[dwa_feats["DWA ID"]=="4.A.4.b.4.I09.D06"]

Unnamed: 0,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,Judgment and Decision Making,...,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression,DWA ID
1880,3.38,3.75,4.0,3.12,4.25,1.0,1.0,1.0,2.88,4.25,...,1.0,1.0,2.5,1.25,2.12,2.75,1.12,3.88,3.5,4.A.4.b.4.I09.D06


In [9]:
# # # These are the column name groupings... (used in the latex outputs)
cols = feature_order
skills = cols[:35]
knowledges = cols[35:68]
abilities = cols[68:]

characteristic_dict = {}
for i in skills:
    characteristic_dict[i] = "skill"
for i in knowledges:
    characteristic_dict[i] = "knowledge"
for i in abilities:
    characteristic_dict[i] = "ability"
# characteristic_dict


def print_top(col_name, cap):
     print("\\begin{table}[h!] \
    \\caption{%s} \
    \\centering \\small \
    \\begin{tabular}{@{}lc@{}} \\toprule{}  \
    O*NET Feature & %s  \\\\  \\midrule" % (cap, col_name))

def print_bot(label):
    print("\\bottomrule \
    \\end{tabular} \
    \\label{tab:%s} \
    \\end{table}" % label)
    return

In [10]:

# number_of_features_presented = 5

# for subset_of_data, risk in zip([low, high, med], ["not-automatable", "automatable", "partly-automatable"]):
#     print("\n %%", risk, subset_of_data.shape)
    
#     avg_features, dwa_grads_per_set = [],[]
#     for index, row in subset_of_data.iterrows():
#         dwas = [i.strip() for i in row['DWA ID'].split(";")]
# #         print(dwas)

#         weights = [i.strip() for i in row['Task Weight'].split(";")]
        
#         feat_subset = dwa_feats[dwa_feats["DWA ID"].isin(dwas)][cols].values  # FYI: this is not ordered the same as "dwas"
#         grad_subset = dwa_grads[dwa_grads["DWA ID"].isin(dwas)][cols].values  # FYI: this is not ordered the same as "dwas" 
# #         print(grad_subset)
# #         print(feat_subset)
    
#         avg_features.append(feat_subset.mean(axis=0)) 
#         dwa_grads_per_set.append(grad_subset.mean(axis=0)) 
        
#     average_grads = np.vstack(dwa_grads_per_set).mean(axis=0)
#     avg_features =  np.vstack(avg_features).mean(axis=0)


#     # # These are the characteristics of those tasks (in the sub-set)
#     neg_grads  = [(col, grad) for col, grad in sorted(zip(average_grads, cols))][:number_of_features_presented]
# #     poss_grads = [(col, grad) for col, grad in sorted(zip(average_grads, cols))][-number_of_features_presented:]  ## This means they are sorted differently (but gives you the same set of 10)
#     poss_grads = [(col, grad) for col, grad in sorted(zip(average_grads, cols), reverse=True)][:number_of_features_presented]

    
#     ### Print the results to latex:
#     print("\n %% AVERAGE FEATURE DIFFERENCES:")
#     caption = "Largest feature differences relative to the population for \\textcolor{vubbleu}{%s} healthcare tasks" % risk
#     print_top("Feature Difference", caption)
#     mean_dataset_features = dwa_feats.mean(axis=0)
#     percentage_diffs = ((avg_features - mean_dataset_features) / mean_dataset_features)*100
    
#     # # REMOVED THE NEGATIVE FEATURES
# #     for i, j in sorted(zip(percentage_diffs, cols))[:number_of_features_presented]:
# #         print(j , "(%s) & \\SI{%0.1f}{\\percent}  \\\\ " % (characteristic_dict[j], i)) 
#     for i, j in sorted(zip( percentage_diffs, cols), reverse=True)[:number_of_features_presented]:
#         print(j , "(%s) & +\\SI{%0.1f}{\\percent}  \\\\ " % (characteristic_dict[j], i)) 
#     print("\\midrule")
#     label = "tab:feature_diff_%s" % risk
#     print_bot(label)
        
    
#     print("\n %% AVERAGE FEATURE GRADIENTS:")    
#     caption = "O*NET features with the largest (possitive and negative) derivatives for \\textcolor{vubbleu}{%s} healthcare tasks." % risk
#     print_top("Feature Gradient", caption)
    
#  # # REMOVED THE NEGATIVE FEATURES
# #     for (i, j) in neg_grads:
# #         print(j , "(%s) & %0.3f  \\\\ " % (characteristic_dict[j], i)) 
    
#     for (i, j) in poss_grads:
#         print(j , "(%s) & +%0.3f  \\\\ " % (characteristic_dict[j], i))
#     print("\\midrule")
#     label = "tab:feature_grad_%s" % risk    
#     print_bot(label)

## Just two tables instead of threex2:

In [11]:
def print_top2(col_name, cap):
     print("\\begin{table}[h!] \
    \\caption{%s} \
    \\centering \\small \
    \\begin{tabular}{@{}llc@{}} \\toprule{}  \
    \\textbf{Risk Category} & \\textbf{O*NET Feature} & \\textbf{%s}  \\\\  \\midrule" % (cap, col_name))
        

In [12]:
print("%% taken from Healthcare_DWA_Gradients.ipynb,\n")

number_of_features_presented = 5

caption = "Largest feature differences relative to the dataset by risk category"
print_top2("Feature Difference", caption)
for cnt, (subset_of_data, risk) in enumerate(zip([low, high, med ], ["not-automatable", "automatable", "partly-automatable"])):
    print("\n %%", risk, subset_of_data.shape)
    
    avg_features, dwa_grads_per_set = [],[]
    for index, row in subset_of_data.iterrows():
        dwas = [i.strip() for i in row['DWA ID'].split(";")]
#         print(dwas)


        weights = [i.strip() for i in row['Task Weight'].split(";")]
        feat_subset = dwa_feats[dwa_feats["DWA ID"].isin(dwas)][cols].values  # FYI: this is not ordered the same as "dwas"
        grad_subset = dwa_grads[dwa_grads["DWA ID"].isin(dwas)][cols].values  # FYI: this is not ordered the same as "dwas" 
#         print(grad_subset)
#         print(feat_subset)
    
        avg_features.append(feat_subset.mean(axis=0)) 
        dwa_grads_per_set.append(grad_subset.mean(axis=0)) 
        
    average_grads = np.vstack(dwa_grads_per_set).mean(axis=0)
    avg_features =  np.vstack(avg_features).mean(axis=0)


    # # These are the characteristics of those tasks (in the sub-set)
    neg_grads  = [(col, grad) for col, grad in sorted(zip(average_grads, cols))][:number_of_features_presented]
#     poss_grads = [(col, grad) for col, grad in sorted(zip(average_grads, cols))][-number_of_features_presented:]  ## This means they are sorted differently (but gives you the same set of 10)
    poss_grads = [(col, grad) for col, grad in sorted(zip(average_grads, cols), reverse=True)][:number_of_features_presented]

    print("\n %% AVERAGE FEATURE DIFFERENCES:")
    mean_dataset_features = dwa_feats.mean(axis=0)
    percentage_diffs = ((avg_features - mean_dataset_features) / mean_dataset_features)*100
    for ind, (i, j) in enumerate(sorted(zip( percentage_diffs, cols), reverse=True)[:number_of_features_presented]):
        if ind==0:
            print(risk, " & ", j , "(%s) & +\\SI{%0.1f}{\\percent}  \\\\ " % (characteristic_dict[j], i)) 
        else:
            print(" & ",j , "(%s) & +\\SI{%0.1f}{\\percent}  \\\\ " % (characteristic_dict[j], i))   

#         REMOVE NEGATIVE FEATURES
#     print("\\cdashline{2-3}")       
#     for i, j in sorted(zip(percentage_diffs, cols))[:number_of_features_presented]:
#             print(" & ", j , "(%s) & \\SI{%0.1f}{\\percent}  \\\\ " % (characteristic_dict[j], i)) 

    if cnt != 2: print("\\midrule")
label = "feature_diffs"
print_bot(label)

print("\n")

 # # #  JUST THE SAME LOOP AGAIN BECUASE IM LAZY...
 
# caption = "O*NET features with the largest average derivatives by risk category"
# print_top2("Feature Gradient", caption)
# for cnt, (subset_of_data, risk) in enumerate(zip([low, high, med ], ["not-automatable", "automatable", "partly-automatable"])):
#     print("\n %%", risk, subset_of_data.shape)
    
#     avg_features, dwa_grads_per_set = [],[]
#     for index, row in subset_of_data.iterrows():
#         dwas = [i.strip() for i in row['DWA ID'].split(";")]
#         weights = [i.strip() for i in row['Task Weight'].split(";")]
       
#         feat_subset = dwa_feats[dwa_feats["DWA ID"].isin(dwas)][cols].values  # FYI: this is not ordered the same as "dwas"
#         grad_subset = dwa_grads[dwa_grads["DWA ID"].isin(dwas)][cols].values  # FYI: this is not ordered the same as "dwas" 
   
#         avg_features.append(feat_subset.mean(axis=0)) 
#         dwa_grads_per_set.append(grad_subset.mean(axis=0)) 
        
#     average_grads = np.vstack(dwa_grads_per_set).mean(axis=0)
#     avg_features =  np.vstack(avg_features).mean(axis=0) 
        

#     # # These are the characteristics of those tasks (in the sub-set)
#     neg_grads  = [(col, grad) for col, grad in sorted(zip(average_grads, cols))][:number_of_features_presented]
# #     poss_grads = [(col, grad) for col, grad in sorted(zip(average_grads, cols))][-number_of_features_presented:]  ## This means they are sorted differently (but gives you the same set of 10)
#     poss_grads = [(col, grad) for col, grad in sorted(zip(average_grads, cols), reverse=True)][:number_of_features_presented]

#     ### Print the results to latex:
#     print("\n %% AVERAGE FEATURE GRADIENTS:")    

#     for ind, (i, j) in enumerate(poss_grads):
#         if ind==0:
#             print(risk, " & ",  j , "(%s) & +%0.3f  \\\\ " % (characteristic_dict[j], i))
#         else:
#             print(" & ", j , "(%s) & +%0.3f  \\\\ " % (characteristic_dict[j], i))
            
# #         REMOVE NEGATIVE FEATURES            
# #     print("\\cdashline{2-3}")  
# #     for (i, j) in neg_grads:
# #         print(" & ", j , "(%s) & %0.3f  \\\\ " % (characteristic_dict[j], i)) 


#     if cnt != 2: print("\\midrule")
# label = "feature_grads"
# print_bot(label)  

%% taken from Healthcare_DWA_Gradients.ipynb,

\begin{table}[h!]     \caption{Largest feature differences relative to the dataset by risk category}     \centering \small     \begin{tabular}{@{}llc@{}} \toprule{}      \textbf{Risk Category} & \textbf{O*NET Feature} & \textbf{Feature Difference}  \\  \midrule

 %% not-automatable (6, 8)

 %% AVERAGE FEATURE DIFFERENCES:
not-automatable  &  Installation (skill) & +\SI{62.6}{\percent}  \\ 
 &  Building and Construction (knowledge) & +\SI{27.2}{\percent}  \\ 
 &  Personnel and Human Resources (knowledge) & +\SI{26.0}{\percent}  \\ 
 &  Management of Financial Resources (skill) & +\SI{18.7}{\percent}  \\ 
 &  Education and Training (knowledge) & +\SI{17.0}{\percent}  \\ 
\midrule

 %% automatable (47, 8)

 %% AVERAGE FEATURE DIFFERENCES:
automatable  &  Clerical (knowledge) & +\SI{24.4}{\percent}  \\ 
 &  Customer and Personal Service (knowledge) & +\SI{13.2}{\percent}  \\ 
 &  Service Orientation (skill) & +\SI{5.0}{\percent}  \\ 
 &  Econo

In [13]:
def print_top3(col_name, cap):
     print("\\begin{table}[h!] \
    \\caption*{%s} \
    \\centering \\small \
    \\begin{tabular}{@{}lc@{}} \
    \\textbf{O*NET Feature} & \\textbf{%s}  \\\\  \\midrule" % (cap, col_name))
        
def print_3_task_table(data, occu, cap, label, max_length = 55):
    display_number_of_tasks = 3
    
    display_cols = ["Task", "Weighted Average Automation Score"]
    dataset_tasks = data.sort_values(by="Weighted Average Automation Score")

    low = dataset_tasks.iloc[:display_number_of_tasks,:][display_cols]
    high = dataset_tasks.iloc[-display_number_of_tasks:,:].iloc[::-1][display_cols]
    

    print("\\begin{center} \
    \\begin{longtable}[t]{K{.1\linewidth}K{.6\linewidth}C{.15\linewidth}} \
    \\caption*{%s} \\label{%s} \\\\ \
    \\small \
       & \\textbf{Task} & \\textbf{Automation Score}  \\\\  \\midrule" % (cap, label))
 
    for cnt, (i, row) in enumerate(low.iterrows()):
        if cnt == 0:
            print("Lowest & %s & %0.3f \\\\" % (row[0], row[1] ) )
        else:
            print(" & %s & %0.3f \\\\" % (row[0], row[1] ) )
    print("\\cdashline{2-3}")
    
    for cnt, (i, row) in enumerate(high.iterrows()):
        if cnt == 0:
            print("Highest & %s & %0.3f \\\\" % (row[0], row[1] ) )    
        else:
            print(" &  %s & %0.3f \\\\" % (row[0], row[1] ) )

    print("\\bottomrule \
    \\end{longtable} \
    \end{center} \
    ")
    return


## This is Appendix data:

In [14]:
print("%% taken from Healthcare_DWA_Gradients.ipynb,\n")

number_of_features_presented = 5

for occu in unique_occupations:
    subset_of_data = HC_data[HC_data["Observed Occupation"] == occu]
    
    risk = occu.replace(" ","_")
    print("\n %%", risk, subset_of_data.shape)
    print("\\subsubsection*{%s}" % occu)
    
    # # In here - put the 3 most, and the 3 least automatable tasks... 
    caption = "\\textcolor{vubbleu}{%s}: Three most and least automatable tasks" % occu
    label = "app:tab:3tasks_%s" % risk
    print_3_task_table(subset_of_data, occu, caption, label)
    
    
    avg_features, dwa_grads_per_set = [],[]
    for index, row in subset_of_data.iterrows():
        dwas = [i.strip() for i in row['DWA ID'].split(";")]
        weights = [i.strip() for i in row['Task Weight'].split(";")]
        feat_subset = dwa_feats[dwa_feats["DWA ID"].isin(dwas)][cols].values  # FYI: this is not ordered the same as "dwas"
        grad_subset = dwa_grads[dwa_grads["DWA ID"].isin(dwas)][cols].values  # FYI: this is not ordered the same as "dwas" 
   
        avg_features.append(feat_subset.mean(axis=0)) 
        dwa_grads_per_set.append(grad_subset.mean(axis=0)) 
        
    average_grads = np.vstack(dwa_grads_per_set).mean(axis=0)
    avg_features =  np.vstack(avg_features).mean(axis=0) 
        
    # # These are the characteristics of those tasks (in the sub-set)
    neg_grads  = [(col, grad) for col, grad in sorted(zip(average_grads, cols))][:number_of_features_presented]
#     poss_grads = [(col, grad) for col, grad in sorted(zip(average_grads, cols))][-number_of_features_presented:]  ## This means they are sorted differently (but gives you the same set of 10)
    poss_grads = [(col, grad) for col, grad in sorted(zip(average_grads, cols), reverse=True)][:number_of_features_presented]
   
    
    ### Print the results to latex:
    print("\n %% AVERAGE FEATURE DIFFERENCES:")
    caption = "\\textcolor{vubbleu}{%s}: Largest feature differences relative to the population" % occu
    print_top3("Feature Difference", caption)
    mean_dataset_features = dwa_feats.mean(axis=0)
    percentage_diffs = ((avg_features - mean_dataset_features) / mean_dataset_features)*100    
    for i, j in sorted(zip( percentage_diffs, cols), reverse=True)[:number_of_features_presented]:
        print(j.replace("_", "\_")  , "(%s) & +\\SI{%0.1f}{\\percent}  \\\\ " % (characteristic_dict[j], i)) 
    print("\\midrule")    
    for i, j in sorted(zip(percentage_diffs, cols))[:number_of_features_presented]:
        print(j.replace("_", "\_")  , "(%s) & \\SI{%0.1f}{\\percent}  \\\\ " % (characteristic_dict[j], i)) 
    label = "tab:feature_diff_%s" % risk
    print_bot(label)
        
    
    print("\n %% AVERAGE FEATURE GRADIENTS:")    
    caption = "\\textcolor{vubbleu}{%s}: O*NET features with the largest (positive and negative) derivatives." % occu
    print_top3("Feature Gradient", caption)
    for (i, j) in neg_grads:
        print(j.replace("_", "\_")  , "(%s) & %0.3f  \\\\ " % (characteristic_dict[j], i)) 
    print("\\midrule")        
    for (i, j) in poss_grads:
        print(j.replace("_", "\_")  , "(%s) & +%0.3f  \\\\ " % (characteristic_dict[j], i))
    label = "tab:feature_grad_%s" % risk    
    print_bot(label)
    
    print("\\newpage")
    

%% taken from Healthcare_DWA_Gradients.ipynb,


 %% Administrator (40, 7)
\subsubsection*{Administrator}
\begin{center}     \begin{longtable}[t]{K{.1\linewidth}K{.6\linewidth}C{.15\linewidth}}     \caption*{\textcolor{vubbleu}{Administrator}: Three most and least automatable tasks} \label{app:tab:3tasks_Administrator} \\     \small        & \textbf{Task} & \textbf{Automation Score}  \\  \midrule
Lowest & Staff recruitment & 1.824 \\
 & Manage pension schemes & 2.108 \\
 & Write notes on paper & 2.235 \\
\cdashline{2-3}
Highest & Mass mail letters for checkups using DocMail & 3.725 \\
 &  Print letters & 3.609 \\
 &  Use texting service or patient management service to contact patients for different clinics and to send reminders. & 3.604 \\
\bottomrule     \end{longtable}     \end{center}     

 %% AVERAGE FEATURE DIFFERENCES:
\begin{table}[h!]     \caption*{\textcolor{vubbleu}{Administrator}: Largest feature differences relative to the population}     \centering \small     \begin{tabul

## Global Gradients:

In [15]:

def print_long_top(col_name, cap):
     print("\\begin{center} \
     \\begin{longtable}{@{}lc@{}} \
    \\caption{%s} \
    \\label{app:onet_variable} \\\\  \
    \\small \
    \\textbf{O*NET Feature} & \\textbf{%s}  \\\\  \\midrule" % (cap, col_name))

def print_long_bot(label):
    print("\\bottomrule \
    \\end{longtable} \
    \\label{tab:%s} \
    \\end{center}" % label)
    return

In [16]:
dwa_all_grads = []
for index, row in task_dataset.iterrows():
    dwas = [i.strip() for i in row['DWA ID'].split(";")]
    weights = [i.strip() for i in row['Task Weight'].split(";")]
    grad_subset = dwa_grads[dwa_grads["DWA ID"].isin(dwas)][cols].values  # FYI: this is not ordered the same as "dwas" 
    dwa_grads_per_set.append(grad_subset.mean(axis=0))
    
dwa_grads = np.vstack(dwa_grads_per_set).mean(axis=0)

In [18]:
pos_features_presented = int(np.ceil(dwa_grads).sum())
neg_features_presented = int(120 - np.ceil(dwa_grads).sum())

# # These are the characteristics of those tasks (in the sub-set)
neg_grads = [(col, grad) for col, grad in sorted(zip(average_grads, cols))][:neg_features_presented]
poss_grads = [(col, grad) for col, grad in sorted(zip(average_grads, cols), reverse=True)][:pos_features_presented]

risk = occu = "Global"
print("%% taken from Healthcare_DWA_Gradients.ipynb,\n")

print("\n %% GLOBAL AVERAGE FEATURE GRADIENTS:")    
caption = "\\textcolor{vubbleu}{%s}: All O*NET feature derivatives." % occu
print_long_top("Feature Gradient", caption)
for (i, j) in poss_grads:
    
    print(j.replace("_", "\_") , "(%s) & +%0.3f  \\\\ " % (characteristic_dict[j], i))

print("\\midrule")
for (i, j) in neg_grads:
    print(j.replace("_", "\_") , "(%s) & %0.3f  \\\\ " % (characteristic_dict[j], i)) 
label = "feature_grad_%s" % risk    
print_long_bot(label)


%% taken from Healthcare_DWA_Gradients.ipynb,


 %% GLOBAL AVERAGE FEATURE GRADIENTS:
\begin{center}      \begin{longtable}{@{}lc@{}}     \caption{\textcolor{vubbleu}{Global}: All O*NET feature derivatives.}     \label{app:onet_variable} \\      \small     \textbf{O*NET Feature} & \textbf{Feature Gradient}  \\  \midrule
Telecommunications (knowledge) & +0.167  \\ 
Clerical (knowledge) & +0.166  \\ 
Wrist-Finger Speed (ability) & +0.153  \\ 
Number Facility (ability) & +0.118  \\ 
Mathematics\_x (skill) & +0.093  \\ 
Depth Perception (ability) & +0.092  \\ 
Building and Construction (knowledge) & +0.090  \\ 
Mathematical Reasoning (ability) & +0.088  \\ 
Economics and Accounting (knowledge) & +0.085  \\ 
Control Precision (ability) & +0.082  \\ 
Response Orientation (ability) & +0.081  \\ 
Arm-Hand Steadiness (ability) & +0.077  \\ 
Sales and Marketing (knowledge) & +0.076  \\ 
Equipment Selection (skill) & +0.072  \\ 
Finger Dexterity (ability) & +0.067  \\ 
Perceptual Speed (ability) 