ONET Occupations Dataset
---

By Paul Duckworth 20th Sept 2017.

Create an ONET dataset of Occupations vs variables. 


Observational Data:
---

In [11]:
import os
import numpy as np
import pandas as pd
import getpass
import matplotlib.pyplot as plt
%matplotlib inline

datasets = '/home/'+ getpass.getuser() +'/Datasets/ONET/'
d = os.path.join(datasets, 'FoHC/FOH Occupations Tasks Features Technology.xlsx')
excel_doc = pd.ExcelFile(d)
matts_data = excel_doc.parse("Title, Tasks, Features").fillna("-")

matts_data.rename(columns = {'Occupation title':'Observed Occupation'}, inplace = True)
matts_data

FileNotFoundError: [Errno 2] No such file or directory: '/home/scpd/Datasets/ONET/FoHC/FOH Occupations Tasks Features Technology.xlsx'

## ONET occupation data:

Original occupation dataset only have SOC code, Title and description. 


In [None]:
occupations = pd.read_table(os.path.join(datasets, 'databases/db2016/Occupation Data.txt'), sep='\t')
occupations.rename(columns = {'Title':'O*NET Occupation title'}, inplace = True)

print(occupations.shape)
occupations.head()

# print occupations[occupations['O*NET-SOC Code'] == '43-6014.00']
# print occupations[occupations['O*NET-SOC Code'] == '43-6011.00']
# print occupations[occupations['O*NET-SOC Code'] == '43-9199.00']

# man_soc = '11-9111.00'



## Map Observed Occupations to ONET Occupation Codes

Can we define a mixture? i.e. Observed job is a weighted combination of ONET Occupations. 


In [None]:
# occupations_observed = {'Administrator' :  'Medical Records and Health Information Technician',
#     'General Practitioner':    'Family and General Practitioners',
#     'Healthcare Assistant':    ['Medical Assistants', 'Nursing Assistants'],
#     'Pharmacy technician':     'Pharmacy Technicians',
#     'Phlebotomist':            'Phlebotomists',
#     'Practice manager':        'Medical and Health Services Managers',
#     'Deputy Practice Manager': 'Medical and Health Services Managers',
#     'Practice nurse':          ['Nurse Practitioners', 'Registered Nurses'],
#     'Receptionist':            'Receptionists and Information Clerks',
#     'Scanning Clerk':          'Information and Record Clerks, All Other',
#     'Secretary':               'Medical Secretaries'}

# list_of_onet_occs = [item for sublist in occupations_mapping.keys() for item in sublist]

occupations_mapping = {'Medical Secretaries' :                      'Secretary',    
                       'Secretaries and Administrative Assistants' : 'Secretary',    
                       
                       'Family and General Practitioners':          'General Practitioner',
                       'Health Diagnosing and Treating Practitioners' : 'General Practitioner',
                       'Healthcare Practitioners and Technical Workers' : 'General Practitioner',
                       
                       'Medical Assistants':                        'Healthcare Assistant',
                       'Nursing Assistants':                        'Healthcare Assistant',
                       'Physician Assistants':                      'Healthcare Assistant',
                       'Surgical Assistants' :                      'Healthcare Assistant',
                       'Medical Appliance Technicians':             'Healthcare Assistant',
                       
                       'Pharmacy Technicians' :                     'Pharmacy technician',
                       
                           'Phlebotomist':                              'Phlebotomists',
                       
                       'Medical and Health Services Managers' :     'Practice Manager',
                       'Technical Directors/Managers' :             'Deputy Practice Manager',
                       
                       'Nurse Practitioners':                       'Practice nurse', 
                       'Registered Nurses':                         'Practice nurse',
                       'Acute Care Nurses':                         'Practice nurse',
                       'Licensed Practical and Licensed Vocational Nurses':  'Practice nurse',
                       'Critical Care Nurses':                      'Practice nurse',
                       
                       'Receptionists and Information Clerks' :     'Receptionist',
                       
                       'Information and Record Clerks, All Other' : 'Scanning Clerk',
                       'File Clerks':                               'Scanning Clerk',
                       'Office Clerks, General':                    'Scanning Clerk',
                       
                       'Medical Records and Health Information Technician': 'Administrator'
                       }

# Empty Frame
df = pd.DataFrame(columns=['Observed Occupation', 'O*NET-SOC Code', 'O*NET Occupation title', 'Description'])

# Add rows which contain occupations

for ind, (onet_occ, ob_occ) in enumerate(sorted(occupations_mapping.items())): 
    soc = occupations[occupations['O*NET Occupation title'].str.contains(onet_occ, na=False)]
    soc['Observed Occupation'] = ob_occ
    df = pd.concat([df, soc])

# df.index = [range(len(occupations_mapping))]

cols = ['Observed Occupation', 'O*NET Occupation title', 'O*NET-SOC Code','Description']
df = df[cols].rename(columns = {'Description':'O*NET Description'})
# df.to_csv('/home/paul/example.csv')
print df.shape
df

In [None]:
df = df.sort_values(by = 'Observed Occupation')

output_doc = os.path.join(datasets, 'FoHC/Observed_occu_mapping.csv')
df.to_csv(output_doc)

# output_doc = os.path.join(datasets, 'FoHC/all_onet_occs.csv')
# occupations.to_csv(output_doc)

In [None]:
matts_tasks = matts_data[['Observed Occupation', 'Task']]
df2 = pd.merge(df, matts_tasks,  how='left', left_on=['Observed Occupation'], right_on = ['Observed Occupation'])
cols = ['Observed Occupation', 'Task', 'O*NET Occupation title', 'O*NET-SOC Code','O*NET Description']
df2 = df2[cols]
df2.sort_values(by = 'Observed Occupation')
output_doc = os.path.join(datasets, 'FoHC/Observed_occu_mapping_withTasks.csv')
df2.to_csv(output_doc)

## Alternate Titles 

Somehow merge the other titles reported for each of the observed Occupation Titles

In [None]:
# # Alternate occupation titles per SOC:
# alt_titles = pd.read_table(os.path.join(datasets, 'databases/db2016/Alternate Titles.txt'), sep='\t')

# # # reported occupation titles 
# sample_titles = pd.read_table(os.path.join(datasets, 'databases/db2016/Sample of Reported Titles.txt'), sep='\t')

# for soc in df['O*NET-SOC Code']:
#     alt = alt_titles[ alt_titles["O*NET-SOC Code"].str.contains(soc) ]
#     print ">>> ", soc, alt 
#     for a in alt['Alternate Title']:
#         x = occupations[occupations['Occupation title'].str.contains(a, na=False)]

In [None]:
# Merging Matts to ONET based on Occupation doesn't work - more detail in the ONET occupation titles.
# How to merge? - assign SOC codes?  

# merged = pd.merge(data, admin, on='Occupation title', how='outer')
# merged.head() 

Merge Other ONET variables: 
---

In [None]:
pd.set_option('max_colwidth',1000)
tasks_all[tasks_all['Task ID']==827]

In [None]:
#Tasks per SOC:
tasks_all = pd.read_table(os.path.join(datasets, 'databases/db2016/Task Statements.txt'), sep='\t')
print "\nnumber of unique all Tasks = ", tasks_all["Task ID"].nunique()

tasks_red = tasks_all[ tasks_all['O*NET-SOC Code'].isin(df['O*NET-SOC Code'].values)]

vocabulary = dict(zip(tasks_red['Task ID'], tasks_red['Task']))
print "\nlength of vocabulary ", len(vocabulary.keys())
# print "\nvocabuary of task ids = ", vocabulary.keys()
print "\nan example: 777 ", vocabulary[777]
print "an example: 778 ", vocabulary[778]


variables = ['O*NET-SOC Code', 'Task ID']
tasks = tasks_red[variables]
tasks['bin'] = 1

pivot = tasks.pivot(index = 'O*NET-SOC Code', columns='Task ID', values='bin').fillna(0)
pivot

In [None]:
print "\nnumber of unique Tasks observed (for 12 occupations) = ", sum(pivot.sum(axis = 0)==1)

In [None]:
#Task DWAs (detailed work activitiy code):
taskDWA = pd.read_table(os.path.join(datasets, 'databases/db2016/Tasks to DWAs.txt'), sep='\t')
taskDWA = taskDWA[ taskDWA['O*NET-SOC Code'].isin(df['O*NET-SOC Code'].values)]
taskDWA[ taskDWA['O*NET-SOC Code'] == '43-6013.00']  # # Medical Secretaries

taskDWA[ taskDWA['DWA ID'] == '4.A.4.a.3.I03.D11']  
# taskDWA[ taskDWA['Task ID'].isin([744, 778])]  

In [None]:
#Task DWAs ratings:
task_rat = pd.read_table(os.path.join(datasets, 'databases/db2016/Task Ratings.txt'), sep='\t')
task_rat[ task_rat['Task ID'] == 777]    # # Answer telephones and direct calls

In [None]:
#Categorise Detailed Work Activities, 1-5 increasing frequency. 
task_cats = pd.read_table(os.path.join(datasets, 'databases/db2016/Task Categories.txt'), sep='\t')
task_cats

In [None]:
# tasks_green = pd.read_table(os.path.join(datasets, 'databases/db2016/Tasks to Green DWAs.txt'), sep='\t')
# tasks_green.head()

In [None]:
DWA = pd.read_table(os.path.join(datasets, 'databases/db2016/DWA Reference.txt'), sep='\t')
print "\nnumber of unique Detailed Work Activities = ", DWA["DWA ID"].nunique()
DWA = DWA[ DWA['DWA ID'].isin(taskDWA['DWA ID'].values)]   # restrict the DWA to the 12 SOCs observed 
print "number of unique DWA observed = ", DWA["DWA ID"].nunique()
DWA.head()

print "\nTask 777 falls under DWA: ", DWA[ DWA["DWA ID"] == "4.A.4.a.3.I03.D11"].iloc[0,3]

DWA[ DWA["DWA ID"] == "4.A.4.a.3.I03.D11"]


In [None]:
# Intermediate Work Activity and corresponding WA element ID. 
# Every IWA is linked to exactly one WA from the O*NET Content Model. 
# IWAs are linked to one or more DWAs; 

IWA = pd.read_table(os.path.join(datasets, 'databases/db2016/IWA Reference.txt'), sep='\t')
print "\nnumber of unique Intermediate Work Activities = ", IWA["IWA ID"].nunique()
IWA = IWA[ IWA['IWA ID'].isin(DWA['IWA ID'].values)]   # restrict the IWA to the 12 SOCs observed 
print "number of unique IWA observed = ", IWA["IWA ID"].nunique()
IWA.head()

print "\nTask 777 falls under IWA: ",  IWA[ IWA["IWA ID"] == "4.A.4.a.3.I03"].iloc[0,2]

IWA[ IWA["Element ID"] == "4.A.4.a.3"] 


In [None]:
# Work Analysis: Provide a mapping of SOC codes (occupations) to Work Activity ratings.
WA = pd.read_table(os.path.join(datasets, 'databases/db2016/Work Activities.txt'), sep='\t')
print "\nnumber of unique Work Activities = ", WA["Element ID"].nunique()

WA = WA[ WA['O*NET-SOC Code'].isin(df['O*NET-SOC Code'].values)]   # restrict the WA to the 12 SOCs observed 
print "number of unique WA observed = ", WA["Element ID"].nunique()
WA

print "\nMedical Secretary, WA 4.A.4.a.3 (task 777): ", 
WA[ (WA["Element ID"] == "4.A.4.a.3") & (WA["O*NET-SOC Code"] == '43-6013.00')] 
# WA[ WA["O*NET-SOC Code"] == '43-6013.00']    # Medical Secretaries


In [15]:
WA['Element_pivot'] =  WA['Element Name'] + " " +  WA['Scale ID']
WA_pivot = WA.pivot(index = 'O*NET-SOC Code', columns='Element_pivot', values='Data Value').fillna(0)
WA_pivot.reset_index(inplace=True)

WA_pivot

Element_pivot,O*NET-SOC Code,Analyzing Data or Information IM,Analyzing Data or Information LV,Assisting and Caring for Others IM,Assisting and Caring for Others LV,Coaching and Developing Others IM,Coaching and Developing Others LV,Communicating with Persons Outside Organization IM,Communicating with Persons Outside Organization LV,"Communicating with Supervisors, Peers, or Subordinates IM",...,Selling or Influencing Others IM,Selling or Influencing Others LV,Staffing Organizational Units IM,Staffing Organizational Units LV,Thinking Creatively IM,Thinking Creatively LV,Training and Teaching Others IM,Training and Teaching Others LV,Updating and Using Relevant Knowledge IM,Updating and Using Relevant Knowledge LV
0,11-9111.00,4.16,5.0,2.92,3.04,4.19,5.35,3.46,4.04,4.58,...,2.27,2.54,3.88,4.92,3.52,4.4,3.88,4.54,4.32,5.68
1,27-2012.05,3.27,3.17,3.52,3.29,3.37,3.57,4.08,4.39,4.4,...,3.26,3.33,2.3,2.04,4.08,4.79,3.17,3.03,4.35,5.21
2,29-1062.00,3.73,4.42,4.44,5.58,2.84,2.99,3.23,3.7,3.95,...,2.51,2.36,1.57,1.5,3.11,3.77,3.21,3.76,4.32,5.32
3,29-1071.00,4.35,4.75,4.85,6.35,2.75,3.3,3.55,3.75,4.55,...,2.75,2.6,2.0,1.8,3.4,4.0,3.45,3.7,4.65,5.8
4,29-1141.00,3.19,3.36,4.85,5.53,3.37,3.99,3.52,3.1,4.44,...,1.7,1.27,2.12,1.64,3.22,3.69,3.94,3.91,4.55,5.44
5,29-1141.01,3.48,4.37,4.78,6.48,3.65,4.88,2.96,3.15,4.42,...,2.37,2.52,3.19,3.46,3.26,3.74,3.85,4.3,4.11,5.15
6,29-1141.03,3.68,4.25,4.75,6.2,3.75,4.65,2.95,2.8,4.26,...,2.47,2.35,2.75,2.8,3.05,3.3,3.9,4.45,4.3,5.05
7,29-1171.00,4.03,4.87,4.9,6.4,3.77,4.87,3.7,4.3,4.47,...,2.87,3.0,2.37,3.14,3.33,4.1,3.93,4.69,4.73,6.0
8,29-2052.00,3.29,3.36,3.89,4.48,2.81,2.96,3.34,3.06,3.94,...,2.63,2.37,2.2,1.92,2.88,2.95,3.08,3.1,3.74,4.59
9,29-2061.00,3.49,3.78,4.86,6.19,3.63,4.47,3.66,4.02,4.71,...,2.43,2.06,2.67,2.91,3.21,3.29,3.94,3.98,4.27,5.5


In [21]:
# "Medical Records and Health Information Technicians Bright Outlook" Tasks:

# tasks = pd.read_table(os.path.join(datasets, 'databases/db2016/Task Statements.txt'), sep='\t')
# tasks[ tasks['O*NET-SOC Code'] == '29-2071.00']


In [16]:
#Education: 
edu = pd.read_table(os.path.join(datasets, 'databases/db2016/Education, Training, and Experience.txt'), sep='\t')
edu = edu[edu['Data Value'] != 0.0]
# edu.head()

# edu[ edu['O*NET-SOC Code'] == '29-2071.00']

In [17]:
#Technoogy:
tech = pd.read_table(os.path.join(datasets, 'databases/db2016/Tools and Technology.txt'), sep='\t')
# tech[tech['Hot Technology'] == 'Y'].head()
# tech.head()

# tech[ tech['O*NET-SOC Code'] == '43-6013.00']  # # Medical Records and Health Information Technicians


In [10]:
#Skills:
skills = pd.read_table(os.path.join(datasets, 'databases/db2016/Skills.txt'), sep='\t')
print("\nnumber of unique Skills = ", skills["Element ID"].nunique())
skills = skills[ skills['O*NET-SOC Code'].isin(df['O*NET-SOC Code'].values)]

#  skills[ skills['O*NET-SOC Code'] == '43-6013.00']

skills['Element_pivot'] =  skills['Element Name'] + " " +  skills['Scale ID']
skills



number of unique Skills =  35


NameError: name 'df' is not defined

In [25]:
# WA[ (WA["Element ID"] == "2.A.1.a")

# print skills['Element Name'].unique()
# print len(skills['Element Name'].unique())
# skills[ skills['O*NET-SOC Code'] == '43-4199.00']    # Not in Skills dataset 

In [19]:
skills_pivot = skills.pivot(index = 'O*NET-SOC Code', columns='Element_pivot', values='Data Value').fillna(0)
skills_pivot.reset_index(inplace=True)
skills_pivot

Element_pivot,O*NET-SOC Code,Active Learning IM,Active Learning LV,Active Listening IM,Active Listening LV,Complex Problem Solving IM,Complex Problem Solving LV,Coordination IM,Coordination LV,Critical Thinking IM,...,Systems Evaluation IM,Systems Evaluation LV,Technology Design IM,Technology Design LV,Time Management IM,Time Management LV,Troubleshooting IM,Troubleshooting LV,Writing IM,Writing LV
0,11-9111.00,3.88,4.0,4.0,4.0,3.75,3.38,4.0,4.25,4.12,...,3.75,3.5,2.0,1.12,4.0,4.0,1.25,0.25,4.0,4.0
1,27-2012.05,3.38,3.38,3.88,4.12,3.5,3.5,4.0,4.38,3.75,...,3.25,3.38,1.75,1.5,3.62,3.88,2.12,2.0,3.5,3.62
2,29-1062.00,3.75,4.0,4.25,4.75,4.0,4.12,3.25,4.0,4.38,...,2.88,3.38,1.25,0.38,3.62,3.88,1.5,0.62,4.12,4.25
3,29-1071.00,3.62,4.12,4.12,4.38,3.75,3.62,3.38,4.0,4.12,...,3.25,3.5,1.75,0.88,3.12,3.62,1.88,1.0,3.62,4.12
4,29-1141.00,3.38,3.75,4.12,4.0,3.38,3.25,3.88,4.12,3.88,...,2.88,3.25,1.75,0.88,3.0,3.38,1.75,1.5,3.62,3.62
5,29-1141.01,3.75,3.88,4.0,3.88,3.75,3.75,3.75,3.88,4.0,...,3.0,3.25,2.0,1.12,3.12,3.25,2.0,1.62,3.5,3.62
6,29-1141.03,3.88,4.0,4.12,4.0,3.62,3.38,3.88,4.0,4.0,...,2.75,3.0,1.88,1.0,3.25,3.75,2.25,2.12,3.25,3.88
7,29-1171.00,4.12,4.12,4.12,4.5,4.0,4.0,3.75,3.62,4.12,...,3.25,3.25,1.75,0.88,3.38,3.75,1.75,1.38,3.75,4.0
8,29-2052.00,2.88,3.0,3.88,3.88,3.0,2.62,3.0,3.0,3.25,...,2.12,1.88,1.62,0.62,3.12,2.75,1.75,0.88,2.88,2.88
9,29-2061.00,3.38,3.12,4.0,3.88,3.25,3.12,4.0,3.88,3.88,...,2.62,2.75,1.88,1.0,4.0,3.38,1.88,1.38,3.25,3.25


In [20]:
# result = pd.concat([df, skills_pivot], axis=1, join_axes=[df['O*NET-SOC Code']]) 

df_skills = pd.merge(df, skills_pivot,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])
df_skills # [result['O*NET-SOC Code'] == '29-1062.00']

occupation_level_skills_wa = pd.merge(df_skills, WA_pivot,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])
occupation_level_skills_wa = occupation_level_skills_wa.sort_values(by = 'Observed Occupation')
occupation_level_skills_wa

Unnamed: 0,Observed Occupation,O*NET Occupation title,O*NET-SOC Code,O*NET Description,Active Learning IM,Active Learning LV,Active Listening IM,Active Listening LV,Complex Problem Solving IM,Complex Problem Solving LV,...,Selling or Influencing Others IM,Selling or Influencing Others LV,Staffing Organizational Units IM,Staffing Organizational Units LV,Thinking Creatively IM,Thinking Creatively LV,Training and Teaching Others IM,Training and Teaching Others LV,Updating and Using Relevant Knowledge IM,Updating and Using Relevant Knowledge LV
0,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",2.5,2.5,3.0,3.12,2.75,2.62,...,1.29,0.41,1.45,0.8,2.85,3.03,2.53,1.9,4.11,5.02
1,Deputy Practice Manager,Technical Directors/Managers,27-2012.05,Coordinate activities of technical departments...,3.38,3.38,3.88,4.12,3.5,3.5,...,3.26,3.33,2.3,2.04,4.08,4.79,3.17,3.03,4.35,5.21
2,General Practitioner,Family and General Practitioners,29-1062.00,"Physicians who diagnose, treat, and help preve...",3.75,4.0,4.25,4.75,4.0,4.12,...,2.51,2.36,1.57,1.5,3.11,3.77,3.21,3.76,4.32,5.32
3,General Practitioner,"Health Diagnosing and Treating Practitioners, ...",29-1199.00,All health diagnosing and treating practitione...,,,,,,,...,,,,,,,,,,
4,General Practitioner,Healthcare Practitioners and Technical Workers...,29-9099.00,All healthcare practitioners and technical wor...,,,,,,,...,,,,,,,,,,
5,Healthcare Assistant,Physician Assistants,29-1071.00,Provide healthcare services typically performe...,3.62,4.12,4.12,4.38,3.75,3.62,...,2.75,2.6,2.0,1.8,3.4,4.0,3.45,3.7,4.65,5.8
6,Healthcare Assistant,Medical Appliance Technicians,51-9082.00,"Construct, fit, maintain, or repair medical su...",3.0,3.0,3.88,3.62,3.38,3.0,...,1.85,1.58,1.72,1.34,3.58,4.16,3.08,3.44,3.3,4.04
7,Healthcare Assistant,Medical Assistants,31-9092.00,Perform administrative and certain clinical du...,3.25,3.38,3.88,3.88,2.75,3.0,...,2.53,2.38,2.08,1.77,3.05,3.6,3.53,3.55,4.1,5.08
8,Healthcare Assistant,Surgical Assistants,29-2099.07,Assist surgeons during surgery by performing d...,3.0,3.12,4.0,3.88,3.25,3.12,...,2.54,2.42,1.9,1.33,3.35,3.86,3.61,4.43,4.27,5.49
9,Healthcare Assistant,Nursing Assistants,31-1014.00,Provide basic patient care under direction of ...,2.5,2.5,3.62,3.0,2.75,2.38,...,2.25,1.13,2.09,0.98,2.8,2.5,3.66,2.93,3.12,3.34


In [21]:
output_doc = os.path.join(datasets, 'FoHC/Observed_occ_onet_data.csv')
occupation_level_skills_wa.to_csv(output_doc)

# Task or Work Activity Level dataset

In [23]:
IWA.head()

Unnamed: 0,Element ID,IWA ID,IWA Title
1,4.A.1.a.1,4.A.1.a.1.I02,Read documents or materials to inform work pro...
3,4.A.1.a.1,4.A.1.a.1.I04,Gather information from physical or electronic...
11,4.A.1.a.1,4.A.1.a.1.I12,Collect information about patients or clients.
18,4.A.1.a.1,4.A.1.a.1.I19,Research healthcare issues.
22,4.A.1.a.1,4.A.1.a.1.I23,Interview people to obtain information.


In [24]:
tasks2 = pd.read_table(os.path.join(datasets, 'databases/db2016/Task Statements.txt'), sep='\t')
tasks2 = tasks2[ tasks2['O*NET-SOC Code'].isin(df['O*NET-SOC Code'].values)]
tasks2

Unnamed: 0,O*NET-SOC Code,Task ID,Task,Task Type,Incumbents Responding,Date,Domain Source
910,11-9111.00,49,Develop and maintain computerized record manag...,Core,26,07/2016,Occupational Expert
911,11-9111.00,46,"Direct, supervise and evaluate work activities...",Core,26,07/2016,Occupational Expert
912,11-9111.00,48,"Direct or conduct recruitment, hiring and trai...",Core,26,07/2016,Occupational Expert
913,11-9111.00,50,Develop and implement organizational policies ...,Core,26,07/2016,Occupational Expert
914,11-9111.00,51,"Conduct and administer fiscal operations, incl...",Core,26,07/2016,Occupational Expert
915,11-9111.00,55,"Maintain awareness of advances in medicine, co...",Core,26,07/2016,Occupational Expert
916,11-9111.00,58,"Plan, implement and administer programs and se...",Core,26,07/2016,Occupational Expert
917,11-9111.00,57,Prepare activity reports to inform management ...,Core,26,07/2016,Occupational Expert
918,11-9111.00,52,Establish work schedules and assignments for s...,Core,26,07/2016,Occupational Expert
919,11-9111.00,53,Maintain communication between governing board...,Core,26,07/2016,Occupational Expert


In [46]:
# # Dont do this: 

# task_pivot = task2.pivot(index = 'O*NET-SOC Code', columns='Task ID', values='Task ID')
# task_pivot.reset_index(inplace=True)
# task_pivot

## Try to obtain GT from Logan's survey on Future of Employment

In [31]:
d = os.path.join(datasets, 'FoEmployment/tasks_and_ratings.csv')
survey_input_data = pd.read_csv(d)
survey_input_data.rename(columns = {'title':'O*NET Occupation title'}, inplace = True)
survey_input_data

Unnamed: 0,O*NET-SOC Code,O*NET Occupation title,task,value,added,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,11-1011.00,Chief Executives,Direct or coordinate an organization's financi...,4.54,1,,,,
1,11-1011.00,Chief Executives,Appoint department heads or managers and assig...,4.48,1,,,,
2,11-1011.00,Chief Executives,Analyze operations to evaluate performance of ...,4.40,1,,,,
3,11-1011.00,Chief Executives,"Direct, plan, or implement policies, objective...",4.39,1,,,,
4,11-1011.00,Chief Executives,"Prepare budgets for approval, including those ...",4.17,1,,,,
5,11-3071.01,Transportation Managers,"Plan, organize, or manage the work of subordin...",4.29,1,,,,
6,11-3071.01,Transportation Managers,"Direct activities related to dispatching, rout...",4.26,1,,,,
7,11-3071.01,Transportation Managers,Monitor operations to ensure that staff member...,4.05,1,,,,
8,11-3071.01,Transportation Managers,Serve as contact persons for all workers withi...,4.04,1,,,,
9,11-3071.01,Transportation Managers,Implement schedule or policy changes for trans...,4.00,1,,,,


In [32]:
keep_cols = ['O*NET-SOC Code', 'O*NET Occupation title', 'task']
survey_input_data = survey_input_data[keep_cols]

survey_input_data_reduced = survey_input_data[ survey_input_data['O*NET-SOC Code'].isin(df['O*NET-SOC Code'].values)]
survey_input_data_reduced


Unnamed: 0,O*NET-SOC Code,O*NET Occupation title,task
145,29-2052.00,Pharmacy Technicians,Receive written prescription or refill request...
146,29-2052.00,Pharmacy Technicians,"Prepack bulk medicines, fill bottles with pres..."
147,29-2052.00,Pharmacy Technicians,"Answer telephones, responding to questions or ..."
148,29-2052.00,Pharmacy Technicians,Maintain proper storage and security condition...
149,29-2052.00,Pharmacy Technicians,Assist customers by answering simple questions...


In [33]:
# Compare survey tasks to Tasks database
survey_input_data[ survey_input_data['task'].isin(tasks_red['Task'].values)]

Unnamed: 0,O*NET-SOC Code,O*NET Occupation title,task
132,29-1063.00,"Internists, General",Explain procedures and discuss test results or...
145,29-2052.00,Pharmacy Technicians,Receive written prescription or refill request...
146,29-2052.00,Pharmacy Technicians,"Prepack bulk medicines, fill bottles with pres..."
147,29-2052.00,Pharmacy Technicians,"Answer telephones, responding to questions or ..."
148,29-2052.00,Pharmacy Technicians,Maintain proper storage and security condition...
149,29-2052.00,Pharmacy Technicians,Assist customers by answering simple questions...


## Try using Frey Obsborne paper Training set

In [38]:
d = os.path.join(datasets, 'FoEmployment/frey_osborne_data.xlsx')
excel_doc = pd.ExcelFile(d)
frey_osb_data = excel_doc.parse("Sheet1").fillna("-")

# Remove final 2 dps in SOC Code:
keep_codes = [code[:-3] for code in df['O*NET-SOC Code'].values]
print df['O*NET-SOC Code'].values
# print keep_codes

print "\nnumber overlapping occu codes =", frey_osb_data[frey_osb_data['BLS codes'].isin(keep_codes)].shape[0]
frey_osb_data[frey_osb_data['BLS codes'].isin(keep_codes)]


['29-2071.00' '27-2012.05' '29-1062.00' '29-1199.00' '29-9099.00'
 '29-1071.00' '51-9082.00' '31-9092.00' '29-2099.07' '31-1014.00'
 '29-2052.00' '31-9097.00' '11-9111.00' '29-1141.00' '29-1141.01'
 '29-2061.00' '29-1141.03' '29-1171.00' '43-4171.00' '43-9061.00'
 '43-4199.00' '43-4071.00' '43-6014.00' '43-6013.00']

number overlapping occu codes =  14


Unnamed: 0,BLS codes,Occupation Name,Probability of Computerisation,Category Label,Training set automatable labels
36,11-9111,Medical and Health Services Managers,0.007318,1,-
85,29-1199,Health Diagnosing and Treating Practitioners A...,0.020303,4,-
92,27-2012,Producers and Directors,0.021637,3,-
144,29-2061,Licensed Practical and Licensed Vocational Nurses,0.057824,4,-
190,29-1071,Physician Assistants,0.14484,4,-
239,31-9092,Medical Assistants,0.3003,5,-
284,51-9082,Medical Appliance Technicians,0.44629,11,-
447,43-6013,Medical Secretaries,0.81463,7,-
562,29-2052,Pharmacy Technicians,0.91717,4,-
550,29-2071,Medical Records and Health Information Technic...,0.91004,4,-


# Mapping Observed Tasks to ONET Tasks

In [36]:
tasks_red

Unnamed: 0,O*NET-SOC Code,Task ID,Task,Task Type,Incumbents Responding,Date,Domain Source
910,11-9111.00,49,Develop and maintain computerized record manag...,Core,26,07/2016,Occupational Expert
911,11-9111.00,46,"Direct, supervise and evaluate work activities...",Core,26,07/2016,Occupational Expert
912,11-9111.00,48,"Direct or conduct recruitment, hiring and trai...",Core,26,07/2016,Occupational Expert
913,11-9111.00,50,Develop and implement organizational policies ...,Core,26,07/2016,Occupational Expert
914,11-9111.00,51,"Conduct and administer fiscal operations, incl...",Core,26,07/2016,Occupational Expert
915,11-9111.00,55,"Maintain awareness of advances in medicine, co...",Core,26,07/2016,Occupational Expert
916,11-9111.00,58,"Plan, implement and administer programs and se...",Core,26,07/2016,Occupational Expert
917,11-9111.00,57,Prepare activity reports to inform management ...,Core,26,07/2016,Occupational Expert
918,11-9111.00,52,Establish work schedules and assignments for s...,Core,26,07/2016,Occupational Expert
919,11-9111.00,53,Maintain communication between governing board...,Core,26,07/2016,Occupational Expert


In [111]:
df  # reduced set of occupations 
tasks_red  # tasks belonging to those occupations

df_tasks = pd.merge(df, tasks_red,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])
df_tasks = df_tasks.sort_values(by = 'Observed Occupation')

cols = ['Observed Occupation', 'Task ID', 'Task', 'O*NET Occupation title', 'O*NET-SOC Code', 'O*NET Description']
df_tasks = df_tasks[cols]
df_tasks

Unnamed: 0,Observed Occupation,Task ID,Task,O*NET Occupation title,O*NET-SOC Code,O*NET Description
0,Administrator,530.0,Protect the security of medical records to ens...,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records..."
19,Administrator,549.0,Develop in-service educational materials.,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records..."
18,Administrator,547.0,Consult classification manuals to locate infor...,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records..."
17,Administrator,544.0,Process and prepare business or government forms.,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records..."
16,Administrator,541.0,Train medical records staff.,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records..."
15,Administrator,548.0,Compile medical care and census data for stati...,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records..."
14,Administrator,537.0,Manage the department or supervise clerical wo...,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records..."
12,Administrator,543.0,Post medical insurance billings.,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records..."
11,Administrator,546.0,"Prepare statistical reports, narrative reports...",Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records..."
10,Administrator,536.0,"Plan, develop, maintain, or operate a variety ...",Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records..."


In [34]:
df_tasks.sort_values(by = 'Observed Occupation')
output_doc = os.path.join(datasets, 'FoHC/Observed_task_mappings.csv')
df_tasks.to_csv(output_doc)


In [56]:
len(matts_data['Task'].unique())
# matts_tasks

# Match these to ONET Task IDs or to DWA's 

#Task DWAs (detailed work activitiy code):
taskDWA = pd.read_table(os.path.join(datasets, 'databases/db2016/Tasks to DWAs.txt'), sep='\t')
taskDWA = taskDWA[['Task ID', 'DWA ID']]

print taskDWA.shape, "UNIQUE DWA: ", len(taskDWA['DWA ID'].unique()) 
df_tasks_dwa = pd.merge(df_tasks, taskDWA, how='left', left_on=['Task ID'], right_on = ['Task ID'])
df_tasks_dwa



(22838, 2) UNIQUE DWA:  2070


Unnamed: 0,Observed Occupation,O*NET Occupation title,O*NET-SOC Code,O*NET Description,Task ID,Task,Task Type,Incumbents Responding,Date,Domain Source,DWA ID
0,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",530,Protect the security of medical records to ens...,Core,141,07/2013,Incumbent,
1,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",549,Develop in-service educational materials.,Supplemental,140,07/2013,Incumbent,4.A.3.b.6.I12.D06
2,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",547,Consult classification manuals to locate infor...,Supplemental,141,07/2013,Incumbent,4.A.2.b.3.I01.D12
3,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",544,Process and prepare business or government forms.,Supplemental,141,07/2013,Incumbent,4.A.3.b.6.I06.D01
4,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",544,Process and prepare business or government forms.,Supplemental,141,07/2013,Incumbent,4.A.4.c.1.I01.D03
5,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",541,Train medical records staff.,Supplemental,140,07/2013,Incumbent,4.A.4.b.3.I06.D11
6,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",548,Compile medical care and census data for stati...,Supplemental,140,07/2013,Incumbent,
7,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",537,Manage the department or supervise clerical wo...,Supplemental,141,07/2013,Incumbent,4.A.4.b.4.I01.D11
8,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",537,Manage the department or supervise clerical wo...,Supplemental,141,07/2013,Incumbent,4.A.4.b.4.I12.D38
9,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",543,Post medical insurance billings.,Supplemental,142,07/2013,Incumbent,4.A.4.c.1.I03.D03


In [112]:

## ADD DWA and IWA titles:
DWAref = pd.read_table(os.path.join(datasets, 'databases/db2016/DWA Reference.txt'), sep='\t')[['DWA ID', 'DWA Title']]
df_tasks_dwa2 = pd.merge(df_tasks_dwa, DWAref,  how='left', left_on=['DWA ID'], right_on = ['DWA ID'])
df_tasks_dwa2
# taskDWA[taskDWA['Task ID'] == 530]



Unnamed: 0,Observed Occupation,O*NET Occupation title,O*NET-SOC Code,O*NET Description,Task ID,Task,Task Type,Incumbents Responding,Date,Domain Source,DWA ID,DWA Title
0,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",530,Protect the security of medical records to ens...,Core,141,07/2013,Incumbent,,
1,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",549,Develop in-service educational materials.,Supplemental,140,07/2013,Incumbent,4.A.3.b.6.I12.D06,Prepare healthcare training materials.
2,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",547,Consult classification manuals to locate infor...,Supplemental,141,07/2013,Incumbent,4.A.2.b.3.I01.D12,Maintain medical or professional knowledge.
3,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",544,Process and prepare business or government forms.,Supplemental,141,07/2013,Incumbent,4.A.3.b.6.I06.D01,Prepare official health documents or records.
4,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",544,Process and prepare business or government forms.,Supplemental,141,07/2013,Incumbent,4.A.4.c.1.I01.D03,Process healthcare paperwork.
5,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",541,Train medical records staff.,Supplemental,140,07/2013,Incumbent,4.A.4.b.3.I06.D11,Train caregivers or other non-medical personnel.
6,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",548,Compile medical care and census data for stati...,Supplemental,140,07/2013,Incumbent,,
7,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",537,Manage the department or supervise clerical wo...,Supplemental,141,07/2013,Incumbent,4.A.4.b.4.I01.D11,Supervise medical support personnel.
8,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",537,Manage the department or supervise clerical wo...,Supplemental,141,07/2013,Incumbent,4.A.4.b.4.I12.D38,Manage healthcare operations.
9,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",543,Post medical insurance billings.,Supplemental,142,07/2013,Incumbent,4.A.4.c.1.I03.D03,Process medical billing information.


In [113]:
cols = ['Observed Occupation', 'Task ID', 'Task',  'DWA ID', 'DWA Title', 'O*NET Occupation title', 'O*NET-SOC Code', 'O*NET Description']
df_tasks.sort_values(by = ['Observed Occupation', 'Task ID'])
output_doc = os.path.join(datasets, 'FoHC/Observed_task_mappings_withDWA.csv')
df_tasks_dwa2[cols].to_csv(output_doc)

In [115]:
df_tasks_dwa3 = df_tasks_dwa2.drop_duplicates(subset=['DWA ID'], keep='first', inplace=False)
df_tasks_dwa3

Unnamed: 0,Observed Occupation,O*NET Occupation title,O*NET-SOC Code,O*NET Description,Task ID,Task,Task Type,Incumbents Responding,Date,Domain Source,DWA ID,DWA Title
0,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",530,Protect the security of medical records to ens...,Core,141,07/2013,Incumbent,,
1,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",549,Develop in-service educational materials.,Supplemental,140,07/2013,Incumbent,4.A.3.b.6.I12.D06,Prepare healthcare training materials.
2,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",547,Consult classification manuals to locate infor...,Supplemental,141,07/2013,Incumbent,4.A.2.b.3.I01.D12,Maintain medical or professional knowledge.
3,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",544,Process and prepare business or government forms.,Supplemental,141,07/2013,Incumbent,4.A.3.b.6.I06.D01,Prepare official health documents or records.
4,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",544,Process and prepare business or government forms.,Supplemental,141,07/2013,Incumbent,4.A.4.c.1.I01.D03,Process healthcare paperwork.
5,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",541,Train medical records staff.,Supplemental,140,07/2013,Incumbent,4.A.4.b.3.I06.D11,Train caregivers or other non-medical personnel.
7,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",537,Manage the department or supervise clerical wo...,Supplemental,141,07/2013,Incumbent,4.A.4.b.4.I01.D11,Supervise medical support personnel.
8,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",537,Manage the department or supervise clerical wo...,Supplemental,141,07/2013,Incumbent,4.A.4.b.4.I12.D38,Manage healthcare operations.
9,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",543,Post medical insurance billings.,Supplemental,142,07/2013,Incumbent,4.A.4.c.1.I03.D03,Process medical billing information.
10,Administrator,Medical Records and Health Information Technic...,29-2071.00,"Compile, process, and maintain medical records...",546,"Prepare statistical reports, narrative reports...",Supplemental,141,07/2013,Incumbent,4.A.3.b.6.I03.D10,Present medical research reports.


In [128]:
# # All DWAs for Medical Occupations
cols = ['DWA ID', 'DWA Title']
just_medic_dwas = df_tasks_dwa3.sort_values(by = ['DWA ID']).reset_index()[cols]


# All the medical tasks that are performed by those DWAs
df_tasks_dwa4 = df_tasks_dwa2.drop_duplicates(subset=['DWA ID', 'Task ID'], keep='first', inplace=False)    
cols = ['DWA ID', 'DWA Title', 'Task', 'Task ID']
tasks_for_those_dwas = df_tasks_dwa4.sort_values(by = ['DWA ID']).reset_index()[cols]


# All tasks that are performed by those DWAs
all_dwa_tasks = taskDWA[taskDWA['DWA ID'].isin(just_medic_dwas['DWA ID'])].sort_values(by = ['DWA ID'])

df_tasks_dwa4 = pd.merge(all_dwa_tasks, DWAref, how='left', left_on=['DWA ID'], right_on = ['DWA ID'])

# Fix this: 
test = pd.merge(df_tasks_dwa4, df_tasks, how='left', left_on=['Task ID'], right_on = ['Task ID'])
test.drop_duplicates(subset=['Observed Occupation', 'Task ID'], keep='first', inplace=False)  

Unnamed: 0,Task ID,DWA ID,DWA Title,Observed Occupation,Task,O*NET Occupation title,O*NET-SOC Code,O*NET Description
0,11273,4.A.1.a.1.I02.D01,Read materials to determine needed actions.,,,,,
1,2599,4.A.1.a.1.I02.D01,Read materials to determine needed actions.,Scanning Clerk,Scan or read incoming materials to determine h...,File Clerks,43-4071.00,"File correspondence, cards, invoices, receipts..."
2,2772,4.A.1.a.1.I02.D01,Read materials to determine needed actions.,,,,,
3,12449,4.A.1.a.1.I02.D04,Read work orders or other instructions to dete...,,,,,
4,10065,4.A.1.a.1.I02.D04,Read work orders or other instructions to dete...,,,,,
5,12346,4.A.1.a.1.I02.D04,Read work orders or other instructions to dete...,,,,,
6,12655,4.A.1.a.1.I02.D04,Read work orders or other instructions to dete...,,,,,
7,14233,4.A.1.a.1.I02.D04,Read work orders or other instructions to dete...,,,,,
8,12022,4.A.1.a.1.I02.D04,Read work orders or other instructions to dete...,,,,,
9,14295,4.A.1.a.1.I02.D04,Read work orders or other instructions to dete...,,,,,


In [129]:
output_doc = os.path.join(datasets, 'FoHC/mapping_DWA.csv')
just_medic_dwas.to_csv(output_doc)

output_doc = os.path.join(datasets, 'FoHC/mapping_DWA_inc_tasks.csv')
tasks_for_those_dwas.to_csv(output_doc)
