Map ONET Skills onto Tasks via Occupations Dataset
---

By Paul Duckworth 17th Nov 2017.

Create a skills/abilities vector per (DWA) task from ONET datasets: Skills, Abilities, Occupations, Tasks, DWAs


In [47]:
import os
import numpy as np
import pandas as pd
import getpass
import cPickle as pickle
from random import shuffle
%matplotlib inline

# Point this at your ONET data: 
datasets = '/home/'+ getpass.getuser() +'/Datasets/'
print datasets

# survey_data_= pd.read_csv(os.path.join(datasets, 'FoEmployment/fow-expert-survey/data/cleaned/counts_data_with_metadata.csv'))
survey_data_ = pd.read_csv(os.path.join(datasets, 'FoEmployment/fow-expert-survey/data/cleaned/2018_1_12_cleaned.csv'))

# Investigate skills: 
# s = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Skills.txt'), sep='\t')
# s.rename(columns = {'Title':'Occupation title'}, inplace = True)
# s[s['O*NET-SOC Code'].apply(lambda x: x[-1] != "0")] #['O*NET-SOC Code'].unique()

/home/scpd/Datasets/


# ONET Datasets:

## Occupations and Tasks 

In [6]:
# Occupation table: one row per O*NET-SOC code with its title and description.
occupation_path = os.path.join(datasets, 'ONET/databases/db2016/Occupation Data.txt')
occupations = pd.read_table(occupation_path, sep='\t').rename(columns={'Title': 'Occupation title'})
occupations.head()

Unnamed: 0,O*NET-SOC Code,Occupation title,Description
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce or enact laws and statutes ..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."


In [7]:
tasks = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Task Statements.txt'), sep='\t')
tasks = tasks[['O*NET-SOC Code', 'Task ID', 'Task']]

reduce_tasks = tasks['O*NET-SOC Code'].unique()#[:2]
tasks = tasks[tasks['O*NET-SOC Code'].isin(reduce_tasks)]      # reduce the task matrix for now :)
print tasks.shape, "UNIQUE tasks: ", len(tasks['Task ID'].unique())  # All tasks are unique to Occupation it seems
tasks.head()

# Tasks do not overlap between ONET Codes which seem to be hierarchical. 
# tasks[tasks['O*NET-SOC Code'].isin(['11-1011.03','11-1011.00'])]

(19566, 3) UNIQUE tasks:  19566


Unnamed: 0,O*NET-SOC Code,Task ID,Task
0,11-1011.00,8823,Direct or coordinate an organization's financi...
1,11-1011.00,8831,Appoint department heads or managers and assig...
2,11-1011.00,8825,Analyze operations to evaluate performance of ...
3,11-1011.00,8826,"Direct, plan, or implement policies, objective..."
4,11-1011.00,8827,"Prepare budgets for approval, including those ..."


In [8]:
#Task DWAs (detailed work activitiy code):
taskDWA = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Tasks to DWAs.txt'), sep='\t')
taskDWA = taskDWA[['Task ID', 'DWA ID']]

print taskDWA.shape, "UNIQUE DWA: ", len(taskDWA['DWA ID'].unique()) 
taskDWA.head()

(22838, 2) UNIQUE DWA:  2070


Unnamed: 0,Task ID,DWA ID
0,20461,4.A.2.a.4.I09.D03
1,20461,4.A.4.b.6.I08.D04
2,8823,4.A.4.b.4.I09.D02
3,8824,4.A.4.a.2.I03.D14
4,8825,4.A.2.a.4.I07.D09


In [9]:
df = pd.merge(tasks, taskDWA,  how='left', left_on=['Task ID'], right_on = ['Task ID']).sort_values(by = 'Task ID')
df = df[df['DWA ID'].notnull()]
df['IWA ID'] = df['DWA ID'].str.slice(0,-4)    # create IWA ID
df['WA ID'] = df['DWA ID'].str.slice(0,-8)     # create WA ID

## ADD DWA and IWA titles:
DWAref = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/DWA Reference.txt'), sep='\t')[['DWA ID', 'DWA Title']]
taskDWA2 = pd.merge(df, DWAref,  how='left', left_on=['DWA ID'], right_on = ['DWA ID'])

IWAref = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/IWA Reference.txt'), sep='\t')[['IWA ID', 'IWA Title']]
df2 = pd.merge(df, IWAref,  how='left', left_on=['IWA ID'], right_on = ['IWA ID'])

# cols = ['Task ID', 'Task', 'DWA ID', 'DWA Title', 'IWA ID', 'IWA Title', 'WA ID']
cols = ['O*NET-SOC Code', 'Task ID', 'DWA ID', 'IWA ID', 'WA ID']
df2 = df2[cols]

# tasks are many-to-many with DWA, e.g. task id=8826.
print df2.shape, "UNIQUE DWA: ", len(df2['DWA ID'].unique())
df2.head()

(22838, 5) UNIQUE DWA:  2070


Unnamed: 0,O*NET-SOC Code,Task ID,DWA ID,IWA ID,WA ID
0,11-2022.00,1,4.A.4.a.8.I03.D05,4.A.4.a.8.I03,4.A.4.a.8
1,11-2022.00,2,4.A.1.a.1.I14.D04,4.A.1.a.1.I14,4.A.1.a.1
2,11-2022.00,3,4.A.4.b.4.I12.D03,4.A.4.b.4.I12,4.A.4.b.4
3,11-2022.00,4,4.A.2.b.4.I01.D06,4.A.2.b.4.I01,4.A.2.b.4
4,11-2022.00,5,4.A.2.a.4.I11.D06,4.A.2.a.4.I11,4.A.2.a.4


# Task Importance to a DWA (weight)

In [10]:
# Task Importance: (Each task is unique to it's occupation)
task_rates = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Task Ratings.txt'), sep='\t')
task_im = task_rates[task_rates['Scale ID'] == 'IM']
task_im.rename(columns = {'Data Value':'Task IM'}, inplace = True)

task_im = task_im[['Task ID', 'Task IM']].sort_values('Task ID')
print task_im.shape
df3 = pd.merge(df2, task_im, how='left', left_on=['Task ID'], right_on = ['Task ID'])
print df3.shape
# Remove Task if no Task Importance: 
df3 = df3[df3['Task IM'].notnull()]   
print df3.shape  # 500 missing Task IM 
df3

(19125, 2)
(22838, 6)
(22365, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


Unnamed: 0,O*NET-SOC Code,Task ID,DWA ID,IWA ID,WA ID,Task IM
0,11-2022.00,1,4.A.4.a.8.I03.D05,4.A.4.a.8.I03,4.A.4.a.8,4.09
1,11-2022.00,2,4.A.1.a.1.I14.D04,4.A.1.a.1.I14,4.A.1.a.1,3.57
2,11-2022.00,3,4.A.4.b.4.I12.D03,4.A.4.b.4.I12,4.A.4.b.4,4.22
3,11-2022.00,4,4.A.2.b.4.I01.D06,4.A.2.b.4.I01,4.A.2.b.4,3.61
4,11-2022.00,5,4.A.2.a.4.I11.D06,4.A.2.a.4.I11,4.A.2.a.4,3.95
5,11-2022.00,6,4.A.4.b.4.I12.D03,4.A.4.b.4.I12,4.A.4.b.4,3.48
6,11-2022.00,7,4.A.4.a.2.I03.D14,4.A.4.a.2.I03,4.A.4.a.2,3.25
7,11-2022.00,8,4.A.4.b.6.I05.D10,4.A.4.b.6.I05,4.A.4.b.6,3.26
8,11-2022.00,9,4.A.2.b.1.I03.D04,4.A.2.b.1.I03,4.A.2.b.1,3.59
9,11-2022.00,9,4.A.4.b.4.I09.D04,4.A.4.b.4.I09,4.A.4.b.4,3.59


In [52]:
by = ['DWA ID']
task_im_by_dwa = df3.groupby(by).sum().reset_index()[['DWA ID','Task IM']]
task_im_by_dwa.rename(columns = {'Task IM' : 'IM per DWA Sum'}, inplace = True)

df4_ = pd.merge(df3, task_im_by_dwa,  how='left', left_on=['DWA ID'], right_on = ['DWA ID'])
df4_['Task IM per DWA Weight'] = df4_['Task IM'] / df4_['IM per DWA Sum']

print df4_[df4_['DWA ID']=='4.A.4.a.8.I03.D05']['Task IM per DWA Weight'].sum() 
df4_.head()


1.0


Unnamed: 0,O*NET-SOC Code,Task ID,DWA ID,IWA ID,WA ID,Task IM,IM per DWA Sum,Task IM per DWA Weight
0,11-2022.00,1,4.A.4.a.8.I03.D05,4.A.4.a.8.I03,4.A.4.a.8,4.09,84.74,0.048265
1,11-2022.00,2,4.A.1.a.1.I14.D04,4.A.1.a.1.I14,4.A.1.a.1,3.57,33.92,0.105248
2,11-2022.00,3,4.A.4.b.4.I12.D03,4.A.4.b.4.I12,4.A.4.b.4,4.22,56.68,0.074453
3,11-2022.00,4,4.A.2.b.4.I01.D06,4.A.2.b.4.I01,4.A.2.b.4,3.61,28.76,0.125522
4,11-2022.00,5,4.A.2.a.4.I11.D06,4.A.2.a.4.I11,4.A.2.a.4,3.95,17.99,0.219566


# Occupation Importance to a Task (weight)

In [12]:
by = ['O*NET-SOC Code']
# Total task importance per occupation, computed from the same rows as the DWA sums.
task_im_by_occu = (df3.groupby(by)['Task IM'].sum()
                      .reset_index()
                      .rename(columns={'Task IM': 'IM per Occu Sum'}))

# Each row's share of its occupation's total task importance.
df4 = df4_.merge(task_im_by_occu, how='left', on='O*NET-SOC Code')
df4['Task IM per Occu Weight'] = df4['Task IM'] / df4['IM per Occu Sum']
df4.head()


Unnamed: 0,O*NET-SOC Code,Task ID,DWA ID,IWA ID,WA ID,Task IM,IM per DWA Sum,Task IM per DWA Weight,IM per Occu Sum,Task IM per Occu Weight
0,11-2022.00,1,4.A.4.a.8.I03.D05,4.A.4.a.8.I03,4.A.4.a.8,4.09,84.74,0.048265,68.03,0.060121
1,11-2022.00,2,4.A.1.a.1.I14.D04,4.A.1.a.1.I14,4.A.1.a.1,3.57,33.92,0.105248,68.03,0.052477
2,11-2022.00,3,4.A.4.b.4.I12.D03,4.A.4.b.4.I12,4.A.4.b.4,4.22,56.68,0.074453,68.03,0.062031
3,11-2022.00,4,4.A.2.b.4.I01.D06,4.A.2.b.4.I01,4.A.2.b.4,3.61,28.76,0.125522,68.03,0.053065
4,11-2022.00,5,4.A.2.a.4.I11.D06,4.A.2.a.4.I11,4.A.2.a.4,3.95,17.99,0.219566,68.03,0.058063


# Do Not: Weight by Number of Employees

In [13]:
# # Use only the .00 Occupation codes: 

# emp_data = pd.read_csv(os.path.join(datasets, 'ONET/employment_figures_including_doubles.csv'))
# employment_data = emp_data[emp_data['O*NET-SOC Code'].apply(lambda x: ".00" in x)]#['O*NET-SOC Code'].unique()

# df5 = pd.merge(df4, employment_data[["Employment", "O*NET-SOC Code"]],  how='right', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])
# print df4.shape, " vs ", df5.shape
# df5.head()


In [14]:
# emp_dwa = df5.groupby(['DWA ID']).sum().reset_index()[['DWA ID', 'Employment']]
# emp_dwa.rename(columns = {'Employment':'Employment per DWA Sum'}, inplace=True)

# df5_e = pd.merge(df5, emp_dwa,  how='left', left_on=['DWA ID'], right_on = ['DWA ID'])
# df5_e['Employment per DWA Norm'] = df5_e['Employment']/df5_e['Employment per DWA Sum']


## Skills by Occupations

In [15]:
skills = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Skills.txt'), sep='\t', low_memory=False)
skills.head()
skills = skills[skills['Scale ID'] == 'IM']
# skills['Element_pivot'] =  skills['Element Name'] + " " +  skills['Scale ID']
skills_pivot = skills.pivot(index = 'O*NET-SOC Code', columns='Element Name', values='Data Value').fillna(0)
skills_pivot.reset_index(inplace=True)
print skills_pivot.shape
skills_pivot.head()

(964, 36)


Element Name,O*NET-SOC Code,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,...,Science,Service Orientation,Social Perceptiveness,Speaking,Systems Analysis,Systems Evaluation,Technology Design,Time Management,Troubleshooting,Writing
0,11-1011.00,4.0,4.12,4.38,4.25,4.38,1.0,1.12,1.0,3.12,...,1.88,3.12,4.25,4.38,4.12,4.12,1.75,4.0,1.0,4.0
1,11-1011.03,3.5,3.88,4.0,3.62,4.0,1.0,1.12,1.0,3.25,...,1.75,3.25,3.75,4.0,3.62,3.62,1.62,3.38,1.12,3.88
2,11-1021.00,3.5,4.0,3.5,4.0,3.88,1.0,1.0,1.0,3.12,...,1.88,3.25,4.0,4.0,3.0,3.0,1.88,3.75,2.0,3.25
3,11-2011.00,3.25,4.0,3.5,3.5,3.75,1.0,1.25,1.0,2.88,...,1.5,3.12,4.0,4.0,3.12,3.0,1.62,3.88,1.12,3.75
4,11-2021.00,3.88,3.88,3.62,3.5,3.88,1.0,1.0,1.0,3.0,...,1.75,3.12,3.88,3.88,3.25,3.5,1.75,3.5,1.0,3.25


In [16]:
# WA['Element_pivot'] =  WA['Element Name'] + " " +  WA['Scale ID']
# WA_pivot = WA.pivot(index = 'O*NET-SOC Code', columns='Element_pivot', values='Data Value').fillna(0)
# WA_pivot.reset_index(inplace=True)

# WA_pivot

# occupation_level_skills_wa = pd.merge(df_skills, WA_pivot,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])
# occupation_level_skills_wa = occupation_level_skills_wa.sort_values(by = 'Observed Occupation')
# occupation_level_skills_wa

In [17]:
know = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Knowledge.txt'), sep='\t', low_memory=False)
know = know[know['Scale ID'] == 'IM']
# know['Element_pivot'] =  know['Element Name'] + " " +  know['Scale ID']
know_pivot = know.pivot(index = 'O*NET-SOC Code', columns='Element Name', values='Data Value').fillna(0)
know_pivot.reset_index(inplace=True)
print know_pivot.shape
know_pivot.head()

(964, 34)


Element Name,O*NET-SOC Code,Administration and Management,Biology,Building and Construction,Chemistry,Clerical,Communications and Media,Computers and Electronics,Customer and Personal Service,Design,...,Philosophy and Theology,Physics,Production and Processing,Psychology,Public Safety and Security,Sales and Marketing,Sociology and Anthropology,Telecommunications,Therapy and Counseling,Transportation
0,11-1011.00,4.75,1.34,2.11,1.43,2.66,2.7,2.23,4.09,2.05,...,1.7,1.23,2.63,3.1,3.3,3.23,2.61,1.76,2.0,2.21
1,11-1011.03,3.85,2.44,3.69,2.36,2.58,2.84,2.65,3.62,3.72,...,1.85,2.48,2.23,2.88,2.4,3.5,2.38,1.58,1.23,2.42
2,11-1021.00,4.35,1.5,2.51,1.95,3.51,2.59,3.33,3.95,2.38,...,1.51,1.81,3.39,2.72,3.1,3.47,1.81,2.5,1.77,2.2
3,11-2011.00,4.11,1.11,1.12,1.09,3.1,4.33,3.43,3.79,2.94,...,1.47,1.06,3.12,2.63,1.78,3.88,1.73,3.06,1.28,1.61
4,11-2021.00,4.04,1.1,1.19,1.21,3.01,3.8,3.51,3.85,3.15,...,1.64,1.18,2.46,2.45,2.5,4.85,2.03,2.86,1.63,1.68


In [18]:
abilities = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Abilities.txt'), sep='\t', low_memory=False)
abilities = abilities[abilities['Scale ID'] == 'IM']
#abilities['Element_pivot'] =  abilities['Element Name'] + " " +  abilities['Scale ID']
abilities_pivot = abilities.pivot(index = 'O*NET-SOC Code', columns='Element Name', values='Data Value').fillna(0)
abilities_pivot.reset_index(inplace=True)
print abilities_pivot.shape
abilities_pivot.head()


(964, 53)


Element Name,O*NET-SOC Code,Arm-Hand Steadiness,Auditory Attention,Category Flexibility,Control Precision,Deductive Reasoning,Depth Perception,Dynamic Flexibility,Dynamic Strength,Explosive Strength,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,11-1011.00,1.0,2.12,3.5,1.75,4.12,1.75,1.0,1.0,1.0,...,1.0,1.0,1.0,3.0,1.0,1.88,3.12,1.0,4.25,4.12
1,11-1011.03,1.0,1.88,3.38,1.75,4.0,2.0,1.0,1.0,1.0,...,1.0,1.0,1.0,2.62,1.12,2.0,2.75,1.12,4.0,3.88
2,11-1021.00,2.0,2.12,3.0,1.75,3.75,2.0,1.0,1.62,1.5,...,1.5,2.0,2.0,2.88,2.12,2.0,2.75,1.38,4.0,4.0
3,11-2011.00,1.88,1.88,3.38,1.5,3.88,1.88,1.0,1.0,1.0,...,1.0,1.0,1.0,2.75,1.25,2.88,3.0,1.25,3.88,3.88
4,11-2021.00,1.12,1.88,3.25,1.0,3.88,1.75,1.0,1.25,1.0,...,1.0,1.0,1.0,2.75,1.75,2.88,3.0,1.62,4.0,3.88



# Skills by Occupations and Tasks 

In [19]:
#access the occupation skills vector like this:
# skills_pivot[skills_pivot['O*NET-SOC Code'] == '11-1011.00'].values[0][1:]

## Weight Skills by Importance of Task and Frequency of Task

In [20]:
# # Do Not Normalise by Occupation (rather by DWA - above)
# task_im_sum = task_im.groupby('O*NET-SOC Code').sum().reset_index()
# task_im_sum.rename(columns = {'Task IM': 'IM Sum per Occu'}, inplace= True)
# task_im_sum = task_im_sum[['O*NET-SOC Code', 'IM Sum per Occu']]

# task_ims = pd.merge(task_im, task_im_sum,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])
# task_ims['Task IM Norm'] = task_ims['Task IM'] / task_ims['IM Sum per Occu'] 
# task_ims

In [21]:
# # Do Not Use Frequency: Bit hacky. 

# task_freq = task_rates[task_rates.loc[:,('Scale ID')] == 'FT']

# # Manually change Frequency Categories into Numeric value per Day ## A bit hacky :) 
# time_categories = {'1' : (1/365.),  # Yearly or less
#                    '2' : (4/365.),  # More than yearly
#                    '3' : (3/12.),   # More than monthly
#                    '4' : (3/7.),    # More than weekly
#                    '5' : 1.,            # Daily
#                    '6' : 3.,            # Several times daily
#                    '7' : 8.}            # Hourly or more

# # frequency is only relative:
# task_freq.loc[:,('Temp')] = [float(time_categories[i]) for i in task_freq.loc[:,('Category')].values]
# task_freq.loc[:,('Freq')] = task_freq.loc[:,('Temp')]*task_freq.loc[:,('Data Value')]

In [22]:
# by = ['O*NET-SOC Code', 'Task ID']
# task_freq = task_freq.groupby(by).mean().reset_index()
# task_freq.rename(columns = {'Freq':'Task Freq'}, inplace = True)
# cols = by + ['Task Freq']
# task_freq = task_freq[cols]

# task_freq_norm = task_freq.groupby('O*NET-SOC Code').sum().reset_index()
# task_freq_norm.rename(columns = {'Task Freq': 'Sum per Occu'}, inplace= True)
# task_freq_norm = task_freq_norm[['O*NET-SOC Code', 'Sum per Occu']]

# task_freqs = pd.merge(task_freq, task_freq_norm,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])
# task_freqs.loc[:, ('Task Freq Norm')] = task_freqs.loc[:, ('Task Freq')] / task_freqs.loc[:, ('Sum per Occu')]

# task_freqs.groupby('O*NET-SOC Code').sum() # check they sum to 1 :) 
# task_freqs[['O*NET-SOC Code', 'Task ID', 'Task Freq', 'Task Freq Norm']]

# Merge 120 features together

In [23]:
# print skills_pivot.shape[1] + know_pivot.shape[1] + abilities_pivot.shape[1]
all_features = skills_pivot.merge(know_pivot,on='O*NET-SOC Code').merge(abilities_pivot,on='O*NET-SOC Code')
print all_features.shape
all_features.head()

(964, 121)


Element Name,O*NET-SOC Code,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,11-1011.00,4.0,4.12,4.38,4.25,4.38,1.0,1.12,1.0,3.12,...,1.0,1.0,1.0,3.0,1.0,1.88,3.12,1.0,4.25,4.12
1,11-1011.03,3.5,3.88,4.0,3.62,4.0,1.0,1.12,1.0,3.25,...,1.0,1.0,1.0,2.62,1.12,2.0,2.75,1.12,4.0,3.88
2,11-1021.00,3.5,4.0,3.5,4.0,3.88,1.0,1.0,1.0,3.12,...,1.5,2.0,2.0,2.88,2.12,2.0,2.75,1.38,4.0,4.0
3,11-2011.00,3.25,4.0,3.5,3.5,3.75,1.0,1.25,1.0,2.88,...,1.0,1.0,1.0,2.75,1.25,2.88,3.0,1.25,3.88,3.88
4,11-2021.00,3.88,3.88,3.62,3.5,3.88,1.0,1.0,1.0,3.0,...,1.0,1.0,1.0,2.75,1.75,2.88,3.0,1.62,4.0,3.88


# Merge Occupation Features with Weightings

In [24]:
cols = ['O*NET-SOC Code', 'DWA ID', 'Task IM per DWA Weight', 'Task IM per Occu Weight'] #, 'Employment per DWA Norm']
df6 = pd.merge(df4[cols], all_features,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])

# print df6[df6['O*NET-SOC Code'] == '11-2022.00']['Task IM per Occu Norm'].sum()  # Check the IM weights sum to 1 over Occu
print df6.shape
df6.head()

(22365, 124)


Unnamed: 0,O*NET-SOC Code,DWA ID,Task IM per DWA Weight,Task IM per Occu Weight,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,11-2022.00,4.A.4.a.8.I03.D05,0.048265,0.060121,3.75,4.0,3.75,3.88,3.88,1.0,...,1.12,1.0,1.0,2.62,1.75,1.88,2.38,1.5,4.0,3.88
1,11-2022.00,4.A.1.a.1.I14.D04,0.105248,0.052477,3.75,4.0,3.75,3.88,3.88,1.0,...,1.12,1.0,1.0,2.62,1.75,1.88,2.38,1.5,4.0,3.88
2,11-2022.00,4.A.4.b.4.I12.D03,0.074453,0.062031,3.75,4.0,3.75,3.88,3.88,1.0,...,1.12,1.0,1.0,2.62,1.75,1.88,2.38,1.5,4.0,3.88
3,11-2022.00,4.A.2.b.4.I01.D06,0.125522,0.053065,3.75,4.0,3.75,3.88,3.88,1.0,...,1.12,1.0,1.0,2.62,1.75,1.88,2.38,1.5,4.0,3.88
4,11-2022.00,4.A.2.a.4.I11.D06,0.219566,0.058063,3.75,4.0,3.75,3.88,3.88,1.0,...,1.12,1.0,1.0,2.62,1.75,1.88,2.38,1.5,4.0,3.88


In [25]:
## Scale every feature column by the row's Task-IM-per-DWA weight.
def func(x):
    """Multiply one feature column element-wise by df6's 'Task IM per DWA Weight'."""
    return np.asarray(x) * np.asarray(df6['Task IM per DWA Weight'])

# The first four columns of df6 are the identifiers and the two weights;
# everything after them is a feature column.
feature_columns = df6.columns[4:]
df7 = df6[feature_columns].apply(func)
df7

Unnamed: 0,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,Judgment and Decision Making,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,0.180995,0.193061,0.180995,0.187269,0.187269,0.048265,0.048265,0.048265,0.174720,0.180995,...,0.054057,0.048265,0.048265,0.126455,0.084464,0.090739,0.114871,0.072398,0.193061,0.187269
1,0.394679,0.420991,0.394679,0.408361,0.408361,0.105248,0.105248,0.105248,0.380996,0.394679,...,0.117877,0.105248,0.105248,0.275749,0.184183,0.197866,0.250489,0.157871,0.420991,0.408361
2,0.279199,0.297812,0.279199,0.288878,0.288878,0.074453,0.074453,0.074453,0.269520,0.279199,...,0.083387,0.074453,0.074453,0.195067,0.130293,0.139972,0.177198,0.111680,0.297812,0.288878
3,0.470706,0.502086,0.470706,0.487024,0.487024,0.125522,0.125522,0.125522,0.454388,0.470706,...,0.140584,0.125522,0.125522,0.328866,0.219663,0.235981,0.298741,0.188282,0.502086,0.487024
4,0.823374,0.878266,0.823374,0.851918,0.851918,0.219566,0.219566,0.219566,0.794830,0.823374,...,0.245914,0.219566,0.219566,0.575264,0.384241,0.412785,0.522568,0.329350,0.878266,0.851918
5,0.230240,0.245589,0.230240,0.238222,0.238222,0.061397,0.061397,0.061397,0.222258,0.230240,...,0.068765,0.061397,0.061397,0.160861,0.107445,0.115427,0.146126,0.092096,0.245589,0.238222
6,0.106126,0.113201,0.106126,0.109805,0.109805,0.028300,0.028300,0.028300,0.102447,0.106126,...,0.031696,0.028300,0.028300,0.074147,0.049525,0.053204,0.067355,0.042450,0.113201,0.109805
7,0.090757,0.096808,0.090757,0.093903,0.093903,0.024202,0.024202,0.024202,0.087611,0.090757,...,0.027106,0.024202,0.024202,0.063409,0.042353,0.045500,0.057601,0.036303,0.096808,0.093903
8,0.253340,0.270230,0.253340,0.262123,0.262123,0.067557,0.067557,0.067557,0.244558,0.253340,...,0.075664,0.067557,0.067557,0.177000,0.118225,0.127008,0.160787,0.101336,0.270230,0.262123
9,0.140249,0.149599,0.140249,0.145111,0.145111,0.037400,0.037400,0.037400,0.135387,0.140249,...,0.041888,0.037400,0.037400,0.097987,0.065450,0.070311,0.089011,0.056100,0.149599,0.145111


In [26]:
## Scale the DWA-weighted features once more by the Task-IM-per-occupation weight.
def func(x):
    """Multiply one feature column element-wise by df6's 'Task IM per Occu Weight'."""
    return np.asarray(x) * np.asarray(df6['Task IM per Occu Weight'])

df7_o = df7.apply(func)
df7_o

Unnamed: 0,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,Judgment and Decision Making,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,0.010882,0.011607,0.010882,0.011259,0.011259,0.002902,0.002902,0.002902,0.010504,0.010882,...,0.003250,0.002902,0.002902,0.007603,0.005078,0.005455,0.006906,0.004353,0.011607,0.011259
1,0.020711,0.022092,0.020711,0.021429,0.021429,0.005523,0.005523,0.005523,0.019993,0.020711,...,0.006186,0.005523,0.005523,0.014470,0.009665,0.010383,0.013145,0.008285,0.022092,0.021429
2,0.017319,0.018474,0.017319,0.017920,0.017920,0.004618,0.004618,0.004618,0.016719,0.017319,...,0.005173,0.004618,0.004618,0.012100,0.008082,0.008683,0.010992,0.006928,0.018474,0.017920
3,0.024978,0.026643,0.024978,0.025844,0.025844,0.006661,0.006661,0.006661,0.024112,0.024978,...,0.007460,0.006661,0.006661,0.017451,0.011656,0.012522,0.015853,0.009991,0.026643,0.025844
4,0.047807,0.050994,0.047807,0.049465,0.049465,0.012749,0.012749,0.012749,0.046150,0.047807,...,0.014278,0.012749,0.012749,0.033401,0.022310,0.023967,0.030342,0.019123,0.050994,0.049465
5,0.011778,0.012563,0.011778,0.012186,0.012186,0.003141,0.003141,0.003141,0.011369,0.011778,...,0.003518,0.003141,0.003141,0.008229,0.005496,0.005905,0.007475,0.004711,0.012563,0.012186
6,0.005070,0.005408,0.005070,0.005246,0.005246,0.001352,0.001352,0.001352,0.004894,0.005070,...,0.001514,0.001352,0.001352,0.003542,0.002366,0.002542,0.003218,0.002028,0.005408,0.005246
7,0.004349,0.004639,0.004349,0.004500,0.004500,0.001160,0.001160,0.001160,0.004198,0.004349,...,0.001299,0.001160,0.001160,0.003039,0.002030,0.002180,0.002760,0.001740,0.004639,0.004500
8,0.013369,0.014260,0.013369,0.013832,0.013832,0.003565,0.003565,0.003565,0.012906,0.013369,...,0.003993,0.003565,0.003565,0.009340,0.006239,0.006702,0.008485,0.005348,0.014260,0.013832
9,0.007401,0.007894,0.007401,0.007658,0.007658,0.001974,0.001974,0.001974,0.007144,0.007401,...,0.002210,0.001974,0.001974,0.005171,0.003454,0.003710,0.004697,0.002960,0.007894,0.007658


In [27]:
# ## Further Weight each skills vector by the Employment Norm: 
# func = lambda x: np.asarray(x) * np.asarray(df6['Employment per DWA Norm'])
# df7_e = df7.apply(func)

In [28]:
# Put the DWA ID back next to each weighted feature row (matched on row index).
dwa_ids = df6[['DWA ID']]
df8 = dwa_ids.merge(df7, how='left', left_index=True, right_index=True)
df8_o = dwa_ids.merge(df7_o, how='left', left_index=True, right_index=True)

In [29]:
df9 = df8.groupby(['DWA ID'], as_index=False).mean()
df9_o = df8_o.groupby(['DWA ID'], as_index=False).mean()
print df9.shape
df9.head()

(2067, 121)


Unnamed: 0,DWA ID,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,4.A.1.a.1.I01.D01,0.640012,0.760882,0.661468,0.650813,0.689551,0.272503,0.324083,0.211035,0.571916,...,0.216773,0.238842,0.255173,0.505297,0.296222,0.670489,0.706263,0.287415,0.754158,0.662799
1,4.A.1.a.1.I01.D02,0.770415,0.885816,0.780493,0.830887,0.856852,0.310548,0.344312,0.322658,0.762447,...,0.31094,0.359645,0.394422,0.646531,0.474791,0.532756,0.694413,0.473643,0.814686,0.805713
2,4.A.1.a.1.I01.D03,0.476539,0.535567,0.450439,0.52,0.508739,0.166667,0.277673,0.166667,0.431036,...,0.304242,0.353333,0.345064,0.378837,0.425406,0.548142,0.490909,0.318964,0.5,0.48
3,4.A.1.a.1.I01.D04,0.360029,0.433351,0.354674,0.392718,0.403872,0.123165,0.138594,0.119341,0.334015,...,0.160733,0.193622,0.190626,0.307669,0.217988,0.280556,0.338411,0.137912,0.424902,0.382535
4,4.A.1.a.1.I02.D01,0.92898,1.199915,0.914062,1.022204,1.080387,0.333333,0.360471,0.333333,0.75989,...,0.408454,0.454496,0.469035,0.871552,0.56202,0.65247,0.720315,0.639633,1.345932,1.246505


In [30]:
# Preview the per-DWA vectors that also carry the task-to-occupation weighting.
df9_o.head()
# Inspect a single DWA:
# df9_o[df9_o['DWA ID']== '4.A.1.a.1.I01.D01']

Unnamed: 0,DWA ID,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,4.A.1.a.1.I01.D01,0.031959,0.037869,0.03298,0.032447,0.03455,0.013478,0.015813,0.010403,0.028335,...,0.010628,0.011497,0.01214,0.025129,0.014471,0.033422,0.035094,0.014092,0.037763,0.03332
1,4.A.1.a.1.I01.D02,0.03828,0.044085,0.038099,0.041026,0.041788,0.013809,0.014998,0.014102,0.037538,...,0.016027,0.017805,0.020243,0.032181,0.024583,0.025346,0.03447,0.025029,0.039408,0.038932
2,4.A.1.a.1.I01.D03,0.022385,0.025084,0.021186,0.024333,0.023848,0.007799,0.013124,0.007799,0.020191,...,0.014343,0.016534,0.016105,0.017793,0.019948,0.025779,0.023078,0.014905,0.023398,0.022462
3,4.A.1.a.1.I01.D04,0.019068,0.023058,0.018762,0.020767,0.021303,0.006421,0.007015,0.006198,0.017778,...,0.008924,0.010822,0.010003,0.016285,0.011724,0.01444,0.017394,0.007203,0.022595,0.02022
4,4.A.1.a.1.I02.D01,0.041364,0.052658,0.040827,0.04493,0.048182,0.014923,0.016167,0.014923,0.033411,...,0.019076,0.021621,0.022425,0.038445,0.026279,0.029148,0.032114,0.028996,0.060132,0.055424


In [31]:
# # # Version 0.1 : 
# A defensible method for weighting Importance and Frequency of Skills vector per Occu 

# weighted_vecs = np.array([])
# weighted_vecs.shape
# problem_onet_codes = {"skill_vec":set([]), "IM/Freq" : set([]) }

# # try and get rid of this horrible loop. 
# for (cnt, row) in df3.iterrows():
#     if cnt%1000 ==0: print cnt,
    
#     #if task_freq['O*NET-SOC Code'].str.contains(row['O*NET-SOC Code']).any() and task_ims['O*NET-SOC Code'].str.contains(row['O*NET-SOC Code']).any() and skills_pivot['O*NET-SOC Code'].str.contains(row['O*NET-SOC Code']).any():
       
#     try: 
#         skill_vec = skills_pivot[skills_pivot['O*NET-SOC Code'] == row['O*NET-SOC Code']].values[0][1:]
#     except IndexError as e:
# #         print cnt, row['O*NET-SOC Code'], skills_pivot[skills_pivot['O*NET-SOC Code'] == row['O*NET-SOC Code']].values
#         problem_onet_codes["skill_vec"].add(row['O*NET-SOC Code'])
#         continue

#     freq = task_freqs[task_freqs['Task ID'] == row['Task ID']]['Task Freq Norm'].values
#     imp =  task_ims[task_ims['Task ID'] == row['Task ID']]['Task IM Norm'].values

#     try:
#         vec = skill_vec*freq*imp
#     except ValueError as e:
# #         print row['O*NET-SOC Code']
#         problem_onet_codes["IM/Freq"].add(row['O*NET-SOC Code'])
#         continue

#     task_info = np.append(row['Task ID'], row['DWA ID'])
#     weigted_skills = np.append(task_info, vec)

#     try: 
#         weighted_vecs = np.append(weighted_vecs, [weigted_skills], axis=0)
#     except:
#         weighted_vecs = np.array([weigted_skills])

# weighted_vecs.shape

In [32]:
# df4 = pd.DataFrame(data=weighted_vecs)
# task_cols = ['Task ID', 'DWA ID']

# skill_column_names = list(skills_pivot.columns[1:])
# df4.columns = np.append(task_cols, skill_column_names)
# skill_column_names
# df4[skill_column_names] = df4[skill_column_names].apply(pd.to_numeric)
# df4.head()

In [33]:
# # Take the mean average over occupations DWAs
# df5 = df4.groupby(['DWA ID'], as_index=False).mean()
# df5.head()

## Ground Truth Survey data:

In [48]:
survey_data_.rename(columns = {'title':'O*NET Occupation title', 
                              'Unnamed: 0': 'Task'}, inplace = True)

# Change Ordinal Data to Numeric - bit hacky
ratings = [4,3,2,1,0]
survey_data_['GT Rating'] = (survey_data_['Completely Automatable Today']*ratings[0] + survey_data_['Could be Mostly Automated Today (Human Still Needed)']*ratings[1] + survey_data_['Mostly Not Automatable Today (Human Does Most of It)']*ratings[2] + survey_data_['Not Automatable Today']*ratings[3] + survey_data_['Unsure']*ratings[4]) / survey_data_['Number of Responses']                
                
survey_data = survey_data_[['Task ID', 'GT Rating']].sort_values(by='Task ID')
print survey_data['GT Rating'].sum()

#Task DWAs (detailed work activitiy code):
taskDWA = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Tasks to DWAs.txt'), sep='\t')
taskDWA = taskDWA[['Task ID', 'DWA ID']]

print taskDWA.shape, "UNIQUE DWA: ", len(taskDWA['DWA ID'].unique()) 
task_dwa_rat = pd.merge(taskDWA, survey_data, how='left', left_on=['Task ID'], right_on = ['Task ID'])

DWA_mean_rating = task_dwa_rat.groupby(['DWA ID']).mean().reset_index().rename(columns = {'GT Rating':'DWA GT Rating'})
DWA_mean_rating = DWA_mean_rating[['DWA ID', 'DWA GT Rating']]

print "Unique DWAs Annotated = ", DWA_mean_rating[DWA_mean_rating['DWA GT Rating'].notnull()].shape

data = pd.merge(df9, DWA_mean_rating,  how='left', left_on=['DWA ID'], right_on = ['DWA ID'])
data.head()

data_o = pd.merge(df9_o, DWA_mean_rating,  how='left', left_on=['DWA ID'], right_on = ['DWA ID'])
data_o.head()

922.543188756
(22838, 2) UNIQUE DWA:  2070
Unique DWAs Annotated =  (314, 2)


Unnamed: 0,DWA ID,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,...,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression,DWA GT Rating
0,4.A.1.a.1.I01.D01,0.031959,0.037869,0.03298,0.032447,0.03455,0.013478,0.015813,0.010403,0.028335,...,0.011497,0.01214,0.025129,0.014471,0.033422,0.035094,0.014092,0.037763,0.03332,2.3125
1,4.A.1.a.1.I01.D02,0.03828,0.044085,0.038099,0.041026,0.041788,0.013809,0.014998,0.014102,0.037538,...,0.017805,0.020243,0.032181,0.024583,0.025346,0.03447,0.025029,0.039408,0.038932,
2,4.A.1.a.1.I01.D03,0.022385,0.025084,0.021186,0.024333,0.023848,0.007799,0.013124,0.007799,0.020191,...,0.016534,0.016105,0.017793,0.019948,0.025779,0.023078,0.014905,0.023398,0.022462,
3,4.A.1.a.1.I01.D04,0.019068,0.023058,0.018762,0.020767,0.021303,0.006421,0.007015,0.006198,0.017778,...,0.010822,0.010003,0.016285,0.011724,0.01444,0.017394,0.007203,0.022595,0.02022,2.153846
4,4.A.1.a.1.I02.D01,0.041364,0.052658,0.040827,0.04493,0.048182,0.014923,0.016167,0.014923,0.033411,...,0.021621,0.022425,0.038445,0.026279,0.029148,0.032114,0.028996,0.060132,0.055424,


In [55]:
# Column order for the unlabelled sets: identifiers first, then every feature column.
cols = np.append(['DWA ID', 'DWA GT Rating'], all_features.columns[1:])

# DWA-weighted features: rows with a ground-truth rating are the labelled set
# (X, y); rows without one form the set still to be predicted (test).
# NOTE(review): X keeps data's own column order (rating last) while test is
# re-ordered by cols -- confirm downstream code selects columns by name.
X = data[data['DWA GT Rating'].notnull()]
test = data[data['DWA GT Rating'].isnull()].reset_index()[cols]
y = X['DWA GT Rating']

# Same split for the features that also carry the occupation weighting.
X_o = data_o[data_o['DWA GT Rating'].notnull()]
test_o = data_o[data_o['DWA GT Rating'].isnull()].reset_index()[cols]
# Fix: the labels must come from X_o. They were previously read from X, which
# only matched because data and data_o were merged with the same ratings table.
y_o = X_o['DWA GT Rating']
# X_o.head()
# test_o
X.head(10)

Unnamed: 0,DWA ID,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,...,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression,DWA GT Rating
0,4.A.1.a.1.I01.D01,0.640012,0.760882,0.661468,0.650813,0.689551,0.272503,0.324083,0.211035,0.571916,...,0.238842,0.255173,0.505297,0.296222,0.670489,0.706263,0.287415,0.754158,0.662799,2.312500
3,4.A.1.a.1.I01.D04,0.360029,0.433351,0.354674,0.392718,0.403872,0.123165,0.138594,0.119341,0.334015,...,0.193622,0.190626,0.307669,0.217988,0.280556,0.338411,0.137912,0.424902,0.382535,2.153846
11,4.A.1.a.1.I02.D08,0.165224,0.190542,0.179800,0.163266,0.191568,0.088022,0.098921,0.072981,0.148719,...,0.070115,0.076516,0.142507,0.089990,0.149631,0.173329,0.090978,0.201805,0.187325,3.333333
12,4.A.1.a.1.I02.D09,0.117664,0.134487,0.125536,0.133918,0.138725,0.095034,0.103691,0.075705,0.109398,...,0.120694,0.133348,0.114190,0.140046,0.119192,0.141038,0.087943,0.128867,0.112453,2.600000
13,4.A.1.a.1.I02.D10,0.188119,0.222286,0.204664,0.210808,0.223678,0.142135,0.116065,0.081526,0.177424,...,0.153741,0.173827,0.186136,0.170600,0.187453,0.194365,0.136334,0.217571,0.204275,3.059091
16,4.A.1.a.1.I02.D13,0.240061,0.269917,0.268174,0.247225,0.282685,0.283162,0.258564,0.213062,0.209574,...,0.213063,0.247956,0.218444,0.253488,0.259148,0.274576,0.203818,0.259526,0.230523,2.538462
27,4.A.1.a.1.I04.D01,0.679739,0.776000,0.600000,0.699363,0.730543,0.200000,0.200000,0.200000,0.529906,...,0.200000,0.200000,0.455102,0.424000,0.454073,0.489718,0.274637,0.754543,0.685645,3.666667
36,4.A.1.a.1.I05.D02,0.252705,0.304196,0.263061,0.218169,0.295356,0.071429,0.071429,0.071429,0.196215,...,0.077515,0.080894,0.187749,0.088065,0.140849,0.147460,0.081165,0.302154,0.288126,2.461538
38,4.A.1.a.1.I06.D02,0.626338,0.691111,0.646708,0.600964,0.707074,0.659394,0.595336,0.572382,0.571014,...,0.567961,0.616697,0.548040,0.625483,0.657994,0.640232,0.419688,0.646182,0.575399,2.470588
59,4.A.1.a.1.I12.D04,0.496527,0.642743,0.520056,0.511511,0.565857,0.142857,0.145458,0.142857,0.463834,...,0.244172,0.253937,0.393782,0.237747,0.280362,0.309393,0.145253,0.571367,0.560712,1.833333


# Save Data to file

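- With and without Occupation Weighting (the `_o` file carries the extra task-to-occupation weight; Employment weighting is disabled above)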

In [50]:
## Uncomment to pickle somewhere: 

save_these = [(X, test, y), (X_o, test_o, y_o)]
file_names = ['tasks_by_skills_2018_v3.p', 'tasks_by_skills_2018_v3_o.p']

for file_name, save in zip(file_names, save_these):
     
    path = os.path.join(datasets, 'FoEmployment/Analysis_of_ONET_Tasks', file_name)
    f = open(path, "w")
    pickle.dump(save, f)
    f.close()
    print "saved here: %s " % path

saved here: /home/scpd/Datasets/FoEmployment/Analysis_of_ONET_Tasks/tasks_by_skills_2018_v3.p 
saved here: /home/scpd/Datasets/FoEmployment/Analysis_of_ONET_Tasks/tasks_by_skills_2018_v3_o.p 
