Map ONET Skills onto Tasks via Occupations Dataset
---

By Paul Duckworth 17th Nov 2017.

Create a skills/abilities vector per (DWA) task from ONET datasets: Skills, Abilities, Occupations, Tasks, DWAs


In [1]:
import os
import numpy as np
import pandas as pd
import pandas_ml as pdml
import getpass
import matplotlib.pyplot as plt
from random import shuffle
%matplotlib inline

# Root folder that holds the ONET dumps: <home directory>/Datasets/.
# The home directory is resolved with os.path.expanduser rather than by gluing
# '/home/' onto the login name, so the path is also correct for accounts whose
# home is not under /home (root, macOS, relocated homes).
# The trailing '' component keeps the final path separator of the original value.
datasets = os.path.join(os.path.expanduser('~'), 'Datasets', '')
print(datasets)  # single-argument call form prints the same under Python 2 and 3


/home/scpd/Datasets/


# ONET Datasets:

## Occupations and Tasks 

In [11]:
# Occupation list, with the 'Title' column relabelled as 'Occupation title'.
occupations = (
    pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Occupation Data.txt'), sep='\t')
      .rename(columns={'Title': 'Occupation title'})
)
occupations.head()

Unnamed: 0,O*NET-SOC Code,Occupation title,Description
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce or enact laws and statutes ..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."


In [68]:
# Task statements, trimmed to occupation code, task id and task text.
tasks = pd.read_table(
    os.path.join(datasets, 'ONET/databases/db2016/Task Statements.txt'),
    sep='\t',
)[['O*NET-SOC Code', 'Task ID', 'Task']]

# reduce the task matrix for now :)
# Only the tasks of the first two occupation codes found in the file are kept.
reduce_tasks = tasks['O*NET-SOC Code'].unique()[:2]
tasks = tasks[tasks['O*NET-SOC Code'].isin(reduce_tasks)]
tasks.head()

Unnamed: 0,O*NET-SOC Code,Task ID,Task
0,11-1011.00,8823,Direct or coordinate an organization's financi...
1,11-1011.00,8831,Appoint department heads or managers and assig...
2,11-1011.00,8825,Analyze operations to evaluate performance of ...
3,11-1011.00,8826,"Direct, plan, or implement policies, objective..."
4,11-1011.00,8827,"Prepare budgets for approval, including those ..."


In [69]:
# Task DWAs (detailed work activity codes): the Task ID -> DWA ID lookup table.
taskDWA = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Tasks to DWAs.txt'), sep='\t')
taskDWA = taskDWA[['Task ID', 'DWA ID']]

# Report the table shape and the number of distinct DWA ids. The line is built
# with %-formatting and printed as one argument, so it renders identically under
# Python 2 and Python 3 (the old multi-argument print statement is Python 2 only).
print("%s UNIQUE DWA:  %d" % (taskDWA.shape, len(taskDWA['DWA ID'].unique())))
taskDWA.head()

(22838, 2) UNIQUE DWA:  2070


Unnamed: 0,Task ID,DWA ID
0,20461,4.A.2.a.4.I09.D03
1,20461,4.A.4.b.6.I08.D04
2,8823,4.A.4.b.4.I09.D02
3,8824,4.A.4.a.2.I03.D14
4,8825,4.A.2.a.4.I07.D09


In [88]:
# Attach the DWA ids to the (reduced) task list, ordered by task id.
df = pd.merge(tasks, taskDWA,  how='left', left_on=['Task ID'], right_on = ['Task ID']).sort_values(by = 'Task ID')

# Drop tasks that have no DWA. The explicit .copy() makes `df` an independent
# frame, so the two column assignments below write to it directly instead of to
# a filtered slice (which raises SettingWithCopyWarning and may not stick).
df = df[df['DWA ID'].notnull()].copy()

# The parent ids are prefixes of the DWA id, e.g. for '4.A.4.b.4.I09.D02':
#   IWA ID = id without its last 4 characters ('.D02')     -> '4.A.4.b.4.I09'
#   WA ID  = id without its last 8 characters ('.I09.D02') -> '4.A.4.b.4'
# This relies on the '.Dnn' and '.Inn' suffixes having a fixed width.
df['IWA ID'] = df['DWA ID'].str.slice(0,-4)    # create IWA ID
df['WA ID'] = df['DWA ID'].str.slice(0,-8)     # create WA ID

## ADD DWA and IWA titles:
DWAref = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/DWA Reference.txt'), sep='\t')[['DWA ID', 'DWA Title']]
df2 = pd.merge(df, DWAref,  how='left', left_on=['DWA ID'], right_on = ['DWA ID'])

IWAref = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/IWA Reference.txt'), sep='\t')[['IWA ID', 'IWA Title']]
df3 = pd.merge(df2, IWAref,  how='left', left_on=['IWA ID'], right_on = ['IWA ID'])

# Only the id columns are kept; the title columns merged above are dropped here
# unless the wider column list on the next line is used instead.
# cols = ['Task ID', 'Task', 'DWA ID', 'DWA Title', 'IWA ID', 'IWA Title', 'WA ID']
cols = ['O*NET-SOC Code', 'Task ID', 'DWA ID', 'IWA ID', 'WA ID']
df3 = df3[cols]

print(df3.shape)  # single-argument call form: same output under Python 2 and 3
df3.head()

(56, 5)


Unnamed: 0,O*NET-SOC Code,Task ID,DWA ID,IWA ID,WA ID
0,11-1011.00,8823,4.A.4.b.4.I09.D02,4.A.4.b.4.I09,4.A.4.b.4
1,11-1011.00,8824,4.A.4.a.2.I03.D14,4.A.4.a.2.I03,4.A.4.a.2
2,11-1011.00,8825,4.A.2.a.4.I07.D09,4.A.2.a.4.I07,4.A.2.a.4
3,11-1011.00,8826,4.A.2.b.1.I09.D01,4.A.2.b.1.I09,4.A.2.b.1
4,11-1011.00,8826,4.A.2.b.4.I01.D01,4.A.2.b.4.I01,4.A.2.b.4


## Skills by Occupations

In [89]:

# Occupation-level skill ratings, read in long form (one row per rating).
skills = pd.read_table(
    os.path.join(datasets, 'ONET/databases/db2016/Skills.txt'),
    sep='\t',
    low_memory=False,
)

# Label for each wide column: "<Element Name> <Scale ID>", e.g. "Writing LV".
skills['Element_pivot'] = skills['Element Name'] + " " + skills['Scale ID']

# Long -> wide: one row per occupation, one column per label; gaps become 0.
skills_pivot = (
    skills.pivot(index='O*NET-SOC Code', columns='Element_pivot', values='Data Value')
          .fillna(0)
          .reset_index()
)
skills_pivot


Element_pivot,O*NET-SOC Code,Active Learning IM,Active Learning LV,Active Listening IM,Active Listening LV,Complex Problem Solving IM,Complex Problem Solving LV,Coordination IM,Coordination LV,Critical Thinking IM,...,Systems Evaluation IM,Systems Evaluation LV,Technology Design IM,Technology Design LV,Time Management IM,Time Management LV,Troubleshooting IM,Troubleshooting LV,Writing IM,Writing LV
0,11-1011.00,4.00,4.75,4.12,4.88,4.38,5.00,4.25,5.12,4.38,...,4.12,5.12,1.75,0.75,4.00,4.75,1.00,0.00,4.00,4.38
1,11-1011.03,3.50,3.75,3.88,4.12,4.00,4.25,3.62,3.75,4.00,...,3.62,3.75,1.62,1.00,3.38,3.50,1.12,0.12,3.88,4.38
2,11-1021.00,3.50,3.62,4.00,4.00,3.50,3.75,4.00,3.88,3.88,...,3.00,3.12,1.88,1.12,3.75,3.75,2.00,1.38,3.25,3.88
3,11-2011.00,3.25,4.00,4.00,4.12,3.50,4.00,3.50,4.38,3.75,...,3.00,3.75,1.62,0.88,3.88,4.00,1.12,0.12,3.75,3.88
4,11-2021.00,3.88,4.12,3.88,4.12,3.62,3.88,3.50,3.75,3.88,...,3.50,3.75,1.75,0.88,3.50,3.75,1.00,0.00,3.25,3.88
5,11-2022.00,3.75,3.88,4.00,4.00,3.75,3.88,3.88,4.12,3.88,...,3.62,3.88,1.75,0.88,3.50,3.88,1.00,0.00,3.62,4.00
6,11-2031.00,3.25,3.88,4.25,4.25,3.62,4.00,3.75,4.38,3.75,...,3.50,3.75,1.50,0.75,3.62,3.75,1.00,0.00,4.12,4.38
7,11-3011.00,3.12,3.38,3.88,3.75,3.12,3.25,3.75,3.75,3.50,...,2.75,3.00,1.38,0.50,3.62,3.62,1.75,1.00,3.62,3.50
8,11-3021.00,3.38,3.88,4.00,4.00,3.75,3.88,3.75,3.88,4.12,...,3.62,3.88,2.50,2.00,3.38,3.75,2.50,2.50,3.75,4.00
9,11-3031.01,3.75,4.25,3.88,4.25,4.00,4.12,3.75,3.75,4.12,...,3.38,4.12,1.50,0.50,3.50,4.00,1.00,0.00,3.62,4.00


In [86]:
# WA['Element_pivot'] =  WA['Element Name'] + " " +  WA['Scale ID']
# WA_pivot = WA.pivot(index = 'O*NET-SOC Code', columns='Element_pivot', values='Data Value').fillna(0)
# WA_pivot.reset_index(inplace=True)

# WA_pivot

# occupation_level_skills_wa = pd.merge(df_skills, WA_pivot,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])
# occupation_level_skills_wa = occupation_level_skills_wa.sort_values(by = 'Observed Occupation')
# occupation_level_skills_wa


# Skills by Occupations and Tasks 

In [108]:
# Left-join the wide skill table onto the task/DWA rows by occupation code, so
# every row of df3 carries the skill values of its occupation.
skills_by_occuTasks = df3.merge(skills_pivot, how='left', on='O*NET-SOC Code')
skills_by_occuTasks.head()

Unnamed: 0,O*NET-SOC Code,Task ID,DWA ID,IWA ID,WA ID,Active Learning IM,Active Learning LV,Active Listening IM,Active Listening LV,Complex Problem Solving IM,...,Systems Evaluation IM,Systems Evaluation LV,Technology Design IM,Technology Design LV,Time Management IM,Time Management LV,Troubleshooting IM,Troubleshooting LV,Writing IM,Writing LV
0,11-1011.00,8823,4.A.4.b.4.I09.D02,4.A.4.b.4.I09,4.A.4.b.4,4.0,4.75,4.12,4.88,4.38,...,4.12,5.12,1.75,0.75,4.0,4.75,1.0,0.0,4.0,4.38
1,11-1011.00,8824,4.A.4.a.2.I03.D14,4.A.4.a.2.I03,4.A.4.a.2,4.0,4.75,4.12,4.88,4.38,...,4.12,5.12,1.75,0.75,4.0,4.75,1.0,0.0,4.0,4.38
2,11-1011.00,8825,4.A.2.a.4.I07.D09,4.A.2.a.4.I07,4.A.2.a.4,4.0,4.75,4.12,4.88,4.38,...,4.12,5.12,1.75,0.75,4.0,4.75,1.0,0.0,4.0,4.38
3,11-1011.00,8826,4.A.2.b.1.I09.D01,4.A.2.b.1.I09,4.A.2.b.1,4.0,4.75,4.12,4.88,4.38,...,4.12,5.12,1.75,0.75,4.0,4.75,1.0,0.0,4.0,4.38
4,11-1011.00,8826,4.A.2.b.4.I01.D01,4.A.2.b.4.I01,4.A.2.b.4,4.0,4.75,4.12,4.88,4.38,...,4.12,5.12,1.75,0.75,4.0,4.75,1.0,0.0,4.0,4.38


## Weight Skills by Importance of Task and Frequency of Task

In [200]:
#Importance:
task_rates = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Task Ratings.txt'), sep='\t')

# Keep the importance ('IM') rows and relabel their value column. rename()
# without inplace returns a new frame, so nothing is written back into a
# filtered slice of task_rates (the in-place form raises SettingWithCopyWarning).
task_im = task_rates[task_rates['Scale ID'] == 'IM'].rename(columns = {'Data Value':'Task IM'})

cols = ['O*NET-SOC Code', 'Task ID', 'Task IM']
task_im = task_im[cols]
task_im

Unnamed: 0,O*NET-SOC Code,Task ID,Task IM
7,11-1011.00,8823,4.54
16,11-1011.00,8831,4.48
25,11-1011.00,8825,4.40
34,11-1011.00,8826,4.39
43,11-1011.00,8827,4.17
52,11-1011.00,8824,4.15
61,11-1011.00,8836,4.12
70,11-1011.00,8832,4.02
79,11-1011.00,8835,3.96
88,11-1011.00,8833,3.96


In [201]:
# task_cats = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Task Categories.txt'), sep='\t')
# Frequency-of-task ('FT') rows. The .copy() detaches them from task_rates so the
# 'Temp' and 'Freq' columns added below are written to this frame itself rather
# than to a slice (which raises SettingWithCopyWarning).
task_freq = task_rates[task_rates['Scale ID'] == 'FT'].copy()

# Manually change Frequency Categories into Numeric value per Day
# NOTE(review): category '3' uses 3/12. while its neighbours are expressed per
# day (n/365., n/7.); confirm this is the intended per-day rate.
time_categories = {'1' : (1/365.),  # Yearly or less
                   '2' : (4/365.),  # More than yearly
                   '3' : (3/12.),   # More than monthly
                   '4' : (3/7.),    # More than weekly
                   '5' : 1.,            # Daily
                   '6' : 3.,            # Several times daily
                   '7' : 8.}            # Hourly or more

# Normalise the category codes to the dict's string keys before the lookup. The
# parser may hand 'Category' back as strings, ints or floats (presumably depending
# on how non-numeric entries in other rows are treated -- TODO confirm), and a
# raw lookup only works for the string case.
category_keys = pd.to_numeric(task_freq['Category']).astype(int).astype(str)

# Fail loudly on a code with no mapping, as the direct dict lookup used to do.
unknown_categories = sorted(set(category_keys) - set(time_categories))
if unknown_categories:
    raise KeyError('Unmapped frequency categories: %s' % unknown_categories)

# Temp = assumed occurrences per day for the category;
# Freq = that rate multiplied by the row's 'Data Value'.
task_freq['Temp'] = category_keys.map(time_categories).astype(float)
task_freq['Freq'] = task_freq['Temp'] * task_freq['Data Value']
task_freq.head()

Unnamed: 0,O*NET-SOC Code,Task ID,Scale ID,Category,Data Value,N,Standard Error,Lower CI Bound,Upper CI Bound,Recommend Suppress,Date,Domain Source,Temp,Freq
0,11-1011.00,8823,FT,1,4.34,79,2.48,1.36,12.96,N,07/2014,Incumbent,0.00274,0.01189
1,11-1011.00,8823,FT,2,9.16,79,3.86,3.86,20.24,N,07/2014,Incumbent,0.010959,0.100384
2,11-1011.00,8823,FT,3,11.04,79,3.44,5.82,19.95,N,07/2014,Incumbent,0.25,2.76
3,11-1011.00,8823,FT,4,16.19,79,4.37,9.24,26.83,N,07/2014,Incumbent,0.428571,6.938571
4,11-1011.00,8823,FT,5,46.67,79,6.03,35.07,58.64,N,07/2014,Incumbent,1.0,46.67


In [185]:
# Scratch check of the per-task aggregation: the plain mean of seven per-category
# 'Freq' values. The first five terms equal the 'Freq' values displayed by
# task_freq.head() for task 8823; the last two are presumably that task's
# category 6 and 7 rows, which head() does not show -- TODO confirm.
(0.011890 + 0.100384 + 2.760000 + 6.938571 + 46.670000 + 21.990000 + 42.080000) / 7.

17.221549285714286

In [241]:
# Collapse the per-category rows into one mean 'Freq' per (occupation, task).
# Only the 'Freq' column is aggregated: averaging the whole frame relies on
# pandas silently discarding the text columns (which raises on recent versions).
# NOTE: this overwrites task_freq, so re-run the cell that builds the 'Freq'
# column before re-running this one.
by = ['O*NET-SOC Code', 'Task ID']
task_freq = task_freq.groupby(by)['Freq'].mean().reset_index()
task_freq = task_freq.rename(columns = {'Freq':'Task Freq'})
cols = by + ['Task Freq']
task_freq = task_freq[cols]

# Total task frequency per occupation, used as the normalising constant.
# Selecting 'Task Freq' explicitly avoids also summing the 'Task ID' column.
task_freq_norm = task_freq.groupby('O*NET-SOC Code')['Task Freq'].sum().reset_index()
task_freq_norm = task_freq_norm.rename(columns = {'Task Freq': 'Sum per Occu'})
task_freq_norm = task_freq_norm[['O*NET-SOC Code', 'Sum per Occu']]

# Share of the occupation's total frequency taken by each task.
task_freqs = pd.merge(task_freq, task_freq_norm,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])
task_freqs['Task Freq Norm'] = task_freqs['Task Freq'] / task_freqs['Sum per Occu']
# task_freqs.groupby('O*NET-SOC Code')['Task Freq Norm'].sum() # check they sum to 1 :)

task_freqs

Unnamed: 0,O*NET-SOC Code,Task ID,Task Freq,Sum per Occu,Task Freq Norm
0,11-1011.00,8823,17.221549,239.560026,0.071888
1,11-1011.00,8824,17.148012,239.560026,0.071581
2,11-1011.00,8825,15.445267,239.560026,0.064473
3,11-1011.00,8826,18.354019,239.560026,0.076616
4,11-1011.00,8827,3.221570,239.560026,0.013448
5,11-1011.00,8828,34.402920,239.560026,0.143609
6,11-1011.00,8829,6.470654,239.560026,0.027011
7,11-1011.00,8830,8.451864,239.560026,0.035281
8,11-1011.00,8831,12.226429,239.560026,0.051037
9,11-1011.00,8832,11.255931,239.560026,0.046986


In [244]:
# Weight the skills vectors by their Task Importances and Frequencies
# The skill columns are every column of the merged frame that did not come from
# df3 (the task/DWA identifier frame). Dropping the identifier columns by label
# replaces the positional slice columns[5:], which silently picks the wrong
# columns as soon as df3 is built with a different number of id columns.
# Index.drop raises KeyError if an identifier column is missing.
skill_columns = skills_by_occuTasks.columns.drop(df3.columns)

# for skill in skill_columns:
#     print skills_by_occuTasks[skill]

skills_by_occuTasks.shape

# TODO: need a defensible method for weighting Importance and Frequency
# (the original note was cut off after "and creati").

(19125, 5)