Map ONET Skills onto Tasks via Occupations Dataset
---

By Paul Duckworth 17th Nov 2017.

Create a skills/abilities vector per (DWA) task from the ONET datasets: Skills, Abilities, Occupations, Tasks, DWAs


In [1]:
import os
import numpy as np
import pandas as pd
import pandas_ml as pdml
import getpass
import matplotlib.pyplot as plt
from random import shuffle
%matplotlib inline

# Root folder of the data files: the 'Datasets' directory under the current
# user's /home entry.
datasets = ''.join(['/home/', getpass.getuser(), '/Datasets/'])
print(datasets)


/home/scpd/Datasets/


# ONET Datasets:

## Occupations and Tasks 

In [246]:
# Read the tab-separated ONET occupation table and give its 'Title' column a
# more specific name.
occupation_path = os.path.join(datasets, 'ONET/databases/db2016/Occupation Data.txt')
occupations = pd.read_table(occupation_path, sep='\t').rename(columns={'Title': 'Occupation title'})
occupations.head()

Unnamed: 0,O*NET-SOC Code,Occupation title,Description
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce or enact laws and statutes ..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."


In [566]:
# Load the ONET task statements, keeping only the occupation code, the task id
# and the task text.
task_path = os.path.join(datasets, 'ONET/databases/db2016/Task Statements.txt')
tasks = pd.read_table(task_path, sep='\t').loc[:, ['O*NET-SOC Code', 'Task ID', 'Task']]

# Occupation codes to keep. Every code is kept at the moment; slice this array
# (e.g. [:2]) to work on a reduced task matrix.
reduce_tasks = tasks['O*NET-SOC Code'].unique()
keep_rows = tasks['O*NET-SOC Code'].isin(reduce_tasks)
tasks = tasks[keep_rows]

# Table shape next to the number of distinct task ids; equal counts mean each
# task id belongs to a single occupation.
n_unique_tasks = tasks['Task ID'].unique().size
print(' '.join([str(tasks.shape), "UNIQUE tasks: ", str(n_unique_tasks)]))
tasks.head()

(19566, 3) UNIQUE tasks:  19566


Unnamed: 0,O*NET-SOC Code,Task ID,Task
0,11-1011.00,8823,Direct or coordinate an organization's financi...
1,11-1011.00,8831,Appoint department heads or managers and assig...
2,11-1011.00,8825,Analyze operations to evaluate performance of ...
3,11-1011.00,8826,"Direct, plan, or implement policies, objective..."
4,11-1011.00,8827,"Prepare budgets for approval, including those ..."


In [567]:
# Task-to-DWA mapping (DWA = detailed work activity code).
dwa_map_path = os.path.join(datasets, 'ONET/databases/db2016/Tasks to DWAs.txt')
taskDWA = pd.read_table(dwa_map_path, sep='\t').loc[:, ['Task ID', 'DWA ID']]

# Number of mapping rows next to the number of distinct DWA ids.
n_unique_dwa = taskDWA['DWA ID'].unique().size
print(' '.join([str(taskDWA.shape), "UNIQUE DWA: ", str(n_unique_dwa)]))
taskDWA.head()

(22838, 2) UNIQUE DWA:  2070


Unnamed: 0,Task ID,DWA ID
0,20461,4.A.2.a.4.I09.D03
1,20461,4.A.4.b.6.I08.D04
2,8823,4.A.4.b.4.I09.D02
3,8824,4.A.4.a.2.I03.D14
4,8825,4.A.2.a.4.I07.D09


In [568]:
# Attach every DWA to its task. The left join keeps tasks that have no DWA;
# those rows are removed straight afterwards. `.copy()` turns the filtered
# result into an independent frame, so the column assignments below do not
# write into a slice of the merge output (pandas SettingWithCopyWarning).
df = pd.merge(tasks, taskDWA, how='left', on='Task ID').sort_values(by='Task ID')
df = df[df['DWA ID'].notnull()].copy()

# The parent ids are prefixes of the DWA id, e.g.
# DWA 4.A.4.a.8.I03.D05 -> IWA 4.A.4.a.8.I03 -> WA 4.A.4.a.8
df['IWA ID'] = df['DWA ID'].str.slice(0, -4)    # create IWA ID: drop the trailing '.Dnn'
df['WA ID'] = df['DWA ID'].str.slice(0, -8)     # create WA ID: drop the trailing '.Inn.Dnn'

## ADD DWA and IWA titles:
DWAref = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/DWA Reference.txt'), sep='\t')[['DWA ID', 'DWA Title']]
df2 = pd.merge(df, DWAref, how='left', on='DWA ID')

IWAref = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/IWA Reference.txt'), sep='\t')[['IWA ID', 'IWA Title']]
df3 = pd.merge(df2, IWAref, how='left', on='IWA ID')

# Only the id columns are carried forward; swap in the commented list to keep
# the task text and the DWA / IWA titles as well.
# cols = ['Task ID', 'Task', 'DWA ID', 'DWA Title', 'IWA ID', 'IWA Title', 'WA ID']
cols = ['O*NET-SOC Code', 'Task ID', 'DWA ID', 'IWA ID', 'WA ID']
df3 = df3[cols]

# tasks are many-to-many with DWA, e.g. task id=8826.
print(' '.join([str(df3.shape), "UNIQUE DWA: ", str(len(df3['DWA ID'].unique()))]))
df3.head()

(22838, 5) UNIQUE DWA:  2070


Unnamed: 0,O*NET-SOC Code,Task ID,DWA ID,IWA ID,WA ID
0,11-2022.00,1,4.A.4.a.8.I03.D05,4.A.4.a.8.I03,4.A.4.a.8
1,11-2022.00,2,4.A.1.a.1.I14.D04,4.A.1.a.1.I14,4.A.1.a.1
2,11-2022.00,3,4.A.4.b.4.I12.D03,4.A.4.b.4.I12,4.A.4.b.4
3,11-2022.00,4,4.A.2.b.4.I01.D06,4.A.2.b.4.I01,4.A.2.b.4
4,11-2022.00,5,4.A.2.a.4.I11.D06,4.A.2.a.4.I11,4.A.2.a.4


## Skills by Occupations

In [569]:
# Load the ONET skill ratings and keep only the importance ('IM') scale rows.
# `.copy()` makes the filtered frame independent of the raw table, so adding a
# column below does not assign into a slice (pandas SettingWithCopyWarning).
skills = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Skills.txt'), sep='\t', low_memory=False)
skills = skills[skills['Scale ID'] == 'IM'].copy()

# Column label for the pivot: "<skill name> <scale id>", e.g. "Active Learning IM".
skills['Element_pivot'] = skills['Element Name'] + " " + skills['Scale ID']

# One row per occupation, one column per skill. A skill with no value for an
# occupation is filled with 0.
skills_pivot = skills.pivot(index='O*NET-SOC Code', columns='Element_pivot', values='Data Value').fillna(0)
skills_pivot = skills_pivot.reset_index()

# Preview only, rather than rendering the whole table.
skills_pivot.head(10)


Element_pivot,O*NET-SOC Code,Active Learning IM,Active Listening IM,Complex Problem Solving IM,Coordination IM,Critical Thinking IM,Equipment Maintenance IM,Equipment Selection IM,Installation IM,Instructing IM,...,Science IM,Service Orientation IM,Social Perceptiveness IM,Speaking IM,Systems Analysis IM,Systems Evaluation IM,Technology Design IM,Time Management IM,Troubleshooting IM,Writing IM
0,11-1011.00,4.00,4.12,4.38,4.25,4.38,1.00,1.12,1.00,3.12,...,1.88,3.12,4.25,4.38,4.12,4.12,1.75,4.00,1.00,4.00
1,11-1011.03,3.50,3.88,4.00,3.62,4.00,1.00,1.12,1.00,3.25,...,1.75,3.25,3.75,4.00,3.62,3.62,1.62,3.38,1.12,3.88
2,11-1021.00,3.50,4.00,3.50,4.00,3.88,1.00,1.00,1.00,3.12,...,1.88,3.25,4.00,4.00,3.00,3.00,1.88,3.75,2.00,3.25
3,11-2011.00,3.25,4.00,3.50,3.50,3.75,1.00,1.25,1.00,2.88,...,1.50,3.12,4.00,4.00,3.12,3.00,1.62,3.88,1.12,3.75
4,11-2021.00,3.88,3.88,3.62,3.50,3.88,1.00,1.00,1.00,3.00,...,1.75,3.12,3.88,3.88,3.25,3.50,1.75,3.50,1.00,3.25
5,11-2022.00,3.75,4.00,3.75,3.88,3.88,1.00,1.00,1.00,3.62,...,1.62,3.88,3.88,4.00,3.62,3.62,1.75,3.50,1.00,3.62
6,11-2031.00,3.25,4.25,3.62,3.75,3.75,1.00,1.00,1.00,3.00,...,1.50,3.38,4.12,4.38,3.50,3.50,1.50,3.62,1.00,4.12
7,11-3011.00,3.12,3.88,3.12,3.75,3.50,1.00,1.00,1.00,3.12,...,1.25,3.25,3.38,4.00,3.12,2.75,1.38,3.62,1.75,3.62
8,11-3021.00,3.38,4.00,3.75,3.75,4.12,1.38,1.88,1.00,3.12,...,1.75,2.88,3.62,3.88,3.62,3.62,2.50,3.38,2.50,3.75
9,11-3031.01,3.75,3.88,4.00,3.75,4.12,1.00,1.00,1.00,2.88,...,1.50,2.88,3.25,4.00,3.38,3.38,1.50,3.50,1.00,3.62


In [86]:
# WA['Element_pivot'] =  WA['Element Name'] + " " +  WA['Scale ID']
# WA_pivot = WA.pivot(index = 'O*NET-SOC Code', columns='Element_pivot', values='Data Value').fillna(0)
# WA_pivot.reset_index(inplace=True)

# WA_pivot

# occupation_level_skills_wa = pd.merge(df_skills, WA_pivot,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])
# occupation_level_skills_wa = occupation_level_skills_wa.sort_values(by = 'Observed Occupation')
# occupation_level_skills_wa


# Skills by Occupations and Tasks 

In [570]:
# The skills are not merged onto the task table (that would be
# pd.merge(df3, skills_pivot, how='left', on 'O*NET-SOC Code')); an occupation's
# skill vector is looked up on demand instead.
#
# Example for occupation '11-1011.00': select its row and drop the leading
# occupation-code cell, leaving only the skill values.
is_example_occupation = skills_pivot['O*NET-SOC Code'] == '11-1011.00'
skills_pivot[is_example_occupation].values[0][1:]

array([4.0, 4.12, 4.38, 4.25, 4.38, 1.0, 1.12, 1.0, 3.12, 4.5, 3.12, 4.12,
       3.88, 4.25, 3.25, 4.12, 4.12, 1.88, 1.75, 3.12, 4.12, 1.62, 1.75,
       4.12, 1.0, 1.88, 3.12, 4.25, 4.38, 4.12, 4.12, 1.75, 4.0, 1.0, 4.0], dtype=object)

## Weight Skills by Importance of Task and Frequency of Task

In [571]:
# Importance:
# Task Ratings holds several rating scales per (occupation, task); the 'IM'
# rows are the task importance values.
task_rates = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Task Ratings.txt'), sep='\t')

# Filter and rename in one expression. rename() returns a new frame, so nothing
# is modified in place on a slice of task_rates (the original in-place rename
# triggered pandas' SettingWithCopyWarning).
task_im = task_rates[task_rates['Scale ID'] == 'IM'].rename(columns={'Data Value': 'Task IM'})

cols = ['O*NET-SOC Code', 'Task ID', 'Task IM']
by = ['O*NET-SOC Code', 'Task ID']   # (occupation, task) key, reused by later cells
task_im = task_im[cols].sort_values(by)
task_im.head()


Unnamed: 0,O*NET-SOC Code,Task ID,Task IM
7,11-1011.00,8823,4.54
52,11-1011.00,8824,4.15
25,11-1011.00,8825,4.4
34,11-1011.00,8826,4.39
43,11-1011.00,8827,4.17


In [573]:
# Total task importance within each occupation.
task_im_sum = (task_im.groupby('O*NET-SOC Code', as_index=False).sum()
                      .rename(columns={'Task IM': 'IM Sum per Occu'})
                      .loc[:, ['O*NET-SOC Code', 'IM Sum per Occu']])

# Give every task its occupation total, then express the task's importance as
# a share of that total.
task_ims = task_im.merge(task_im_sum, how='left', on='O*NET-SOC Code')
task_ims['Task IM Norm'] = task_ims['Task IM'].div(task_ims['IM Sum per Occu'])
task_ims

Unnamed: 0,O*NET-SOC Code,Task ID,Task IM,IM Sum per Occu,Task IM Norm
0,11-1011.00,8823,4.54,118.75,0.038232
1,11-1011.00,8824,4.15,118.75,0.034947
2,11-1011.00,8825,4.40,118.75,0.037053
3,11-1011.00,8826,4.39,118.75,0.036968
4,11-1011.00,8827,4.17,118.75,0.035116
5,11-1011.00,8828,4.25,118.75,0.035789
6,11-1011.00,8829,3.95,118.75,0.033263
7,11-1011.00,8830,3.82,118.75,0.032168
8,11-1011.00,8831,4.48,118.75,0.037726
9,11-1011.00,8832,4.02,118.75,0.033853


In [574]:
# task_cats = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Task Categories.txt'), sep='\t')
# Frequency-of-task ('FT') rows of the task ratings. `.copy()` makes this an
# independent frame, so the new columns below are not assigned into a slice of
# task_rates (pandas SettingWithCopyWarning).
task_freq = task_rates[task_rates['Scale ID'] == 'FT'].copy()

# Manually change Frequency Categories into Numeric value per Day
time_categories = {'1' : (1/365.),  # Yearly or less
                   '2' : (4/365.),  # More than yearly
                   '3' : (3/12.),   # More than monthly
                   '4' : (3/7.),    # More than weekly
                   '5' : 1.,        # Daily
                   '6' : 3.,        # Several times daily
                   '7' : 8.}        # Hourly or more

# only used relative to each other to calculate relative frequency

# Depending on the pandas version, 'Category' is parsed as text ('1') or as a
# number (1 / 1.0); normalise each value to the string key used above so the
# lookup works either way.
category_keys = [str(int(float(category))) for category in task_freq['Category'].values]
task_freq['Temp'] = [time_categories[key] for key in category_keys]

# Per-category contribution: times-per-day weight multiplied by the rating for
# that category (Data Value looks like a percentage of respondents - TODO confirm).
task_freq['Freq'] = task_freq['Temp'] * task_freq['Data Value']
task_freq.head()

Unnamed: 0,O*NET-SOC Code,Task ID,Scale ID,Category,Data Value,N,Standard Error,Lower CI Bound,Upper CI Bound,Recommend Suppress,Date,Domain Source,Temp,Freq
0,11-1011.00,8823,FT,1,4.34,79,2.48,1.36,12.96,N,07/2014,Incumbent,0.00274,0.01189
1,11-1011.00,8823,FT,2,9.16,79,3.86,3.86,20.24,N,07/2014,Incumbent,0.010959,0.100384
2,11-1011.00,8823,FT,3,11.04,79,3.44,5.82,19.95,N,07/2014,Incumbent,0.25,2.76
3,11-1011.00,8823,FT,4,16.19,79,4.37,9.24,26.83,N,07/2014,Incumbent,0.428571,6.938571
4,11-1011.00,8823,FT,5,46.67,79,6.03,35.07,58.64,N,07/2014,Incumbent,1.0,46.67


In [575]:
# Collapse the per-category rows of each (occupation, task) pair into a single
# mean frequency. numeric_only=True keeps the text columns (Scale ID, Date,
# Domain Source, ...) out of the aggregation explicitly; newer pandas raises on
# them instead of silently dropping them.
task_freq = task_freq.groupby(by).mean(numeric_only=True).reset_index()
task_freq = task_freq.rename(columns={'Freq': 'Task Freq'})
cols = by + ['Task Freq']
task_freq = task_freq[cols]

# Total frequency per occupation. Only 'Task Freq' is summed (adding up task
# ids would be meaningless).
task_freq_norm = task_freq.groupby('O*NET-SOC Code')['Task Freq'].sum().reset_index()
task_freq_norm = task_freq_norm.rename(columns={'Task Freq': 'Sum per Occu'})

# Each task's frequency as a share of its occupation's total.
task_freqs = pd.merge(task_freq, task_freq_norm, how='left', on='O*NET-SOC Code')
task_freqs['Task Freq Norm'] = task_freqs['Task Freq'] / task_freqs['Sum per Occu']
# task_freqs.groupby('O*NET-SOC Code')['Task Freq Norm'].sum() # check they sum to 1 :)

task_freqs[['O*NET-SOC Code', 'Task ID', 'Task Freq', 'Task Freq Norm']]

Unnamed: 0,O*NET-SOC Code,Task ID,Task Freq,Task Freq Norm
0,11-1011.00,8823,17.221549,0.071888
1,11-1011.00,8824,17.148012,0.071581
2,11-1011.00,8825,15.445267,0.064473
3,11-1011.00,8826,18.354019,0.076616
4,11-1011.00,8827,3.221570,0.013448
5,11-1011.00,8828,34.402920,0.143609
6,11-1011.00,8829,6.470654,0.027011
7,11-1011.00,8830,8.451864,0.035281
8,11-1011.00,8831,12.226429,0.051037
9,11-1011.00,8832,11.255931,0.046986


In [644]:
# Scratch check on three example occupations.
example_codes = ['11-1031.00', '11-1011.00', '15-2091.00']
df8 = df3[df3['O*NET-SOC Code'].isin(example_codes)]

# Does occupation 15-2091.00 have both frequency and importance ratings?
# Compare for equality: str.contains() treats its argument as a regular
# expression ('.' matches any character) and also matches substrings.
check_code = '15-2091.00'
(task_freq['O*NET-SOC Code'] == check_code).any() and (task_ims['O*NET-SOC Code'] == check_code).any()
    
# if skills_pivot[skills_pivot['O*NET-SOC Code'] == '11-1031.00'].shape[0] == 0:
#     print "h"
#     pass


{'IM/Freq': set(), 'skill_vec': set()}

In [647]:
# A defensible method for weighting Importance and Frequency of Skills vector per Occu
#
# For every (task, DWA) row of df3 the occupation's skill vector is multiplied
# by the task's normalised frequency and normalised importance. Each result row
# is [Task ID (as text), DWA ID, weighted skill values...].
#
# Rows that cannot be weighted are skipped and their occupation code is
# recorded in problem_onet_codes:
#   "skill_vec" - the occupation has no row in skills_pivot
#   "IM/Freq"   - the task does not have exactly one normalised frequency and
#                 one normalised importance value
problem_onet_codes = {"skill_vec": set([]), "IM/Freq": set([])}

# Build each lookup once instead of scanning a whole frame for every row.
skill_vec_by_code = dict(zip(skills_pivot['O*NET-SOC Code'], skills_pivot.values[:, 1:]))
freq_by_task = {task_id: grp.values
                for task_id, grp in task_freqs.groupby('Task ID')['Task Freq Norm']}
imp_by_task = {task_id: grp.values
               for task_id, grp in task_ims.groupby('Task ID')['Task IM Norm']}

# Rows are collected in a list and converted once at the end. Growing an array
# with np.append copies it on every iteration, and the bare `except:` that used
# to initialise it would silently throw away all earlier rows on any error.
weighted_rows = []
for (cnt, row) in df3.iterrows():
    if cnt % 1000 == 0:
        print(cnt)   # progress marker

    onet_code = row['O*NET-SOC Code']
    skill_vec = skill_vec_by_code.get(onet_code)
    if skill_vec is None:
        problem_onet_codes["skill_vec"].add(onet_code)
        continue

    freq = freq_by_task.get(row['Task ID'])
    imp = imp_by_task.get(row['Task ID'])
    if freq is None or imp is None or len(freq) != 1 or len(imp) != 1:
        problem_onet_codes["IM/Freq"].add(onet_code)
        continue

    vec = skill_vec * freq[0] * imp[0]
    # Task ID is stored as text, as before, so later numeric aggregations treat
    # it as a label rather than a quantity.
    weighted_rows.append([str(row['Task ID']), row['DWA ID']] + list(vec))

weighted_vecs = np.array(weighted_rows, dtype=object) if weighted_rows else np.array([])
weighted_vecs.shape

0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000

(22365, 37)




In [648]:
# Occupation codes whose rows were skipped by the weighting loop, grouped by
# reason: "skill_vec" = no skill vector found for the occupation, "IM/Freq" =
# the frequency/importance weighting could not be computed.
problem_onet_codes

{'IM/Freq': {'11-3071.02',
  '11-9199.01',
  '11-9199.02',
  '11-9199.04',
  '13-1041.07',
  '13-2099.02',
  '15-1143.00',
  '15-1199.05',
  '15-2091.00',
  '17-2011.00',
  '17-2051.01',
  '17-2161.00',
  '17-2199.02',
  '17-2199.04',
  '17-2199.05',
  '17-2199.06',
  '17-2199.07',
  '17-2199.08',
  '17-3024.01',
  '17-3029.02',
  '17-3029.03',
  '17-3029.04',
  '17-3029.05',
  '17-3029.06',
  '17-3029.07',
  '17-3029.09',
  '19-2042.00',
  '19-3099.01',
  '19-4041.01',
  '19-4051.01',
  '19-4099.03',
  '41-3031.03',
  '41-3099.01',
  '43-5011.01',
  '47-2061.00',
  '49-3023.02',
  '51-9061.00',
  '51-9151.00'},
 'skill_vec': {'11-1031.00',
  '11-2011.01',
  '11-3051.05',
  '13-2099.01',
  '13-2099.03',
  '15-1199.07',
  '17-3029.10',
  '17-3029.11',
  '25-2051.00',
  '51-8099.02'}}

In [649]:
# Wrap the weighted vectors in a DataFrame: two identifier columns followed by
# one column per skill, named after the skills_pivot columns.
task_cols = ['Task ID', 'DWA ID']
skill_column_names = list(skills_pivot.columns[1:])

df4 = pd.DataFrame(data=weighted_vecs)
df4.columns = np.append(task_cols, skill_column_names)

# Convert the skill columns to a numeric dtype so they can be aggregated
# (presumably they arrive as generic objects - verify with df4.dtypes).
numeric_skills = df4[skill_column_names].apply(pd.to_numeric)
df4[skill_column_names] = numeric_skills
df4.head()

Unnamed: 0,Task ID,DWA ID,Active Learning IM,Active Listening IM,Complex Problem Solving IM,Coordination IM,Critical Thinking IM,Equipment Maintenance IM,Equipment Selection IM,Installation IM,...,Science IM,Service Orientation IM,Social Perceptiveness IM,Speaking IM,Systems Analysis IM,Systems Evaluation IM,Technology Design IM,Time Management IM,Troubleshooting IM,Writing IM
0,1,4.A.4.a.8.I03.D05,0.014456,0.01542,0.014456,0.014957,0.014957,0.003855,0.003855,0.003855,...,0.006245,0.014957,0.014957,0.01542,0.013955,0.013955,0.006746,0.013492,0.003855,0.013955
1,2,4.A.1.a.1.I14.D04,0.013251,0.014134,0.013251,0.01371,0.01371,0.003534,0.003534,0.003534,...,0.005724,0.01371,0.01371,0.014134,0.012791,0.012791,0.006184,0.012367,0.003534,0.012791
2,3,4.A.4.b.4.I12.D03,0.068894,0.073487,0.068894,0.071282,0.071282,0.018372,0.018372,0.018372,...,0.029762,0.071282,0.071282,0.073487,0.066505,0.066505,0.03215,0.064301,0.018372,0.066505
3,4,4.A.2.b.4.I01.D06,0.009293,0.009913,0.009293,0.009616,0.009616,0.002478,0.002478,0.002478,...,0.004015,0.009616,0.009616,0.009913,0.008971,0.008971,0.004337,0.008674,0.002478,0.008971
4,5,4.A.2.a.4.I11.D06,0.013319,0.014207,0.013319,0.013781,0.013781,0.003552,0.003552,0.003552,...,0.005754,0.013781,0.013781,0.014207,0.012857,0.012857,0.006216,0.012431,0.003552,0.012857


In [650]:
# Take the mean average over occupations DWAs: one mean weighted skill vector
# per DWA, averaged over every task row mapped to it.
# 'Task ID' is an identifier, not a quantity, so it is dropped explicitly
# before aggregating. Older pandas silently discarded it as a non-numeric
# column; newer pandas raises instead.
df5 = df4.drop('Task ID', axis=1).groupby(['DWA ID'], as_index=False).mean()
df5.head()

Unnamed: 0,DWA ID,Active Learning IM,Active Listening IM,Complex Problem Solving IM,Coordination IM,Critical Thinking IM,Equipment Maintenance IM,Equipment Selection IM,Installation IM,Instructing IM,...,Science IM,Service Orientation IM,Social Perceptiveness IM,Speaking IM,Systems Analysis IM,Systems Evaluation IM,Technology Design IM,Time Management IM,Troubleshooting IM,Writing IM
0,4.A.1.a.1.I01.D01,0.010513,0.012409,0.010969,0.010754,0.011459,0.004558,0.005109,0.00332,0.009242,...,0.004461,0.009614,0.01055,0.012024,0.009141,0.008584,0.006075,0.011338,0.005037,0.010107
1,4.A.1.a.1.I01.D02,0.009477,0.01059,0.00951,0.010108,0.010157,0.003004,0.003264,0.003017,0.010441,...,0.003917,0.009089,0.009986,0.009903,0.008578,0.009,0.004803,0.009341,0.003583,0.009081
2,4.A.1.a.1.I01.D03,0.003006,0.003326,0.00286,0.003215,0.003174,0.00103,0.001808,0.00103,0.002679,...,0.00146,0.002744,0.003091,0.003438,0.002511,0.002452,0.001669,0.003152,0.001781,0.002903
3,4.A.1.a.1.I01.D04,0.012529,0.015463,0.012163,0.01374,0.013939,0.004233,0.004317,0.004007,0.011859,...,0.005774,0.010475,0.014623,0.015724,0.008889,0.008774,0.005453,0.013552,0.004962,0.012234
4,4.A.1.a.1.I02.D01,0.010916,0.014029,0.010733,0.011965,0.012673,0.00392,0.004245,0.00392,0.008889,...,0.004969,0.012641,0.012358,0.013255,0.009056,0.008915,0.00587,0.012348,0.004303,0.013704


# GP This dataset using all skills