Map ONET Skills onto Tasks via Occupations Dataset
---

By Paul Duckworth 17th Nov 2017.

Create a skills/abilities vector per (DWA) task from ONET datasets: Skills, Knowledge, Abilities, Occupations, Tasks, DWAs


In [1]:
import os
import numpy as np
import pandas as pd
import getpass
import cPickle as pickle
from random import shuffle
%matplotlib inline

# Point this at your ONET data: 
datasets = '/home/'+ getpass.getuser() +'/Datasets/'
print datasets

survey_data_ = pd.read_csv(os.path.join(datasets, 'FoEmployment/fow-expert-survey/data/cleaned/counts_data_with_metadata.csv'))


/home/scpd/Datasets/


# ONET Datasets:

## Occupations and Tasks 

In [2]:
# Occupation reference table, with the 'Title' column exposed as 'Occupation title'.
occupation_path = os.path.join(datasets, 'ONET/databases/db2016/Occupation Data.txt')
occupations = pd.read_table(occupation_path, sep='\t').rename(columns={'Title': 'Occupation title'})
occupations.head()

Unnamed: 0,O*NET-SOC Code,Occupation title,Description
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce or enact laws and statutes ..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."


In [3]:
tasks = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Task Statements.txt'), sep='\t')
tasks = tasks[['O*NET-SOC Code', 'Task ID', 'Task']]

reduce_tasks = tasks['O*NET-SOC Code'].unique()#[:2]
tasks = tasks[tasks['O*NET-SOC Code'].isin(reduce_tasks)]      # reduce the task matrix for now :)
print tasks.shape, "UNIQUE tasks: ", len(tasks['Task ID'].unique())  # All tasks are unique to Occupation it seems
tasks.head()

(19566, 3) UNIQUE tasks:  19566


Unnamed: 0,O*NET-SOC Code,Task ID,Task
0,11-1011.00,8823,Direct or coordinate an organization's financi...
1,11-1011.00,8831,Appoint department heads or managers and assig...
2,11-1011.00,8825,Analyze operations to evaluate performance of ...
3,11-1011.00,8826,"Direct, plan, or implement policies, objective..."
4,11-1011.00,8827,"Prepare budgets for approval, including those ..."


In [50]:
#Task DWAs (detailed work activitiy code):
taskDWA = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Tasks to DWAs.txt'), sep='\t')
taskDWA = taskDWA[['Task ID', 'DWA ID']]

print taskDWA.shape, "UNIQUE DWA: ", len(taskDWA['DWA ID'].unique()) 
taskDWA.head()

(22838, 2) UNIQUE DWA:  2070


Unnamed: 0,Task ID,DWA ID
0,20461,4.A.2.a.4.I09.D03
1,20461,4.A.4.b.6.I08.D04
2,8823,4.A.4.b.4.I09.D02
3,8824,4.A.4.a.2.I03.D14
4,8825,4.A.2.a.4.I07.D09


In [51]:
df = pd.merge(tasks, taskDWA,  how='left', left_on=['Task ID'], right_on = ['Task ID']).sort_values(by = 'Task ID')
df = df[df['DWA ID'].notnull()]
df['IWA ID'] = df['DWA ID'].str.slice(0,-4)    # create IWA ID
df['WA ID'] = df['DWA ID'].str.slice(0,-8)     # create WA ID

## ADD DWA and IWA titles:
DWAref = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/DWA Reference.txt'), sep='\t')[['DWA ID', 'DWA Title']]
taskDWA2 = pd.merge(df, DWAref,  how='left', left_on=['DWA ID'], right_on = ['DWA ID'])

IWAref = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/IWA Reference.txt'), sep='\t')[['IWA ID', 'IWA Title']]
df3 = pd.merge(df2, IWAref,  how='left', left_on=['IWA ID'], right_on = ['IWA ID'])

# cols = ['Task ID', 'Task', 'DWA ID', 'DWA Title', 'IWA ID', 'IWA Title', 'WA ID']
cols = ['O*NET-SOC Code', 'Task ID', 'DWA ID', 'IWA ID', 'WA ID']
df3 = df3[cols]

# tasks are many-to-many with DWA, e.g. task id=8826.
print df3.shape, "UNIQUE DWA: ", len(df3['DWA ID'].unique())
df3.head()

(22838, 5) UNIQUE DWA:  2070


Unnamed: 0,O*NET-SOC Code,Task ID,DWA ID,IWA ID,WA ID
0,11-2022.00,1,4.A.4.a.8.I03.D05,4.A.4.a.8.I03,4.A.4.a.8
1,11-2022.00,2,4.A.1.a.1.I14.D04,4.A.1.a.1.I14,4.A.1.a.1
2,11-2022.00,3,4.A.4.b.4.I12.D03,4.A.4.b.4.I12,4.A.4.b.4
3,11-2022.00,4,4.A.2.b.4.I01.D06,4.A.2.b.4.I01,4.A.2.b.4
4,11-2022.00,5,4.A.2.a.4.I11.D06,4.A.2.a.4.I11,4.A.2.a.4


In [156]:
# Task Importance: (Each task is unique to it's occupation)
task_rates = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Task Ratings.txt'), sep='\t')
task_im = task_rates[task_rates['Scale ID'] == 'IM']
task_im.rename(columns = {'Data Value':'Task IM'}, inplace = True)

task_im = task_im[['Task ID', 'Task IM']].sort_values('Task ID')
print task_im.shape

df4 = pd.merge(df3, task_im, how='left', left_on=['Task ID'], right_on = ['Task ID'])
print df4.shape

# Remove Task if no Task Importance: 
df4 = df4[df4['Task IM'].notnull()]   # 500 missing Task IM 

by = ['DWA ID']
task_im_by_dwa = df4.groupby(by).sum().reset_index()[['DWA ID','Task IM']]
task_im_by_dwa.rename(columns = {'Task IM' : 'IM per DWA Sum'}, inplace = True)

df5 = pd.merge(df4, task_im_by_dwa,  how='left', left_on=['DWA ID'], right_on = ['DWA ID'])
df5['Task IM Norm'] = df5['Task IM'] / df5['IM per DWA Sum']
df5.head()

(19125, 2)
(22838, 6)


Unnamed: 0,O*NET-SOC Code,Task ID,DWA ID,IWA ID,WA ID,Task IM,IM per DWA Sum,Task IM Norm
0,11-2022.00,1,4.A.4.a.8.I03.D05,4.A.4.a.8.I03,4.A.4.a.8,4.09,84.74,0.048265
1,11-2022.00,2,4.A.1.a.1.I14.D04,4.A.1.a.1.I14,4.A.1.a.1,3.57,33.92,0.105248
2,11-2022.00,3,4.A.4.b.4.I12.D03,4.A.4.b.4.I12,4.A.4.b.4,4.22,56.68,0.074453
3,11-2022.00,4,4.A.2.b.4.I01.D06,4.A.2.b.4.I01,4.A.2.b.4,3.61,28.76,0.125522
4,11-2022.00,5,4.A.2.a.4.I11.D06,4.A.2.a.4.I11,4.A.2.a.4,3.95,17.99,0.219566


## Skills by Occupations

In [332]:
skills = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Skills.txt'), sep='\t', low_memory=False)
skills.head()
skills = skills[skills['Scale ID'] == 'IM']
# skills['Element_pivot'] =  skills['Element Name'] + " " +  skills['Scale ID']
skills_pivot = skills.pivot(index = 'O*NET-SOC Code', columns='Element Name', values='Data Value').fillna(0)
skills_pivot.reset_index(inplace=True)
print skills_pivot.shape
skills_pivot.head()

(964, 36)


Element Name,O*NET-SOC Code,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,...,Science,Service Orientation,Social Perceptiveness,Speaking,Systems Analysis,Systems Evaluation,Technology Design,Time Management,Troubleshooting,Writing
0,11-1011.00,4.0,4.12,4.38,4.25,4.38,1.0,1.12,1.0,3.12,...,1.88,3.12,4.25,4.38,4.12,4.12,1.75,4.0,1.0,4.0
1,11-1011.03,3.5,3.88,4.0,3.62,4.0,1.0,1.12,1.0,3.25,...,1.75,3.25,3.75,4.0,3.62,3.62,1.62,3.38,1.12,3.88
2,11-1021.00,3.5,4.0,3.5,4.0,3.88,1.0,1.0,1.0,3.12,...,1.88,3.25,4.0,4.0,3.0,3.0,1.88,3.75,2.0,3.25
3,11-2011.00,3.25,4.0,3.5,3.5,3.75,1.0,1.25,1.0,2.88,...,1.5,3.12,4.0,4.0,3.12,3.0,1.62,3.88,1.12,3.75
4,11-2021.00,3.88,3.88,3.62,3.5,3.88,1.0,1.0,1.0,3.0,...,1.75,3.12,3.88,3.88,3.25,3.5,1.75,3.5,1.0,3.25


In [7]:
# WA['Element_pivot'] =  WA['Element Name'] + " " +  WA['Scale ID']
# WA_pivot = WA.pivot(index = 'O*NET-SOC Code', columns='Element_pivot', values='Data Value').fillna(0)
# WA_pivot.reset_index(inplace=True)

# WA_pivot

# occupation_level_skills_wa = pd.merge(df_skills, WA_pivot,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])
# occupation_level_skills_wa = occupation_level_skills_wa.sort_values(by = 'Observed Occupation')
# occupation_level_skills_wa

In [333]:
know = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Knowledge.txt'), sep='\t', low_memory=False)
know = know[know['Scale ID'] == 'IM']
# know['Element_pivot'] =  know['Element Name'] + " " +  know['Scale ID']
know_pivot = know.pivot(index = 'O*NET-SOC Code', columns='Element Name', values='Data Value').fillna(0)
know_pivot.reset_index(inplace=True)
print know_pivot.shape
know_pivot.head()

(964, 34)


Element Name,O*NET-SOC Code,Administration and Management,Biology,Building and Construction,Chemistry,Clerical,Communications and Media,Computers and Electronics,Customer and Personal Service,Design,...,Philosophy and Theology,Physics,Production and Processing,Psychology,Public Safety and Security,Sales and Marketing,Sociology and Anthropology,Telecommunications,Therapy and Counseling,Transportation
0,11-1011.00,4.75,1.34,2.11,1.43,2.66,2.7,2.23,4.09,2.05,...,1.7,1.23,2.63,3.1,3.3,3.23,2.61,1.76,2.0,2.21
1,11-1011.03,3.85,2.44,3.69,2.36,2.58,2.84,2.65,3.62,3.72,...,1.85,2.48,2.23,2.88,2.4,3.5,2.38,1.58,1.23,2.42
2,11-1021.00,4.35,1.5,2.51,1.95,3.51,2.59,3.33,3.95,2.38,...,1.51,1.81,3.39,2.72,3.1,3.47,1.81,2.5,1.77,2.2
3,11-2011.00,4.11,1.11,1.12,1.09,3.1,4.33,3.43,3.79,2.94,...,1.47,1.06,3.12,2.63,1.78,3.88,1.73,3.06,1.28,1.61
4,11-2021.00,4.04,1.1,1.19,1.21,3.01,3.8,3.51,3.85,3.15,...,1.64,1.18,2.46,2.45,2.5,4.85,2.03,2.86,1.63,1.68


In [334]:
abilities = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Abilities.txt'), sep='\t', low_memory=False)
abilities = abilities[abilities['Scale ID'] == 'IM']
#abilities['Element_pivot'] =  abilities['Element Name'] + " " +  abilities['Scale ID']
abilities_pivot = abilities.pivot(index = 'O*NET-SOC Code', columns='Element Name', values='Data Value').fillna(0)
abilities_pivot.reset_index(inplace=True)
print abilities_pivot.shape
abilities_pivot.head()


(964, 53)


Element Name,O*NET-SOC Code,Arm-Hand Steadiness,Auditory Attention,Category Flexibility,Control Precision,Deductive Reasoning,Depth Perception,Dynamic Flexibility,Dynamic Strength,Explosive Strength,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,11-1011.00,1.0,2.12,3.5,1.75,4.12,1.75,1.0,1.0,1.0,...,1.0,1.0,1.0,3.0,1.0,1.88,3.12,1.0,4.25,4.12
1,11-1011.03,1.0,1.88,3.38,1.75,4.0,2.0,1.0,1.0,1.0,...,1.0,1.0,1.0,2.62,1.12,2.0,2.75,1.12,4.0,3.88
2,11-1021.00,2.0,2.12,3.0,1.75,3.75,2.0,1.0,1.62,1.5,...,1.5,2.0,2.0,2.88,2.12,2.0,2.75,1.38,4.0,4.0
3,11-2011.00,1.88,1.88,3.38,1.5,3.88,1.88,1.0,1.0,1.0,...,1.0,1.0,1.0,2.75,1.25,2.88,3.0,1.25,3.88,3.88
4,11-2021.00,1.12,1.88,3.25,1.0,3.88,1.75,1.0,1.25,1.0,...,1.0,1.0,1.0,2.75,1.75,2.88,3.0,1.62,4.0,3.88



# Skills by Occupations and Tasks 

In [145]:
#access the occupation skills vector like this:
# skills_pivot[skills_pivot['O*NET-SOC Code'] == '11-1011.00'].values[0][1:]

## Weight Skills by Importance of Task and Frequency of Task

In [148]:
# # Do Not Normalise by Occupation (rather by DWA - above)
# task_im_sum = task_im.groupby('O*NET-SOC Code').sum().reset_index()
# task_im_sum.rename(columns = {'Task IM': 'IM Sum per Occu'}, inplace= True)
# task_im_sum = task_im_sum[['O*NET-SOC Code', 'IM Sum per Occu']]

# task_ims = pd.merge(task_im, task_im_sum,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])
# task_ims['Task IM Norm'] = task_ims['Task IM'] / task_ims['IM Sum per Occu'] 
# task_ims

In [158]:
# # Do Not Use Frequency: Bit hacky.

# Frequency-of-task ('FT') rows of the task ratings loaded earlier.
# .copy() makes task_freq an independent frame, so the two columns added below are not
# written onto a slice of task_rates (which triggers SettingWithCopyWarning).
task_freq = task_rates[task_rates['Scale ID'] == 'FT'].copy()

# Manually change Frequency Categories into Numeric value per Day ## A bit hacky :)
time_categories = {'1' : (1/365.),  # Yearly or less
                   '2' : (4/365.),  # More than yearly
                   '3' : (3/12.),   # More than monthly
                   '4' : (3/7.),    # More than weekly
                   '5' : 1.,            # Daily
                   '6' : 3.,            # Several times daily
                   '7' : 8.}            # Hourly or more


def _times_per_day(category):
    """Return the per-day rate for an FT category given as '1', 1 or 1.0.

    The Category column is read as strings by some pandas versions and as floats by
    others (when the file's "n/a" entries are parsed as NaN), so the value is
    normalised to the string keys of `time_categories` before the lookup.
    """
    return float(time_categories[str(int(float(category)))])


# frequency is only relative:
# 'Temp' is the per-day rate of the row's category; 'Freq' scales it by the row's Data Value.
# NOTE(review): Data Value is presumably the share of respondents who chose that
# category - confirm against the ONET data dictionary.
task_freq['Temp'] = [_times_per_day(c) for c in task_freq['Category'].values]
task_freq['Freq'] = task_freq['Temp'] * task_freq['Data Value']

In [159]:
# NOTE: this cell rebinds task_freq, so it needs the previous cell re-run before it can
# be executed a second time (the 'Freq' column is gone afterwards).
by = ['O*NET-SOC Code', 'Task ID']
cols = by + ['Task Freq']

# Mean 'Freq' over the category rows of each (occupation, task) pair.
task_freq = (task_freq.groupby(by)['Freq'].mean()
                      .reset_index()
                      .rename(columns={'Freq': 'Task Freq'}))
task_freq = task_freq[cols]

# Total task frequency per occupation, used as the normaliser.
task_freq_norm = (task_freq.groupby('O*NET-SOC Code')['Task Freq'].sum()
                           .reset_index()
                           .rename(columns={'Task Freq': 'Sum per Occu'}))

# 'Task Freq Norm' is each task's share of its occupation's total frequency.
task_freqs = pd.merge(task_freq, task_freq_norm, how='left', on='O*NET-SOC Code')
task_freqs['Task Freq Norm'] = task_freqs['Task Freq'] / task_freqs['Sum per Occu']
# task_freqs.groupby('O*NET-SOC Code').sum() # check they sum to 1 :)
# task_freqs[['O*NET-SOC Code', 'Task ID', 'Task Freq', 'Task Freq Norm']]

In [335]:
# print skills_pivot.shape[1] + know_pivot.shape[1] + abilities_pivot.shape[1]
all_features = skills_pivot.merge(know_pivot,on='O*NET-SOC Code').merge(abilities_pivot,on='O*NET-SOC Code')
print all_features.shape
all_features.head()

(964, 121)


Element Name,O*NET-SOC Code,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,11-1011.00,4.0,4.12,4.38,4.25,4.38,1.0,1.12,1.0,3.12,...,1.0,1.0,1.0,3.0,1.0,1.88,3.12,1.0,4.25,4.12
1,11-1011.03,3.5,3.88,4.0,3.62,4.0,1.0,1.12,1.0,3.25,...,1.0,1.0,1.0,2.62,1.12,2.0,2.75,1.12,4.0,3.88
2,11-1021.00,3.5,4.0,3.5,4.0,3.88,1.0,1.0,1.0,3.12,...,1.5,2.0,2.0,2.88,2.12,2.0,2.75,1.38,4.0,4.0
3,11-2011.00,3.25,4.0,3.5,3.5,3.75,1.0,1.25,1.0,2.88,...,1.0,1.0,1.0,2.75,1.25,2.88,3.0,1.25,3.88,3.88
4,11-2021.00,3.88,3.88,3.62,3.5,3.88,1.0,1.0,1.0,3.0,...,1.0,1.0,1.0,2.75,1.75,2.88,3.0,1.62,4.0,3.88


In [336]:
cols = ['O*NET-SOC Code', 'DWA ID', 'Task IM Norm']
df6 = pd.merge(df5[cols], all_features,  how='left', left_on=['O*NET-SOC Code'], right_on = ['O*NET-SOC Code'])

# df6[df6['DWA ID'] == '4.A.4.a.8.I03.D05']['Task IM Norm'].sum()  # Check the IM weights sum to 1 over DWAs
print df6.shape
df6.head()

(22365, 123)


Unnamed: 0,O*NET-SOC Code,DWA ID,Task IM Norm,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,11-2022.00,4.A.4.a.8.I03.D05,0.048265,3.75,4.0,3.75,3.88,3.88,1.0,1.0,...,1.12,1.0,1.0,2.62,1.75,1.88,2.38,1.5,4.0,3.88
1,11-2022.00,4.A.1.a.1.I14.D04,0.105248,3.75,4.0,3.75,3.88,3.88,1.0,1.0,...,1.12,1.0,1.0,2.62,1.75,1.88,2.38,1.5,4.0,3.88
2,11-2022.00,4.A.4.b.4.I12.D03,0.074453,3.75,4.0,3.75,3.88,3.88,1.0,1.0,...,1.12,1.0,1.0,2.62,1.75,1.88,2.38,1.5,4.0,3.88
3,11-2022.00,4.A.2.b.4.I01.D06,0.125522,3.75,4.0,3.75,3.88,3.88,1.0,1.0,...,1.12,1.0,1.0,2.62,1.75,1.88,2.38,1.5,4.0,3.88
4,11-2022.00,4.A.2.a.4.I11.D06,0.219566,3.75,4.0,3.75,3.88,3.88,1.0,1.0,...,1.12,1.0,1.0,2.62,1.75,1.88,2.38,1.5,4.0,3.88


In [337]:
## Weight each skills vector by the Task Importance:
def func(x):
    """Scale one feature column of df6 element-wise by df6['Task IM Norm']."""
    return np.asarray(x) * np.asarray(df6['Task IM Norm'])

# The first three columns of df6 are O*NET-SOC Code, DWA ID and Task IM Norm;
# everything after them is a feature column.
df7 = df6.iloc[:, 3:].apply(func)

In [338]:
# Put the DWA ID back next to the weighted feature columns (rows align on the shared index).
df8 = df6[['DWA ID']].join(df7, how='left')

In [339]:
df9 = df8.groupby(['DWA ID'], as_index=False).mean()
print df9.shape
df9.head()

(2067, 121)


Unnamed: 0,DWA ID,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,4.A.1.a.1.I01.D01,0.640012,0.760882,0.661468,0.650813,0.689551,0.272503,0.324083,0.211035,0.571916,...,0.216773,0.238842,0.255173,0.505297,0.296222,0.670489,0.706263,0.287415,0.754158,0.662799
1,4.A.1.a.1.I01.D02,0.770415,0.885816,0.780493,0.830887,0.856852,0.310548,0.344312,0.322658,0.762447,...,0.31094,0.359645,0.394422,0.646531,0.474791,0.532756,0.694413,0.473643,0.814686,0.805713
2,4.A.1.a.1.I01.D03,0.476539,0.535567,0.450439,0.52,0.508739,0.166667,0.277673,0.166667,0.431036,...,0.304242,0.353333,0.345064,0.378837,0.425406,0.548142,0.490909,0.318964,0.5,0.48
3,4.A.1.a.1.I01.D04,0.360029,0.433351,0.354674,0.392718,0.403872,0.123165,0.138594,0.119341,0.334015,...,0.160733,0.193622,0.190626,0.307669,0.217988,0.280556,0.338411,0.137912,0.424902,0.382535
4,4.A.1.a.1.I02.D01,0.92898,1.199915,0.914062,1.022204,1.080387,0.333333,0.360471,0.333333,0.75989,...,0.408454,0.454496,0.469035,0.871552,0.56202,0.65247,0.720315,0.639633,1.345932,1.246505


In [259]:
# # # Version 0.1 : 
# A defensible method for weighting Importance and Frequency of Skills vector per Occu 

# weighted_vecs = np.array([])
# weighted_vecs.shape
# problem_onet_codes = {"skill_vec":set([]), "IM/Freq" : set([]) }

# # try and get rid of this horrible loop. 
# for (cnt, row) in df3.iterrows():
#     if cnt%1000 ==0: print cnt,
    
#     #if task_freq['O*NET-SOC Code'].str.contains(row['O*NET-SOC Code']).any() and task_ims['O*NET-SOC Code'].str.contains(row['O*NET-SOC Code']).any() and skills_pivot['O*NET-SOC Code'].str.contains(row['O*NET-SOC Code']).any():
       
#     try: 
#         skill_vec = skills_pivot[skills_pivot['O*NET-SOC Code'] == row['O*NET-SOC Code']].values[0][1:]
#     except IndexError as e:
# #         print cnt, row['O*NET-SOC Code'], skills_pivot[skills_pivot['O*NET-SOC Code'] == row['O*NET-SOC Code']].values
#         problem_onet_codes["skill_vec"].add(row['O*NET-SOC Code'])
#         continue

#     freq = task_freqs[task_freqs['Task ID'] == row['Task ID']]['Task Freq Norm'].values
#     imp =  task_ims[task_ims['Task ID'] == row['Task ID']]['Task IM Norm'].values

#     try:
#         vec = skill_vec*freq*imp
#     except ValueError as e:
# #         print row['O*NET-SOC Code']
#         problem_onet_codes["IM/Freq"].add(row['O*NET-SOC Code'])
#         continue

#     task_info = np.append(row['Task ID'], row['DWA ID'])
#     weigted_skills = np.append(task_info, vec)

#     try: 
#         weighted_vecs = np.append(weighted_vecs, [weigted_skills], axis=0)
#     except:
#         weighted_vecs = np.array([weigted_skills])

# weighted_vecs.shape

In [260]:
# df4 = pd.DataFrame(data=weighted_vecs)
# task_cols = ['Task ID', 'DWA ID']

# skill_column_names = list(skills_pivot.columns[1:])
# df4.columns = np.append(task_cols, skill_column_names)
# skill_column_names
# df4[skill_column_names] = df4[skill_column_names].apply(pd.to_numeric)
# df4.head()

In [261]:
# # Take the mean average over occupations DWAs
# df5 = df4.groupby(['DWA ID'], as_index=False).mean()
# df5.head()

# GP the dataset using all skills

## Ground Truth Survey data:

In [340]:
survey_data_.rename(columns = {'title':'O*NET Occupation title', 
                              'Unnamed: 0': 'Task'}, inplace = True)

# Change Ordinal Data to Numeric - bit hacky
ratings = [4,3,2,1,0]
survey_data_['GT Rating'] = (survey_data_['Completely Automatable Today']*ratings[0] + survey_data_['Could be Mostly Automated Today (Human Still Needed)']*ratings[1] + survey_data_['Mostly Not Automatable Today (Human Does Most of It)']*ratings[2] + survey_data_['Not Automatable Today']*ratings[3] + survey_data_['Unsure']*ratings[4]) / survey_data_['Number of Responses']                
                
survey_data = survey_data_[['Task ID', 'GT Rating']].sort_values(by='Task ID')
print survey_data.count()

#Task DWAs (detailed work activitiy code):
taskDWA = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Tasks to DWAs.txt'), sep='\t')
taskDWA = taskDWA[['Task ID', 'DWA ID']]

print taskDWA.shape, "UNIQUE DWA: ", len(taskDWA['DWA ID'].unique()) 
task_dwa_rat = pd.merge(taskDWA, survey_data, how='left', left_on=['Task ID'], right_on = ['Task ID'])

DWA_mean_rating = task_dwa_rat.groupby(['DWA ID']).mean().reset_index().rename(columns = {'GT Rating':'DWA GT Rating'})
DWA_mean_rating = DWA_mean_rating[['DWA ID', 'DWA GT Rating']]

print "Unique DWAs Annotated = ", DWA_mean_rating[DWA_mean_rating['DWA GT Rating'].notnull()].shape

data = pd.merge(df9, DWA_mean_rating,  how='left', left_on=['DWA ID'], right_on = ['DWA ID'])
data.head()

Task ID      350
GT Rating    350
dtype: int64
(22838, 2) UNIQUE DWA:  2070
Unique DWAs Annotated =  (314, 2)


Unnamed: 0,DWA ID,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,...,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression,DWA GT Rating
0,4.A.1.a.1.I01.D01,0.640012,0.760882,0.661468,0.650813,0.689551,0.272503,0.324083,0.211035,0.571916,...,0.238842,0.255173,0.505297,0.296222,0.670489,0.706263,0.287415,0.754158,0.662799,2.333333
1,4.A.1.a.1.I01.D02,0.770415,0.885816,0.780493,0.830887,0.856852,0.310548,0.344312,0.322658,0.762447,...,0.359645,0.394422,0.646531,0.474791,0.532756,0.694413,0.473643,0.814686,0.805713,
2,4.A.1.a.1.I01.D03,0.476539,0.535567,0.450439,0.52,0.508739,0.166667,0.277673,0.166667,0.431036,...,0.353333,0.345064,0.378837,0.425406,0.548142,0.490909,0.318964,0.5,0.48,
3,4.A.1.a.1.I01.D04,0.360029,0.433351,0.354674,0.392718,0.403872,0.123165,0.138594,0.119341,0.334015,...,0.193622,0.190626,0.307669,0.217988,0.280556,0.338411,0.137912,0.424902,0.382535,2.153846
4,4.A.1.a.1.I02.D01,0.92898,1.199915,0.914062,1.022204,1.080387,0.333333,0.360471,0.333333,0.75989,...,0.454496,0.469035,0.871552,0.56202,0.65247,0.720315,0.639633,1.345932,1.246505,


In [341]:
# Column layout used for the unlabelled set: id, target, then every feature column.
id_and_target = ['DWA ID', 'DWA GT Rating']
cols = np.append(id_and_target, all_features.columns[1:])

# DWAs with a survey rating form the training set; the rest is the set to predict.
# NOTE(review): X keeps data's own column order and index (id first, target last),
# while test is re-indexed and re-ordered with `cols` - confirm that consumers select
# features by name rather than by position.
is_rated = data['DWA GT Rating'].notnull()
X = data[is_rated]
test = data[~is_rated].reset_index()[cols]
y = X['DWA GT Rating']

## Uncomment to see training dataset:
# X.head()

Unnamed: 0,DWA ID,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,...,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression,DWA GT Rating
0,4.A.1.a.1.I01.D01,0.640012,0.760882,0.661468,0.650813,0.689551,0.272503,0.324083,0.211035,0.571916,...,0.238842,0.255173,0.505297,0.296222,0.670489,0.706263,0.287415,0.754158,0.662799,2.333333
3,4.A.1.a.1.I01.D04,0.360029,0.433351,0.354674,0.392718,0.403872,0.123165,0.138594,0.119341,0.334015,...,0.193622,0.190626,0.307669,0.217988,0.280556,0.338411,0.137912,0.424902,0.382535,2.153846
11,4.A.1.a.1.I02.D08,0.165224,0.190542,0.1798,0.163266,0.191568,0.088022,0.098921,0.072981,0.148719,...,0.070115,0.076516,0.142507,0.08999,0.149631,0.173329,0.090978,0.201805,0.187325,3.4
12,4.A.1.a.1.I02.D09,0.117664,0.134487,0.125536,0.133918,0.138725,0.095034,0.103691,0.075705,0.109398,...,0.120694,0.133348,0.11419,0.140046,0.119192,0.141038,0.087943,0.128867,0.112453,2.470085
13,4.A.1.a.1.I02.D10,0.188119,0.222286,0.204664,0.210808,0.223678,0.142135,0.116065,0.081526,0.177424,...,0.153741,0.173827,0.186136,0.1706,0.187453,0.194365,0.136334,0.217571,0.204275,3.0


# Save Data to file

In [342]:
## Uncomment to pickle somewhere: 

# save_this = (X, test, y)
# file_name = 'tasks_by_skills_v3.p'
# path = os.path.join(datasets, 'FoEmployment/Analysis_of_ONET_Tasks', file_name)
# f = open(path, "w")

# pickle.dump(save_this, f)
# f.close()
# print "saved here: %s " % path

saved here: /home/scpd/Datasets/FoEmployment/Analysis_of_ONET_Tasks/tasks_by_skills_v3.p 
