Map each observed healthcare task onto an ONET DWA 
---

- Use word2vec to augment the observed task descriptions in order to propose the most similar DWAs (using a string-matching function against the Tasks within each DWA). 



By Paul Duckworth 8th Dec 2017.



## OBSERVED TASK DATA

In [1]:
import os
import numpy as np
import pandas as pd
import getpass
import matplotlib.pyplot as plt
import gensim
# import pymed
import time
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

datasets = '/home/'+ getpass.getuser() +'/Datasets/'
print datasets

/home/scpd/Datasets/


In [2]:
# Workbook holding the observed healthcare occupations and their tasks.
d1 = os.path.join(datasets, 'FoHealthcare/FOH Occupations Tasks Features Technology.xlsx')
excel_doc = pd.ExcelFile(d1)

# Read the task sheet, replace empty cells with "-" and give the occupation
# column the name used throughout the rest of the notebook.
dataset = (excel_doc.parse("Title, Tasks, Features")
                    .fillna("-")
                    .rename(columns={'Occupation title': 'Observed Occupation'}))

# The row label of each task doubles as its identifier.
dataset['Task ID'] = dataset.index

# Only these three columns are needed downstream.
data = dataset[['Observed Occupation', 'Task', 'Task ID']]
data.head()

Unnamed: 0,Observed Occupation,Task,Task ID
0,Administrator,Medical Coding,0
1,Administrator,Answering Phones,1
2,Administrator,Register new Patients,2
3,Administrator,Use Intellisense to OCR letters and pick out c...,3
4,Administrator,Child immunization targets in open exeter,4


In [3]:
# # Test dataset with added Task "context".
# d2 = os.path.join(datasets, 'FoHealthcare/expanded tasks descriptions_for matching DWAs.csv')
# data = pd.read_csv(d2)
# data.rename(columns = {'Occupation title':'Observed Occupation', 'Task keywords/context':'Context'}, inplace = True)
# data['Task ID'] = data.index
# data.head()

## Observed Vocabulary: 

In [4]:
# Word counter for the observed task descriptions; English stop words are dropped.
cv1 = CountVectorizer(stop_words='english') # options left disabled: max_df=0.95, min_df=2, max_features=n_features

# Each Task is represented by a vector of Words over vocabulary
# (observed_tf: one sparse row per task; observed_vocab: the word behind each column).
observed_tf = cv1.fit_transform(data['Task'].values)
observed_vocab = cv1.get_feature_names()

# Inspect the first task: its text, its sparse row, then the vocabulary size and first 100 words.
print data['Task'][0]
print observed_tf[0]
print len(observed_vocab), observed_vocab[:100]

Medical Coding
  (0, 48)	1
  (0, 152)	1
289 [u'accounting', u'address', u'addressing', u'admin', u'administer', u'advice', u'allergy', u'ambulance', u'answer', u'answering', u'appointment', u'approve', u'arise', u'assign', u'asthma', u'attend', u'audio', u'audit', u'authority', u'blood', u'bloodpressure', u'bloods', u'body', u'bood', u'book', u'bookable', u'building', u'called', u'canal', u'care', u'caretaking', u'case', u'cases', u'certain', u'cessation', u'changes', u'chatting', u'check', u'checking', u'checks', u'checkups', u'child', u'chronic', u'cleaning', u'clinical', u'clinics', u'clinicts', u'cloud', u'coding', u'colleagues', u'comment', u'communicate', u'conditions', u'conduct', u'connected', u'connecting', u'consult', u'consultation', u'counseling', u'cqrs', u'create', u'creating', u'data', u'dbs', u'declaration', u'deductions', u'desk', u'diagnostics', u'different', u'discuss', u'distchange', u'docmail', u'docman', u'doctors', u'document', u'documents', u'does', u'ear', u'ec

## ONET TASK DATA

In [5]:
# #Task DWAs (detailed work activitiy code):
taskDWA = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Tasks to DWAs.txt'), sep='\t')
DWArefs = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/DWA Reference.txt'), sep='\t')
DWA_sup = taskDWA[['Task ID', 'DWA ID']].merge(DWArefs[['DWA ID', 'IWA ID', 'DWA Title']], on=['DWA ID'])

# onet_tasks_dwa = onet_tasks[['Task ID', 'Task']].merge(DWA_sup, on=['Task ID'])\
#                                                          .sort_values(by='Task ID')\
#                                                          .reset_index().drop('index', axis=1)
# print "Merged Tasks = ", onet_tasks_dwa.shape

# Just ONET Tasks
onet_tasks = pd.read_table(os.path.join(datasets, 'ONET/databases/db2016/Task Statements.txt'), sep='\t')
print "all Tasks = ", onet_tasks.shape
onet_tasks = onet_tasks[['Task ID', 'Task']]

onet_tasks=onet_tasks.sort_values(by='Task ID').reset_index().drop('index', axis=1)
onet_tasks.head()

all Tasks =  (19566, 7)


Unnamed: 0,Task ID,Task
0,1,Resolve customer complaints regarding sales an...
1,2,Monitor customer preferences to determine focu...
2,3,Direct and coordinate activities involving sal...
3,4,Determine price schedules and discount rates.
4,5,Review operational records and reports to proj...


## ONET Vocabulary: 

In [6]:
# Word counter for the ONET task statements; English stop words are dropped.
cv2 = CountVectorizer(stop_words='english') # options left disabled: max_df=0.95, min_df=2, max_features=n_features

# Each Task is represented by a vector of Words over vocabulary
# (onet_tf: one sparse row per ONET task; onet_vocab: the word behind each column).
onet_tf = cv2.fit_transform(onet_tasks['Task'].values)
onet_vocab = cv2.get_feature_names()

# Inspect the first ONET task: its text, its sparse row, then the vocabulary size and first 100 words.
print onet_tasks['Task'][0]
print onet_tf[0]
print len(onet_vocab), onet_vocab[:100]

Resolve customer complaints regarding sales and service.
  (0, 9682)	1
  (0, 9399)	1
  (0, 8823)	1
  (0, 2125)	1
  (0, 2710)	1
  (0, 9083)	1
12085 [u'10', u'24', u'3d', u'4d', u'abandoned', u'abatement', u'abatements', u'abbreviations', u'abdominal', u'abilities', u'ability', u'ablation', u'able', u'abnormal', u'abnormalities', u'aboard', u'aboveground', u'abraders', u'abrading', u'abrasion', u'abrasions', u'abrasive', u'abrasives', u'abreast', u'abroad', u'abscesses', u'absence', u'absences', u'absenteeism', u'absorbers', u'absorbing', u'absorption', u'abstract', u'abstracting', u'abstracts', u'abundance', u'abuse', u'abused', u'academia', u'academic', u'academy', u'accelerant', u'accelerated', u'accelerator', u'accelerators', u'accenting', u'accept', u'acceptability', u'acceptable', u'acceptance', u'accepted', u'accepting', u'access', u'accessed', u'accesses', u'accessibility', u'accessible', u'accessing', u'accessories', u'accessory', u'accident', u'accidental', u'accidents', u'acco

## Model

- https://code.google.com/archive/p/word2vec/
- Pre-trained vectors for 3 million unique words and phrases, trained on roughly 100 billion words from a Google News dataset.



In [7]:
# Load the pre-trained Google News word2vec vectors that the similarity cell
# below needs. With the load commented out, `model` only existed if an earlier
# kernel session had created it, so the notebook could not run from a fresh
# kernel. The guard keeps re-running this cell cheap: the binary file is only
# read when `model` is not already defined.
location = '/home/'+ getpass.getuser() +'/Software/GoogleNews-vectors-negative300.bin'
try:
    model
except NameError:
    model = gensim.models.KeyedVectors.load_word2vec_format(location, binary=True)

## Pairwise similarity between the words of the two vocabularies:

In [11]:
word_sims = np.zeros([len(observed_vocab), len(onet_vocab)])
for cnt, word in enumerate(observed_vocab):
    print cnt, word,
    ss = []
    for j in onet_vocab:
        s = 0.0
        try:
            s = model.similarity(word, j)
            if s < 0:
                s = 0.0
        except:
            pass
        ss.append(s)       
    word_sims[cnt] = ss    

 0 accounting 1 address 2 addressing 3 admin 4 administer 5 advice 6 allergy 7 ambulance 8 answer 9 answering 10 appointment 11 approve 12 arise 13 assign 14 asthma 15 attend 16 audio 17 audit 18 authority 19 blood 20 bloodpressure 21 bloods 22 body 23 bood 24 book 25 bookable 26 building 27 called 28 canal 29 care 30 caretaking 31 case 32 cases 33 certain 34 cessation 35 changes 36 chatting 37 check 38 checking 39 checks 40 checkups 41 child 42 chronic 43 cleaning 44 clinical 45 clinics 46 clinicts 47 cloud 48 coding 49 colleagues 50 comment 51 communicate 52 conditions 53 conduct 54 connected 55 connecting 56 consult 57 consultation 58 counseling 59 cqrs 60 create 61 creating 62 data 63 dbs 64 declaration 65 deductions 66 desk 67 diagnostics 68 different 69 discuss 70 distchange 71 docmail 72 docman 73 doctors 74 document 75 documents 76 does 77 ear 78 ecg 79 email 80 emails 81 emis 82 emotional 83 enhanced 84 enter 85 ereferral 86 errors 87 examination 88 examinations 89 exeter 90 e

## Average Similarity between observed tasks and ONET tasks: 

In [12]:
print onet_tasks['Task'].values[0]
print np.mean(word_sims[0][onet_tf[0].indices])
print  data['Task'][0]

Resolve customer complaints regarding sales and service.
0.14237969786325141
Medical Coding


In [13]:
word_sims[[ 40, 111,  12,  20,  35,  33,  42, 126 , 16, 113]]  #.T[[0,10]]
print observed_tf[0].indices
print onet_tf[0].indices

[ 48 152]
[9682 9399 8823 2125 2710 9083]


In [14]:
print np.mean( word_sims[[ 5, 17]].T[[9682, 9399, 8823, 2125, 2710, 9083]])
sum(sum(word_sims[[ 5, 17]].T[[9682, 9399, 8823, 2125, 2710, 9083]])) / 12.

0.14785608171242753


0.1478560817124275

In [17]:
    
# onet_tasks_dwa = onet_tasks[['Task ID', 'Sims']].merge(DWA_sup, on=['Task ID'])\
#                                                         .sort_values(by='Task ID')\
#                                                         .reset_index().drop('index', axis=1)
        
# x = onet_tasks_dwa.groupby(['DWA ID', 'DWA Title']).mean().reset_index()[['DWA ID', 'DWA Title', 'Sims']].sort_values(by='Sims',ascending=False)[:10]
# x


In [15]:
## Output Format:
n_keep_dwas = 40 
columns = ['Occupation', 'Task ID', 'Task', 'DWA Title', 'Please Select', 'Relevance \nScore', 'DWA ID'] 
xls_path = os.path.join(datasets, 'FoHealthcare/recommended_DWA_matches.xlsx')
writer = pd.ExcelWriter(xls_path, engine='xlsxwriter')

## Big Loop over the observed Tasks
for task_cnt, obs_task in enumerate(observed_tf):  
#     if task_cnt > 2: continue
    
    print task_cnt, 
    
    ## Get rows corresponding to the observed task
    row_inds = obs_task.indices
    
    onet_similarities = []
    for onet_task in onet_tf:
        
        ## Get the columns corresponding to the words in the ONET tasks
        cols = onet_task.indices
        onet_similarities.append(np.mean(word_sims[row_inds].T[cols]))
        
    ## Merge (Overwrite) the similarity of the observed task onto the ONET dataframe
    onet_tasks['Sims'] = onet_similarities

    ## Merge DWA attributes on
    onet_tasks_dwa = onet_tasks[['Task ID', 'Sims']].merge(DWA_sup, on=['Task ID'])\
                                                            .sort_values(by='Task ID')\
                                                            .reset_index().drop('index', axis=1)

    ## Average Similarity over DWAs
    x = onet_tasks_dwa.groupby(['DWA ID', 'DWA Title']).mean()\
                                                    .reset_index()\
                                                    [['DWA ID', 'DWA Title', 'Sims']]\
                                                    .sort_values(by='Sims',ascending=False)[:n_keep_dwas]

    observed_task = data['Task'].values[task_cnt]
    observed_occu = data['Observed Occupation'].values[task_cnt]
    observed_id = data['Task ID'].values[task_cnt]
    
    print (observed_occu, observed_id, observed_task)
    ## Create the first output row: 
    ms = [(observed_occu, observed_id, observed_task, 
           x['DWA Title'].values[0], "-", x['Sims'].values[0], x['DWA ID'].values[0] )] 

    ## Create the subsequent output rows: 
    for cnt, (i, dwa_id, dwa, s) in enumerate(x.itertuples()):
        if cnt == 0: continue

        ms.extend([('-', '-', '-',
            x['DWA Title'].values[cnt], "-",  x['Sims'].values[cnt], 
            x['DWA ID'].values[cnt] )])

    ## Create a tab in the excel document 
    df_ = pd.DataFrame(data = np.array(ms), columns=columns)
    df_.to_excel(writer, '%s' % task_cnt)
    
    ## Format the Excel Sheet: 
    workbook  = writer.book
    format = workbook.add_format()
    format.set_text_wrap() # wraps text

    worksheet = writer.sheets['%s' % task_cnt]
    worksheet.set_row(0, 30)  # set the height of the first row
    worksheet.set_row(1, 70)  # set the height of the first row
    
    worksheet.set_column('A:A', 5, format)  #formats a column and specifies width
    worksheet.set_column('B:B', 20, format)
    worksheet.set_column('C:C', 10, format)
    worksheet.set_column('D:D', 45, format)
    worksheet.set_column('E:E', 60, format)
    worksheet.set_column('F:F', 10, None)
    worksheet.set_column('G:G', 10, None)
    worksheet.set_column('H:H', 10, None)
    
writer.save()
print "finished"

ImportError: No module named xlsxwriter

In [11]:
# list1, list2 = zip(*sorted(zip(matched_tasks[t], onet_tasks['Task'].values), reverse=True))