In [13]:
import numpy as np
import pandas as pd
import os

In [50]:
#Utility function to delete last 3 characters from a string
def deleteChars(str):
    """Return ``str`` with its last three characters removed.

    Works on anything that supports slicing; an input of three items or
    fewer yields an empty result of the same type.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept as-is
    # so that existing keyword callers keep working.
    allButLastThree = slice(None, -3)
    return str[allButLastThree]

#Inner join wrapper function on two datasets
def dfIntersection(dataframe1, dataframe2):
    """Inner-join two frames on their shared ``"Code"`` column.

    Parameters
    ----------
    dataframe1 : pandas.DataFrame
        Left-hand frame; its columns come first in the result.
    dataframe2 : pandas.DataFrame
        Right-hand frame.

    Returns
    -------
    pandas.DataFrame
        Only the rows whose ``"Code"`` value appears in both inputs.
    """
    return dataframe1.merge(dataframe2, how="inner", on=["Code"])

# Preprocessing + Combining Multiple Variables
def fetchOnetDB(variables, dataDir="data"):
    """Load several ONET variable CSVs and combine them into one table.

    Each file is read from ``dataDir``. A column named ``"Importance"`` or
    ``"Context"`` is renamed to the heading given for that file, and the
    ``"Level"`` column that accompanies ``"Importance"`` is dropped when
    present. The per-file tables are then inner-joined on
    ``["Code", "Occupation"]``.

    Parameters
    ----------
    variables : dict
        Maps CSV file name -> heading to use for that file's score column.
    dataDir : str, optional
        Folder holding the CSV files. Defaults to ``"data"``.

    Returns
    -------
    pandas.DataFrame
        ``"Code"`` and ``"Occupation"`` first, then one column per variable
        in iteration order. The last three characters of every ``"Code"``
        value are removed. An empty ``variables`` mapping yields an empty
        DataFrame.
    """
    # ``None`` marks "nothing loaded yet". Testing ``DataFrame.empty`` instead
    # would also be true for an empty join result and silently replace it
    # with the next file.
    onetDB = None
    for location, heading in variables.items():
        data = pd.read_csv(os.path.join(dataDir, location))
        if "Importance" in data.columns:
            data = data.rename(columns={"Importance": heading})
            # "Level" is not used downstream; tolerate files that lack it.
            data = data.drop(columns=["Level"], errors="ignore")
        if "Context" in data.columns:
            data = data.rename(columns={"Context": heading})
        if onetDB is None:
            onetDB = data
        else:
            onetDB = pd.merge(onetDB, data, on=["Code", "Occupation"])

    if onetDB is None:
        return pd.DataFrame()

    # Put the join keys first by name rather than by position, so the result
    # does not depend on the column order inside the CSV files.
    keyCols = ["Code", "Occupation"]
    valueCols = [col for col in onetDB.columns if col not in keyCols]
    # ``assign`` returns a new frame, so the caller never shares state with
    # the intermediate tables built above.
    return onetDB[keyCols + valueCols].assign(
        Code=lambda frame: frame["Code"].str[:-3]
    )

#Function to fetch required columns from SOC data
def fetchSOCDB(path="data/soc_data_2019.csv"):
    """Load the columns of the SOC CSV that the analysis uses.

    Parameters
    ----------
    path : str, optional
        Location of the SOC CSV. Defaults to ``"data/soc_data_2019.csv"``.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with ``"occ_code"`` renamed to ``"Code"`` and
        only the first row kept for each distinct ``"occ_title"``.
    """
    wantedCols = [
        "own_code", "occ_code", "occ_title", "tot_emp", "jobs_1000_orig",
        "loc_quotient", "pct_total", "h_mean", "a_mean", "h_median",
        "a_median",
    ]
    # NOTE(review): the recorded run of the cell that calls this loader shows
    # a warning being emitted; if it is pandas' mixed-type DtypeWarning,
    # pinning ``dtype=`` here would silence it -- TODO confirm.
    return (
        pd.read_csv(path, usecols=wantedCols)
        .rename(columns={"occ_code": "Code"})
        .drop_duplicates(subset=["occ_title"])
    )

#Fetch Oxford research data
def fetchOxfordDB(path="data/oxford_data.csv"):
    """Load the Oxford research CSV.

    Parameters
    ----------
    path : str, optional
        Location of the CSV. Defaults to ``"data/oxford_data.csv"``.

    Returns
    -------
    pandas.DataFrame
        The file's contents with the ``"code"`` column renamed to ``"Code"``
        so it can be joined with the other tables on that key.
    """
    return pd.read_csv(path).rename(columns={"code": "Code"})

In [42]:
# ONET variable files to load, each mapped to the short column heading its
# score column receives in the combined table. fetchOnetDB reads every file
# name from the "data/" folder.
onetVars = {
    'Social_Perceptiveness.csv':'SP',
    'Persuasion.csv':'P',
    'Finger_Dexterity.csv':'FD',
    'Originality.csv':'O',
    'Assisting_and_Caring_for_Others.csv':'AC',
    'Manual_Dexterity.csv':'MD',
    'Cramped_Work_Space_Awkward_Positions.csv':'CW',
    'Negotiation.csv':'N',
    'Fine_Arts.csv':'FA'
}

# Build the combined table: one row per (Code, Occupation) present in every
# file, one score column per heading above.
onetDB = fetchOnetDB(onetVars)

In [47]:
# Spot-check the combined table by looking up a single occupation code.
onetDB.loc[onetDB['Code'] == '15-1132']

Unnamed: 0,Code,Occupation,SP,P,FD,O,AC,MD,CW,N,FA
643,15-1132,"Software Developers, Applications",47,47,38,56,32,25,5,44,0


In [43]:
# Preview the first rows of the combined ONET table.
onetDB.head()

Unnamed: 0,Code,Occupation,SP,P,FD,O,AC,MD,CW,N,FA
0,19-3031,Counseling Psychologists,94,72,19,66,91,0,0,69,5
1,21-1014,Mental Health Counselors,94,66,28,60,94,0,3,56,15
2,19-3031,Clinical Psychologists,94,60,22,56,99,0,5,50,5
3,29-1066,Psychiatrists,88,63,35,56,99,0,6,56,5
4,21-1013,Marriage and Family Therapists,81,72,38,66,94,0,1,69,5


In [29]:
# Load the SOC table and report its (rows, columns) shape.
socDB = fetchSOCDB()
print(f"SOC Data Dimensions - {socDB.shape}")

  if (yield from self.run_code(code, result)):


SOC Data Dimensions - (1064, 11)


Unnamed: 0,own_code,Code,occ_title,tot_emp,jobs_1000_orig,loc_quotient,pct_total,h_mean,a_mean,h_median,a_median
0,1235,11-0000,Management Occupations,8054120,,,,58.88,122480,50.8,105660
1,1235,13-0000,Business and Financial Operations Occupations,8183750,,,,37.56,78130,33.57,69820
2,1235,15-0000,Computer and Mathematical Occupations,4552880,,,,45.08,93760,42.47,88340
3,1235,17-0000,Architecture and Engineering Occupations,2592680,,,,42.69,88800,39.15,81440
4,1235,19-0000,"Life, Physical, and Social Science Occupations",1288920,,,,37.28,77540,32.77,68160


In [46]:
# Inner-join the SOC and ONET tables on "Code" and report the result's shape.
ONETSOCDB = dfIntersection(socDB, onetDB)
print(f"ONET + SOC data intersection Dimensions - {ONETSOCDB.shape}")


ONET + SOC data intersection Dimensions - (793, 21)


In [68]:
# Load the Oxford research table and report its (rows, columns) shape.
oxfordDB = fetchOxfordDB()
print(f"Oxford data Dimensions - {oxfordDB.shape}")

Oxford data Dimensions - (702, 5)


In [67]:
# Inner-join the Oxford and ONET tables on "Code", then keep only the rows
# whose "label" column equals 1.
onetOxfordDB = dfIntersection(oxfordDB, onetDB).loc[
    lambda joined: joined["label"] == 1
]
onetOxfordDB.shape


(42, 15)