In [20]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from jupyterthemes import jtplot
from functools import reduce
import warnings
warnings.filterwarnings('ignore')

# jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)
# jt -t oceans16 -f fira -fs 11 -nf ptsans -nfs 13 -N -kl -cursw 5 -cursc r -cellw 85% -T

## Dataset Cleaning & Preprocessing

### 1. O*NET

In [12]:
# Raw O*NET export file name -> short column name used for that file's score.
variables = {
    'Social_Perceptiveness.csv': 'SP',
    'Persuasion.csv': 'P',
    'Finger_Dexterity.csv': 'FD',
    'Originality.csv': 'O',
    'Assisting_and_Caring_for_Others.csv': 'AC',
    'Manual_Dexterity.csv': 'MD',
    'Cramped_Work_Space_Awkward_Positions.csv': 'CW',
    'Negotiation.csv': 'N',
    'Fine_Arts.csv': 'FA'
}

# Build one wide frame: one row per (Code, Occupation), one column per variable.
# `None` (not `DataFrame.empty`) marks "nothing loaded yet", so a merge that
# happens to yield zero rows is never mistaken for the initial state.
onetDB = None
for fileLoc, heading in variables.items():
    data = pd.read_csv("data/raw/" + fileLoc)
    if "Importance" in data.columns:
        # Files with an "Importance" score also carry a "Level" column, which is discarded.
        data = data.rename(columns={'Importance': heading}).drop(columns='Level')
    else:
        # Otherwise the score is expected under "Context".
        data = data.rename(columns={'Context': heading})

    if onetDB is None:
        onetDB = data.copy()
    else:
        onetDB = pd.merge(onetDB, data, on=['Code', 'Occupation'])

# Drop the last three characters of every code
# (presumably the ".NN" O*NET detail suffix, leaving the SOC code - TODO confirm).
onetDB["Code"] = onetDB["Code"].str[:-3]
# 'Not available' entries in FA are counted as 0 so the column can be cast to int.
onetDB["FA"] = onetDB["FA"].replace('Not available', '0')
onetDB = onetDB[['Code', 'Occupation', 'SP', 'P', 'FD', 'O', 'AC', 'MD', 'CW', 'N', 'FA']]
onetDB = onetDB.astype({'Code': 'str', 'Occupation': 'str', 'FA': 'int64'})

print("onetDB Dimensions =>", onetDB.shape)
onetDB.head()

onetDB Dimensions => (968, 11)


Unnamed: 0,Code,Occupation,SP,P,FD,O,AC,MD,CW,N,FA
0,19-3031,Counseling Psychologists,94,72,19,66,91,0,0,69,5
1,21-1014,Mental Health Counselors,94,66,28,60,94,0,3,56,15
2,19-3031,Clinical Psychologists,94,60,22,56,99,0,5,50,5
3,29-1066,Psychiatrists,88,63,35,56,99,0,6,56,5
4,21-1013,Marriage and Family Therapists,81,72,38,66,94,0,1,69,5


### 2. SOC

In [13]:
# Load only the columns that are actually kept, then give them readable names.
socDB = pd.read_csv(
    "data/raw/all_data_M_2018.csv",
    usecols=["own_code", "occ_code", "occ_title", "tot_emp", "h_mean", "a_mean"],
)
socDB = socDB.rename(columns={'own_code': 'Industry Code', 'occ_code': 'Code', 'tot_emp': 'Total Emp',
                              'h_mean': 'Hourly Income', 'a_mean': 'Annual Income', 'occ_title': 'Occupation'})
# Keep the first row seen for each occupation title.
socDB = socDB.drop_duplicates(subset=['Occupation'])


def delComma(no):
    """Return `no` without thousands separators, or None when it is missing.

    '*' marks an unavailable value in the raw file and maps to None, as do
    NaN cells. '**' is treated the same way (NOTE(review): believed to be the
    BLS marker for an unavailable employment estimate - confirm against the
    data dictionary). Non-string values (already-parsed numbers) are returned
    unchanged instead of raising AttributeError.
    """
    if pd.isna(no) or no in ('*', '**'):
        return None
    return no.replace(',', '') if isinstance(no, str) else no


for numeric_column in ('Total Emp', 'Annual Income', 'Hourly Income'):
    socDB[numeric_column] = socDB[numeric_column].apply(delComma)

# Income columns become float (missing -> NaN). 'Total Emp' stays a plain int64,
# so a missing employment figure still raises here rather than passing silently.
socDB = socDB.astype({'Industry Code': 'str', 'Total Emp': 'int64',
                      'Hourly Income': 'float64', 'Annual Income': 'float64'})
socDB = socDB[['Code', 'Occupation', 'Industry Code', 'Total Emp', 'Hourly Income', 'Annual Income']]

print("socDB Dimensions =>", socDB.shape)
socDB.head()

socDB Dimensions => (1103, 6)


Unnamed: 0,Code,Occupation,Industry Code,Total Emp,Hourly Income,Annual Income
0,00-0000,All Occupations,1235,144733270,24.98,51960.0
1,11-0000,Management Occupations,1235,7616650,58.44,121560.0
2,11-1000,Top Executives,1235,2535640,61.66,128240.0
3,11-1010,Chief Executives,1235,195530,96.22,200140.0
5,11-1020,General and Operations Managers,1235,2289770,59.56,123880.0


### 3. Oxford

In [14]:
# Read the Oxford table, capitalise its column names and fix the column order.
oxford_headers = {
    'code': "Code",
    'occupation': 'Occupation',
    'rank': 'Rank',
    'probability': 'Probability',
    'label': 'Label',
}
oxford_order = ['Code', 'Occupation', 'Rank', 'Probability', 'Label']
oxfordDB = pd.read_csv("data/raw/oxford_data.csv").rename(columns=oxford_headers)[oxford_order]

print("oxfordDB Dimensions =>", oxfordDB.shape)
oxfordDB.head()

oxfordDB Dimensions => (702, 5)


Unnamed: 0,Code,Occupation,Rank,Probability,Label
0,29-1125,Recreational Therapists,1,0.0028,
1,49-1011,"First-Line Supervisors of Mechanics, Installer...",2,0.003,
2,11-9161,Emergency Management Directors,3,0.003,
3,21-1023,Mental Health and Substance Abuse Social Workers,4,0.0031,
4,29-1181,Audiologists,5,0.0033,


## Data Collection

### 1. ONET + SOC + Oxford

In [15]:
# Occupation titles are taken from O*NET only, so the SOC and Oxford frames
# contribute every column except 'Occupation' (columns.difference also sorts
# the remaining column names alphabetically).
soc_no_title = socDB[socDB.columns.difference(['Occupation'])]
oxford_no_title = oxfordDB[oxfordDB.columns.difference(['Occupation'])]
datasets = [onetDB, soc_no_title, oxford_no_title]


def pdMerge(left, right):
    """Join two frames on their shared 'Code' column (pd.merge default join type)."""
    return pd.merge(left, right, on='Code')


# Fold the three frames left to right into a single table.
mergedDB = reduce(pdMerge, datasets)

print("onetDB{0} | socDB{1} | oxfordDB{2}".format(onetDB.shape,socDB.shape,oxfordDB.shape), end='\n\n')
print("mergedDB Dimensions =>",mergedDB.shape)
mergedDB.head()

onetDB(968, 11) | socDB(1103, 6) | oxfordDB(702, 5)

mergedDB Dimensions => (502, 18)


Unnamed: 0,Code,Occupation,SP,P,FD,O,AC,MD,CW,N,FA,Annual Income,Hourly Income,Industry Code,Total Emp,Label,Probability,Rank
0,19-3031,Counseling Psychologists,94,72,19,66,91,0,0,69,5,85340.0,41.03,1235,110490,,0.0047,24
1,19-3031,Clinical Psychologists,94,60,22,56,99,0,5,50,5,85340.0,41.03,1235,110490,,0.0047,24
2,19-3031,School Psychologists,75,63,22,50,62,0,6,60,7,85340.0,41.03,1235,110490,,0.0047,24
3,21-1013,Marriage and Family Therapists,81,72,38,66,94,0,1,69,5,54150.0,26.03,1235,48520,0.0,0.014,62
4,19-3039,Neuropsychologists and Clinical Neuropsycholog...,81,53,35,53,89,22,9,47,3,95610.0,45.97,1235,13480,,0.0043,17


### 2.1 ONET + SOC 

In [16]:
# Attach the SOC columns (minus the duplicate title) to each O*NET row by code.
soc_payload = socDB[socDB.columns.difference(['Occupation'])]
onetsocDB = onetDB.merge(soc_payload, on='Code')

print("onetDB{0} | socDB{1}".format(onetDB.shape,socDB.shape), end='\n\n')
print("onetsocDB Dimensions =>",onetsocDB.shape)
onetsocDB.head()

onetDB(968, 11) | socDB(1103, 6)

onetsocDB Dimensions => (608, 15)


Unnamed: 0,Code,Occupation,SP,P,FD,O,AC,MD,CW,N,FA,Annual Income,Hourly Income,Industry Code,Total Emp
0,19-3031,Counseling Psychologists,94,72,19,66,91,0,0,69,5,85340.0,41.03,1235,110490
1,19-3031,Clinical Psychologists,94,60,22,56,99,0,5,50,5,85340.0,41.03,1235,110490
2,19-3031,School Psychologists,75,63,22,50,62,0,6,60,7,85340.0,41.03,1235,110490
3,29-1066,Psychiatrists,88,63,35,56,99,0,6,56,5,220380.0,105.95,1235,25630
4,21-1013,Marriage and Family Therapists,81,72,38,66,94,0,1,69,5,54150.0,26.03,1235,48520


### 2.2 ONET + Oxford

In [21]:
# Attach the Oxford columns (minus the duplicate title) to each O*NET row by code,
# then restore an explicit column order.
oxford_payload = oxfordDB[oxfordDB.columns.difference(['Occupation'])]
onetoxford_order = [
    'Code', 'Occupation',
    'SP', 'P', 'FD', 'O', 'AC', 'MD', 'CW', 'N', 'FA',
    'Rank', 'Probability', 'Label',
]
onetoxfordDB = onetDB.merge(oxford_payload, on='Code')[onetoxford_order]

print("onetDB{0} | oxfordDB{1}".format(onetDB.shape,oxfordDB.shape), end='\n\n')
print("onetoxfordDB Dimensions =>",onetoxfordDB.shape)
onetoxfordDB.head()

onetDB(968, 11) | oxfordDB(702, 5)

onetoxfordDB Dimensions => (844, 14)


Unnamed: 0,Code,Occupation,SP,P,FD,O,AC,MD,CW,N,FA,Rank,Probability,Label
0,19-3031,Counseling Psychologists,94,72,19,66,91,0,0,69,5,24,0.0047,
1,19-3031,Clinical Psychologists,94,60,22,56,99,0,5,50,5,24,0.0047,
2,19-3031,School Psychologists,75,63,22,50,62,0,6,60,7,24,0.0047,
3,21-1014,Mental Health Counselors,94,66,28,60,94,0,3,56,15,25,0.0048,
4,21-1013,Marriage and Family Therapists,81,72,38,66,94,0,1,69,5,62,0.014,0.0


In [24]:
# Modelling table: occupation identifiers, the nine O*NET scores and the Oxford
# probability (Rank and Label are left out). `.copy()` detaches it from onetoxfordDB.
ML = onetoxfordDB[['Code', 'Occupation', 'SP', 'P', 'FD', 'O', 'AC', 'MD', 'CW', 'N', 'FA', 'Probability']].copy()

# The original call targeted the directory 'data/' itself, which cannot be opened
# as a file. NOTE(review): 'ML.csv' is a placeholder name - confirm the intended
# file name. The positional index is not written out.
ML.to_csv('data/ML.csv', index=False)
ML

Unnamed: 0,Code,Occupation,SP,P,FD,O,AC,MD,CW,N,FA,Probability
0,19-3031,Counseling Psychologists,94,72,19,66,91,0,0,69,5,0.0047
1,19-3031,Clinical Psychologists,94,60,22,56,99,0,5,50,5,0.0047
2,19-3031,School Psychologists,75,63,22,50,62,0,6,60,7,0.0047
3,21-1014,Mental Health Counselors,94,66,28,60,94,0,3,56,15,0.0048
4,21-1013,Marriage and Family Therapists,81,72,38,66,94,0,1,69,5,0.0140
...,...,...,...,...,...,...,...,...,...,...,...,...
839,53-7111,Mine Shuttle Car Operators,28,19,28,19,58,66,84,19,1,0.3700
840,51-6042,Shoe Machine Operators and Tenders,28,22,53,28,34,60,7,19,17,0.9700
841,45-2041,"Graders and Sorters, Agricultural Products",28,22,50,10,36,56,8,19,5,0.4100
842,51-6021,"Pressers, Textile, Garment, and Related Materials",25,19,53,22,50,66,17,19,7,0.8100


### 2.3 SOC + Oxford

In [48]:
# Join SOC and Oxford on code alone; neither side keeps its 'Occupation' title.
soc_side = socDB[socDB.columns.difference(['Occupation'])]
oxford_side = oxfordDB[oxfordDB.columns.difference(['Occupation'])]
socoxfordDB = soc_side.merge(oxford_side, on='Code')

print("socDB{0} | oxfordDB{1}".format(socDB.shape,oxfordDB.shape), end='\n\n')
print("socoxfordDB Dimensions =>",socoxfordDB.shape)
socoxfordDB.head()

socDB(1103, 6) | oxfordDB(702, 5)

socoxfordDB Dimensions => (419, 8)


Unnamed: 0,Annual Income,Code,Hourly Income,Industry Code,Total Emp,Label,Probability,Rank
0,147240.0,11-2021,70.79,1235,240440,,0.014,61
1,140320.0,11-2022,67.46,1235,379050,,0.013,59
2,53990.0,11-9031,25.96,1235,50650,0.0,0.015,71
3,98750.0,11-9032,,1235,263120,,0.0046,22
4,111210.0,11-9033,53.47,1235,143430,,0.01,52


![](data/venn.png)