# Clean up ABS occupation

Purpose:
- clean the raw ABS occupation file into a table where each row is a postcode, each column is an occupation, and each value is the proportion of people in that postcode who work in that occupation

In [1]:
import pandas as pd

In [None]:
# create the persona output directory if it does not already exist
import os

# directories this notebook needs; the cleaned occupation CSV is written into the persona one
dirs_to_create = ['../data/curated/persona']

def create_dirs(dirs_to_create):
    """Create every directory in ``dirs_to_create``, including missing parents.

    Directories that already exist are left untouched.

    Args:
        dirs_to_create: iterable of directory paths to create.

    Raises:
        FileExistsError: if a path exists but is not a directory.
    """
    for dir_to_create in dirs_to_create:
        # exist_ok=True replaces a separate os.path.exists check, so there is
        # no window for the directory to appear between check and creation
        os.makedirs(dir_to_create, exist_ok=True)

# ensure the output directory exists before the cleaned file is written to it
create_dirs(dirs_to_create)

In [2]:
def inspect_dataframe(df):
    """Print the (rows, columns) shape of ``df`` and return its first rows.

    The return value is ``df.head()``, so calling this as the last line of a
    notebook cell displays a short preview underneath the printed shape.
    """
    shape = df.shape
    print("rows, columns:", shape)
    preview = df.head()
    return preview

In [3]:
# load the ABS occupation counts by postcode
# NOTE: this file was hand-cleaned beforehand (bad formatting and unused columns removed) to save time
occupation_csv_path = '../data/external/by_postcode/occupation.csv'
occupation = pd.read_csv(occupation_csv_path)

inspect_dataframe(occupation)

rows, columns: (2653, 13)


Unnamed: 0,OCCP - 1 Digit Level,Managers,Professionals,Technicians and Trades Workers,Community and Personal Service Workers,Clerical and Administrative Workers,Sales Workers,Machinery Operators and Drivers,Labourers,Inadequately described,Not stated,Not applicable,Total
0,"2000, NSW",2183,3820,1690,2474,1307,1233,201,1812,229,180,12284,27411
1,"2006, NSW",22,139,9,225,69,81,0,25,5,4,675,1261
2,"2007, NSW",415,1034,369,572,395,399,101,339,32,49,5149,8846
3,"2008, NSW",687,2007,422,632,585,525,77,317,52,38,6369,11712
4,"2009, NSW",1604,2698,628,822,882,520,102,382,106,55,5012,12813


In [4]:
# the 'OCCP - 1 Digit Level' column holds labels such as "2000, NSW"
postcode = occupation['OCCP - 1 Digit Level']

# keep only the text before the first comma, i.e. the postcode itself
occupation['postcode'] = [label.partition(',')[0] for label in postcode]

# drop the raw label column and the non-occupation count columns
unwanted_columns = [
    'OCCP - 1 Digit Level',
    'Not stated',
    'Not applicable',
    'Inadequately described',
]
occupation = occupation.drop(columns=unwanted_columns)

inspect_dataframe(occupation)

rows, columns: (2653, 10)


Unnamed: 0,Managers,Professionals,Technicians and Trades Workers,Community and Personal Service Workers,Clerical and Administrative Workers,Sales Workers,Machinery Operators and Drivers,Labourers,Total,postcode
0,2183,3820,1690,2474,1307,1233,201,1812,27411,2000
1,22,139,9,225,69,81,0,25,1261,2006
2,415,1034,369,572,395,399,101,339,8846,2007
3,687,2007,422,632,585,525,77,317,11712,2008
4,1604,2698,628,822,882,520,102,382,12813,2009


In [5]:
# cast the totals to int before they are used as the divisor for the proportions
# (original note: the values were previously str)
occupation = occupation.astype({'Total': int})

In [6]:
SKIP_COLS = ['Total', 'postcode']  # columns that are not occupation counts

# every remaining column is an occupation count
occupation_cols = [name for name in occupation.columns if name not in SKIP_COLS]

# for each occupation: cast the count to int and divide by the postcode total,
# storing the result under '<occupation>_%'
proportions = {
    f'{name}_%': occupation[name].astype(int) / occupation['Total']
    for name in occupation_cols
}

# replace the raw counts (and the total) with the proportion columns
occupation = occupation.drop(columns=occupation_cols + ['Total']).assign(**proportions)

inspect_dataframe(occupation)

rows, columns: (2653, 9)


Unnamed: 0,postcode,Managers_%,Professionals_%,Technicians and Trades Workers_%,Community and Personal Service Workers_%,Clerical and Administrative Workers_%,Sales Workers_%,Machinery Operators and Drivers_%,Labourers_%
0,2000,0.07964,0.13936,0.061654,0.090256,0.047682,0.044982,0.007333,0.066105
1,2006,0.017446,0.11023,0.007137,0.17843,0.054718,0.064235,0.0,0.019826
2,2007,0.046914,0.116889,0.041714,0.064662,0.044653,0.045105,0.011418,0.038322
3,2008,0.058658,0.171363,0.036031,0.053962,0.049949,0.044826,0.006574,0.027066
4,2009,0.125185,0.210567,0.049013,0.064154,0.068836,0.040584,0.007961,0.029813


In [7]:
# write the cleaned postcode-by-occupation proportions to the curated persona directory
# (index=False keeps the row index out of the file)
output_csv_path = '../data/curated/persona/occupation_cleaned.csv'
occupation.to_csv(output_csv_path, index=False)