In [1]:
# Load packages

import pandas as pd
import altair as alt
from vega_datasets import data 
import numpy

# Switch Altair to its 'json' data transformer; this is needed to work with
# larger datasets (see the Altair FAQ: https://altair-viz.github.io/user_guide/faq.html)
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [2]:
# Load the jobs dataset under its own name (easier to inspect while debugging)
# and discard the `perc` column straight away.
df = data.jobs().drop(columns='perc')

# Reshape to one row per (year, job), with the men / women counts side by side
# as sub-columns of `count`.
grouped_jobs = pd.pivot_table(
    df,
    index=['year', 'job'],
    columns=['sex'],
    values=['count'],
)

# Label a row with the gender-dominance group implied by its
# men-to-women count ratio.
def categorize_gender_dominant_group(row):
    """Return the gender-dominance label for one row of job counts.

    `row` must support ``row['count']['men']`` and ``row['count']['women']``.

    Labels, checked in this order:
      'no job'           both counts are zero
      'only male'        the women count is zero
      'only female'      the men count is zero
      'male dominant'    men / women >= 2
      'female dominant'  men / women <= 0.5
      'balanced'         anything else
    """
    counts = row['count']
    men, women = counts['men'], counts['women']

    # Zero counts are dealt with first, so the ratio below never divides by zero.
    if men == 0 and women == 0:
        return 'no job'
    if women == 0:
        return 'only male'
    if men == 0:
        return 'only female'

    ratio = men / women
    if ratio >= 2:
        return 'male dominant'
    if ratio <= 0.5:
        return 'female dominant'
    return 'balanced'

# Compute the share of women among all workers counted in one row
def compute_female_prop(row):
    """Return women / (men + women) for one row of job counts.

    `row` must support ``row['count']['men']`` and ``row['count']['women']``.
    When the two counts add up to zero the proportion is undefined and NaN
    is returned.
    """
    counts = row['count']
    women = counts['women']
    headcount = counts['men'] + women

    # No workers at all: nothing to take a proportion of.
    return women / headcount if headcount != 0 else float('NaN')

# Preview the first rows of the pivoted counts table
grouped_jobs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,count
Unnamed: 0_level_1,sex,men,women
year,job,Unnamed: 2_level_2,Unnamed: 3_level_2
1850,Accountant / Auditor,708,0
1850,Actor,506,0
1850,Advertising Agent,0,0
1850,Agent,2528,0
1850,Apprentice,3134,101


In [3]:
# Compute the overall proportion of women for one row of summed counts
# (applied below to the per-job totals over all years)
def compute_total_prop_female(row):
    """Return women / (men + women) for one row of total counts.

    `row` must support ``row['count']['men']`` and ``row['count']['women']``.

    When the two totals add up to zero the proportion is undefined, so NaN is
    returned instead of dividing by zero. This mirrors the guard in
    `compute_female_prop`.
    """
    male_total = row['count']['men']
    female_total = row['count']['women']
    combined_total = male_total + female_total

    # A row with no workers at all has no meaningful proportion.
    if combined_total == 0:
        return float('NaN')
    return female_total / combined_total

# Sum the counts of every job over all years, reduce each job to a single
# female proportion, and expose `job` as an ordinary column.
total_prop_female_df = (
    grouped_jobs
    .groupby(['job'])
    .sum()
    .apply(compute_total_prop_female, axis=1)
    .to_frame(name='total_prop_female')
    .reset_index()
)
total_prop_female_df.head()

Unnamed: 0,job,total_prop_female
0,Accountant / Auditor,0.435078
1,Actor,0.431113
2,Advertising Agent,0.409983
3,Agent,0.144929
4,Apprentice,0.112994


In [4]:
# Add the gender-dominance label of each (year, job) row as a new column.
# Note: this modifies `grouped_jobs` in place.
grouped_jobs['gender_dominant_group'] = grouped_jobs.apply(categorize_gender_dominant_group, axis=1)
# Add the share of women among all workers of each row (NaN when the row has
# no workers). This apply runs on the frame that already holds the label column.
grouped_jobs['female_prop'] = grouped_jobs.apply(compute_female_prop, axis=1)

# Preview the table with the two derived columns
grouped_jobs.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,count,count,gender_dominant_group,female_prop
Unnamed: 0_level_1,sex,men,women,Unnamed: 4_level_1,Unnamed: 5_level_1
year,job,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1850,Accountant / Auditor,708,0,only male,0.0
1850,Actor,506,0,only male,0.0
1850,Advertising Agent,0,0,no job,
1850,Agent,2528,0,only male,0.0
1850,Apprentice,3134,101,male dominant,0.031221


In [5]:
# Keep only the derived columns and flatten the frame: `year` and `job` become
# ordinary columns and the second column level is dropped.
jobs_tidy = grouped_jobs.drop(['count'], axis=1).reset_index()
jobs_tidy.columns = jobs_tidy.columns.droplevel(1)

# Discard (year, job) rows with no workers at all; the step below then removes
# the affected jobs completely.
jobs_tidy = jobs_tidy.loc[jobs_tidy['gender_dominant_group'] != 'no job']

# Jobs that still have exactly 15 rows left, i.e. at least one worker
# (of either sex) recorded in each of the 15 years.
full_year_data = (
    jobs_tidy
    .groupby(["job"])
    .count()
    .loc[lambda per_job: per_job["year"] == 15]
    .reset_index()[["job"]]
)

# Restrict both the per-year table and the per-job totals to those jobs.
jobs_tidy = jobs_tidy.loc[lambda frame: frame["job"].isin(full_year_data["job"])]
total_prop_female_df = total_prop_female_df.loc[
    lambda frame: frame["job"].isin(full_year_data["job"])
]
jobs_tidy.head()

Unnamed: 0,year,job,gender_dominant_group,female_prop
15,1850,Architect,only male,0.0
16,1850,Artist / Art Teacher,only male,0.0
19,1850,Author,only male,0.0
21,1850,Baker,male dominant,0.00689
23,1850,Barber / Beautician,male dominant,0.025612


In [6]:
# Optional CSV export of the tidy table (currently disabled):
# jobs_tidy.to_csv('jobs_tidy.csv')
# Preview the first rows of the tidy per-year table
jobs_tidy.head()


Unnamed: 0,year,job,gender_dominant_group,female_prop
15,1850,Architect,only male,0.0
16,1850,Artist / Art Teacher,only male,0.0
19,1850,Author,only male,0.0
21,1850,Baker,male dominant,0.00689
23,1850,Barber / Beautician,male dominant,0.025612


In [7]:
# Optional CSV export (currently disabled):
# total_gender_prop_diffs_df.to_csv('job_prop_diffs.csv')
# NOTE(review): `total_gender_prop_diffs_df` is not defined in the visible cells;
# the frame previewed below is `total_prop_female_df` - confirm the name before
# re-enabling the export.
total_prop_female_df.head()


Unnamed: 0,job,total_prop_female
15,Architect,0.124487
16,Artist / Art Teacher,0.464289
19,Author,0.48396
21,Baker,0.251936
23,Barber / Beautician,0.639604


In [13]:
# The ten jobs with the lowest overall female proportion, joined with their
# per-year rows from `jobs_tidy`, written to CSV and previewed.
top_10_male_dominated_jobs = (
    total_prop_female_df
    .sort_values(by='total_prop_female', ascending=True)
    .head(10)
    .merge(jobs_tidy, how='inner', on='job')
)
top_10_male_dominated_jobs.to_csv('../data/top_10_male_jobs.csv')
top_10_male_dominated_jobs.head()

Unnamed: 0,job,total_prop_female,year,gender_dominant_group,female_prop
0,Railroad Brakeman,0.006082,1850,only male,0.0
1,Railroad Brakeman,0.006082,1860,only male,0.0
2,Railroad Brakeman,0.006082,1870,only male,0.0
3,Railroad Brakeman,0.006082,1880,only male,0.0
4,Railroad Brakeman,0.006082,1900,male dominant,0.001405


In [12]:
# Jobs whose overall female proportion lies strictly between 0.44 and 0.56,
# highest proportion first, joined with their per-year rows from `jobs_tidy`,
# written to CSV and previewed.
# Note: every job inside that band is kept; the selection is not capped at ten.
top_10_balanced_jobs = (
    total_prop_female_df
    .sort_values(by='total_prop_female', ascending=False)
    .query("total_prop_female < 0.56 and total_prop_female > 0.44")
    .merge(jobs_tidy, how='inner', on='job')
)
top_10_balanced_jobs.to_csv('../data/top_10_balanced_jobs.csv')
top_10_balanced_jobs.head()

Unnamed: 0,job,total_prop_female,year,gender_dominant_group,female_prop
0,Designer,0.536786,1850,only male,0.0
1,Designer,0.536786,1860,only male,0.0
2,Designer,0.536786,1870,only male,0.0
3,Designer,0.536786,1880,only male,0.0
4,Designer,0.536786,1900,male dominant,0.078878


In [14]:
# The ten jobs with the highest overall female proportion (kept in ascending
# order), joined with their per-year rows from `jobs_tidy`, written to CSV
# and previewed.
top_10_female_dominated_jobs = (
    total_prop_female_df
    .sort_values(by='total_prop_female', ascending=True)
    .tail(10)
    .merge(jobs_tidy, how='inner', on='job')
)
top_10_female_dominated_jobs.to_csv('../data/top_10_female_jobs.csv')
top_10_female_dominated_jobs.head()


Unnamed: 0,job,total_prop_female,year,gender_dominant_group,female_prop
0,Barber / Beautician,0.639604,1850,male dominant,0.025612
1,Barber / Beautician,0.639604,1860,male dominant,0.011077
2,Barber / Beautician,0.639604,1870,male dominant,0.043245
3,Barber / Beautician,0.639604,1880,male dominant,0.064068
4,Barber / Beautician,0.639604,1900,male dominant,0.056594
