In [86]:
!pip install pandas
!pip install gender-guesser



In [113]:
import pandas as pd
import numpy as np
import gender_guesser.detector as gender
import random

#### Load Datasets

In [88]:
# Read the three source CSV exports from the ../data directory.
# The two larger files pass low_memory=False to pandas.read_csv.
baby_names = pd.read_csv(
    "../data/Popular_Baby_Names_20240424.csv",
)
expense_budget = pd.read_csv(
    "../data/Expense_Budget_20240424.csv",
    low_memory=False,
)
payroll = pd.read_csv(
    "../data/Citywide_Payroll_Data__Fiscal_Year__20240424.csv",
    low_memory=False,
)

#### Data Cleaning

##### Payroll

In [92]:
# Per-column flag: True when the column contains at least one missing value.
payroll.isna().any()

Fiscal Year                   False
Payroll Number                 True
Agency Name                   False
Last Name                      True
First Name                     True
Mid Init                       True
Agency Start Date              True
Work Location Borough          True
Title Description              True
Leave Status as of June 30    False
Base Salary                   False
Pay Basis                     False
Regular Hours                 False
Regular Gross Paid            False
OT Hours                      False
Total OT Paid                 False
Total Other Pay               False
dtype: bool

Filter out all rows that are missing a first name, since we can't do gender analysis on them. There are 13,245 rows with missing names.

In [94]:
# Keep only the rows that have a first name.
has_first_name = payroll["First Name"].notna()
payroll = payroll[has_first_name]

Filter out all the rows that are missing start dates. There are only 63 rows with missing start dates. 

In [104]:
# Keep only the rows that have an agency start date.
has_start_date = payroll["Agency Start Date"].notna()
payroll = payroll[has_start_date]


There are 506,233 rows with a missing Work Location Borough. Since the number of missing values is large, we shouldn't just drop these rows. Some people might work in many different boroughs due to the nature of their jobs. Since this is a categorical value, we can add an extra category, "UNKNOWN", for anyone with missing borough information.

In [97]:
# Label missing boroughs explicitly instead of dropping those rows.
borough = payroll["Work Location Borough"]
payroll["Work Location Borough"] = borough.fillna("UNKNOWN")

In [120]:
# NOTE(review): the saved output below is labelled "Payroll Number", not
# "Agency Name" — it appears to be stale; re-run this cell to refresh it.
payroll["Agency Name"].value_counts()

Payroll Number
742.0    776468
747.0    666254
56.0     348534
744.0    272137
300.0    243401
          ...  
478.0        18
392.0        15
381.0        14
383.0        13
130.0         3
Name: count, Length: 157, dtype: int64

##### Expense Budget

In [122]:
# How often each unit appropriation number occurs in the expense budget.
unit_appropriation = expense_budget["Unit Appropriation Number"]
unit_appropriation.value_counts()

Unit Appropriation Number
2      163246
1      105363
6       47717
4       46612
3       34580
        ...  
211         7
634         5
834         5
210         4
491         2
Name: count, Length: 219, dtype: int64

#### Getting Gender Information

Checking overlap between baby names and payroll data

In [89]:
# Distinct first names in each dataset. Missing values are dropped on the
# Series itself so they cannot be stringified to "nan" and then counted as a
# name that both datasets share.
unique_baby = baby_names["Child's First Name"].dropna().unique()
unique_payroll = payroll["First Name"].dropna().unique()

# Compare case-insensitively: upper-case both sides, then de-duplicate again,
# because names that differed only by case collapse to the same value.
unique_baby = np.unique(np.char.upper(unique_baby.astype(str)))
unique_payroll = np.unique(np.char.upper(unique_payroll.astype(str)))

# Number of distinct first names that appear in both datasets.
len(np.intersect1d(unique_baby, unique_payroll))

1804

Use a gender prediction model

In [114]:
# Name-based gender detector from the gender_guesser package.
genModel = gender.Detector()


def get_gender(name):
    """Return the detector's gender label for a first name.

    Surrounding whitespace is stripped and the name is capitalized (first
    letter upper-case, the rest lower-case) before it is passed to
    ``genModel.get_gender``, so a padded value such as " JOHN " is looked up
    as "John".

    NOTE(review): ``str.capitalize`` lower-cases everything after the first
    character, so a multi-word name like "MARY ANN" is looked up as
    "Mary ann" — confirm whether the detector can match such names.
    """
    return genModel.get_gender(name.strip().capitalize())

# Run every first name through the detector and store the raw label.
first_names = payroll['First Name']
payroll['Gender'] = first_names.apply(get_gender)

# Collapse the detector's labels to a strict male/female value.
def finalize_gender(gender):
    """Map a detector label to either "male" or "female".

    "male" and "female" are returned unchanged, "mostly_female" becomes
    "female" and "mostly_male" becomes "male". Any other label is assigned
    one of the two values at random via ``random.choice``.
    """
    if gender in ("male", "female"):
        return gender
    if gender == "mostly_female":
        return "female"
    if gender == "mostly_male":
        return "male"
    # Anything else: pick one of the two values at random.
    return random.choice(["male", "female"])
    
# Seed the stdlib RNG first: finalize_gender falls back to random.choice for
# labels other than male/female/mostly_*, so without a fixed seed the gender
# counts change every time the notebook is run.
random.seed(42)
payroll['Gender'] = payroll['Gender'].apply(finalize_gender)
    

In [115]:
payroll['Gender'].value_counts()

Gender
female    3246234
male      2403234
Name: count, dtype: int64