In [1]:
from functools import reduce
import json
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 100)
pd.options.display.float_format = "{:,.2f}".format

## Import and format the data

Concatenate the data into a single file.

In [2]:
!sh process_eeoc.sh

Import charge data for fiscal years 2011-2017.

In [3]:
charges = pd.read_csv("data/raw/charges_11_17.txt", sep="\t", skiprows=1,
                      dtype={1: str},
                      names=["fiscal_year", "charge_num", "state", "num_employees_code",
                             "num_employees", "naics_code", "naics_desc", "type_code",
                             "type", "birth_date", "sex", "date_received", "date_closed",
                             "closure_code", "closure_action", "monetary_benefits", "statute_code",
                             "statute", "basis_code", "basis", "issue_code", "issue",
                             "court_filing_date", "civil_action_num", "court", "resolution_date",
                             "case_type", "litigation_monetary_benefits"])
charges.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3443509 entries, 0 to 3443508
Data columns (total 28 columns):
fiscal_year                     object
charge_num                      object
state                           object
num_employees_code              object
num_employees                   object
naics_code                      float64
naics_desc                      object
type_code                       object
type                            object
birth_date                      object
sex                             object
date_received                   object
date_closed                     object
closure_code                    object
closure_action                  object
monetary_benefits               float64
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
court_filing_date          

Convert the date columns.

In [4]:
charges["birth_date"] = pd.to_datetime(charges["birth_date"], errors="coerce", format="%m/%d/%Y")
charges["date_received"] = pd.to_datetime(charges["date_received"], errors="coerce", format="%m/%d/%Y")
charges["date_closed"] = pd.to_datetime(charges["date_closed"], errors="coerce", format="%m/%d/%Y")
charges["court_filing_date"] = pd.to_datetime(charges["court_filing_date"], errors="coerce", format="%m/%d/%Y")
charges["resolution_date"] = pd.to_datetime(charges["resolution_date"], errors="coerce", format="%m/%d/%Y")
charges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3443509 entries, 0 to 3443508
Data columns (total 28 columns):
fiscal_year                     object
charge_num                      object
state                           object
num_employees_code              object
num_employees                   object
naics_code                      float64
naics_desc                      object
type_code                       object
type                            object
birth_date                      datetime64[ns]
sex                             object
date_received                   datetime64[ns]
date_closed                     datetime64[ns]
closure_code                    object
closure_action                  object
monetary_benefits               float64
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
cou

Import charge data for fiscal year 2010.

In [5]:
charges_10 = pd.read_csv("data/raw/Genre_2010.txt", sep="\t", skiprows=1,
                         dtype={0: str},
                         names=["charge_num", "state", "num_employees_code",
                                "num_employees", "naics_code", "naics_desc", "type_code",
                                "type", "birth_date", "sex", "date_received", "date_fepa_sent_to_eeoc",
                                "date_closed", "closure_code", "closure_action", "monetary_benefits",
                                "statute_code", "statute", "basis_code", "basis", "issue_code", "issue",
                                "court_filing_date", "civil_action_num", "court", "resolution_date",
                                "litigation_monetary_benefits", "case_type"])
charges_10.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343863 entries, 0 to 343862
Data columns (total 28 columns):
charge_num                      343863 non-null object
state                           343801 non-null object
num_employees_code              328035 non-null object
num_employees                   328035 non-null object
naics_code                      187222 non-null float64
naics_desc                      185282 non-null object
type_code                       343829 non-null object
type                            343829 non-null object
birth_date                      310000 non-null object
sex                             343453 non-null object
date_received                   343863 non-null object
date_fepa_sent_to_eeoc          20188 non-null object
date_closed                     230050 non-null object
closure_code                    230050 non-null object
closure_action                  230050 non-null object
monetary_benefits               37964 non-null object
statute_co

Convert the date columns.

In [6]:
charges_10["birth_date"] = pd.to_datetime(charges_10["birth_date"], errors="coerce", format="%m/%d/%y")
charges_10["date_received"] = pd.to_datetime(charges_10["date_received"], errors="coerce", format="%m/%d/%y")
charges_10["date_closed"] = pd.to_datetime(charges_10["date_closed"], errors="coerce", format="%m/%d/%y")
charges_10["court_filing_date"] = pd.to_datetime(charges_10["court_filing_date"], errors="coerce", format="%m/%d/%y")
charges_10["resolution_date"] = pd.to_datetime(charges_10["resolution_date"], errors="coerce", format="%m/%d/%y")
charges_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343863 entries, 0 to 343862
Data columns (total 28 columns):
charge_num                      343863 non-null object
state                           343801 non-null object
num_employees_code              328035 non-null object
num_employees                   328035 non-null object
naics_code                      187222 non-null float64
naics_desc                      185282 non-null object
type_code                       343829 non-null object
type                            343829 non-null object
birth_date                      310000 non-null datetime64[ns]
sex                             343453 non-null object
date_received                   343863 non-null datetime64[ns]
date_fepa_sent_to_eeoc          20188 non-null object
date_closed                     230050 non-null datetime64[ns]
closure_code                    230050 non-null object
closure_action                  230050 non-null object
monetary_benefits               37964 no

The columns in the 2010 charge data differ from those of the 2011 through 2017 data. We need to delete, add, rename and reorder the columns before concatenating the data.

In [7]:
charges_10.drop("date_fepa_sent_to_eeoc", axis=1, inplace=True)
charges_10["fiscal_year"] = "FY2010"
charges_10 = charges_10[["fiscal_year", "charge_num", "state", "num_employees_code", "num_employees", "naics_code", "naics_desc", "type_code",
                         "type", "birth_date", "sex", "date_received", "date_closed", "closure_code", "closure_action", "monetary_benefits",
                         "statute_code", "statute", "basis_code", "basis", "issue_code", "issue", "court_filing_date", "civil_action_num",
                         "court", "resolution_date", "case_type", "litigation_monetary_benefits"]]
charges_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343863 entries, 0 to 343862
Data columns (total 28 columns):
fiscal_year                     343863 non-null object
charge_num                      343863 non-null object
state                           343801 non-null object
num_employees_code              328035 non-null object
num_employees                   328035 non-null object
naics_code                      187222 non-null float64
naics_desc                      185282 non-null object
type_code                       343829 non-null object
type                            343829 non-null object
birth_date                      310000 non-null datetime64[ns]
sex                             343453 non-null object
date_received                   343863 non-null datetime64[ns]
date_closed                     230050 non-null datetime64[ns]
closure_code                    230050 non-null object
closure_action                  230050 non-null object
monetary_benefits               37964 n

Concatenate the two sets of charge data.

In [8]:
charges = pd.concat([charges, charges_10], ignore_index=True)
charges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3787372 entries, 0 to 3787371
Data columns (total 28 columns):
fiscal_year                     object
charge_num                      object
state                           object
num_employees_code              object
num_employees                   object
naics_code                      float64
naics_desc                      object
type_code                       object
type                            object
birth_date                      datetime64[ns]
sex                             object
date_received                   datetime64[ns]
date_closed                     datetime64[ns]
closure_code                    object
closure_action                  object
monetary_benefits               object
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
cour

Convert the columns to their proper data types.

In [9]:
charges = charges.astype(dtype={"fiscal_year": "category", "charge_num": str, "state": "category",
                            "num_employees_code": "category", "num_employees": "category",
                            "naics_code": "category", "naics_desc": "category",
                            "type_code": "category", "type": "category", "sex": "category",
                            "closure_code": "category", "closure_action": "category",
                            "statute_code": "category", "statute": "category",
                            "basis_code": "category", "basis": "category", "issue_code": "category",
                            "issue": "category", "court": "category", "case_type": "category"})
charges["monetary_benefits"] = pd.to_numeric(charges["monetary_benefits"], errors="coerce", downcast="float")
charges["litigation_monetary_benefits"] = pd.to_numeric(charges["litigation_monetary_benefits"], errors="coerce", downcast="float")
charges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3787372 entries, 0 to 3787371
Data columns (total 28 columns):
fiscal_year                     category
charge_num                      object
state                           category
num_employees_code              category
num_employees                   category
naics_code                      category
naics_desc                      category
type_code                       category
type                            category
birth_date                      datetime64[ns]
sex                             category
date_received                   datetime64[ns]
date_closed                     datetime64[ns]
closure_code                    category
closure_action                  category
monetary_benefits               float32
statute_code                    category
statute                         category
basis_code                      category
basis                           category
issue_code                      category
issue      

## How many alleged violations are we dealing with?

In [10]:
charges["charge_num"].count()

3787372

## And how many cases are we dealing with?

In [11]:
charges["charge_num"].nunique()

1056978

How many of these cases occurred in each year?

In [12]:
all_cases_by_year = charges.groupby("fiscal_year")["charge_num"].nunique().reset_index()
all_cases_by_year.rename(columns={"charge_num": "all_cases"}, inplace=True)
all_cases_by_year

Unnamed: 0,fiscal_year,all_cases
0,FY2010,104514
1,FY2011,146123
2,FY2012,146294
3,FY2013,139852
4,FY2014,134106
5,FY2015,130871
6,FY2016,132519
7,FY2017,122824


How does this break down by basis?

In [13]:
all_cases_by_basis = charges.groupby(["basis"])["charge_num"].nunique().reset_index()
all_cases_by_basis.rename(columns={"charge_num": "all_cases"}, inplace=True)
all_cases_by_basis.sort_values("all_cases", ascending=False).head()

Unnamed: 0,basis,all_cases
74,Retaliation,412515
60,Race-Black/African American,279225
0,Age,231539
76,Sex-Female,203779
50,Other Disability,109168


## In how many cases did the EEOC find merit to the complaint?

In [14]:
meritorious_outcomes = ["Case Settled By Legal Unit", "Conciliation Failure", "Successful Conciliation"]
meritorious_cases = charges[charges["closure_action"].isin(meritorious_outcomes)]
meritorious_cases["charge_num"].nunique()

17253

How many of these cases occurred in each year?

In [15]:
meritorious_cases_by_year = meritorious_cases.groupby("fiscal_year")["charge_num"].nunique().reset_index()
meritorious_cases_by_year.rename(columns={"charge_num": "meritorious_cases"}, inplace=True)
meritorious_cases_by_year

Unnamed: 0,fiscal_year,meritorious_cases
0,FY2010,885
1,FY2011,4369
2,FY2012,3411
3,FY2013,2888
4,FY2014,2414
5,FY2015,1478
6,FY2016,1482
7,FY2017,327


How does this break down by basis?

In [16]:
meritorious_cases_by_basis = meritorious_cases.groupby(["basis"])["charge_num"].nunique().reset_index()
meritorious_cases_by_basis.rename(columns={"charge_num": "meritorious_cases"}, inplace=True)
meritorious_cases_by_basis.sort_values("meritorious_cases", ascending=False).head()

Unnamed: 0,basis,meritorious_cases
74,Retaliation,6700
76,Sex-Female,3840
60,Race-Black/African American,3095
0,Age,2989
50,Other Disability,2188


## And in how many cases did the EEOC grant some form of relief (including non-monetary relief) to the complainant?

In [17]:
relief_outcomes = ["Settlement With Benefits", "Settlement With Benefits", "Successful Conciliation", "Case Settled By Legal Unit"]
relief_cases = charges[charges["closure_action"].isin(relief_outcomes)]
relief_cases["charge_num"].nunique()

89577

How many of these cases occurred in each year?

In [18]:
relief_cases_by_year = relief_cases.groupby("fiscal_year")["charge_num"].nunique().reset_index()
relief_cases_by_year.rename(columns={"charge_num": "relief_cases"}, inplace=True)
relief_cases_by_year

Unnamed: 0,fiscal_year,relief_cases
0,FY2010,8988
1,FY2011,15442
2,FY2012,14164
3,FY2013,12912
4,FY2014,11762
5,FY2015,10571
6,FY2016,9785
7,FY2017,5953


How does this break down by basis?

In [19]:
relief_cases_by_basis = relief_cases.groupby("basis")["charge_num"].nunique().reset_index()
relief_cases_by_basis.rename(columns={"charge_num": "relief_cases"}, inplace=True)
relief_cases_by_basis.sort_values("relief_cases", ascending=False).head()

Unnamed: 0,basis,relief_cases
74,Retaliation,32550
60,Race-Black/African American,21067
76,Sex-Female,18731
0,Age,17551
50,Other Disability,10079


## And in how many cases did a worker see any monetary benefits?

In [20]:
monetary_benefits_cases = charges[charges["monetary_benefits"] > 0]
monetary_benefits_cases["charge_num"].nunique()

112198

How much money did they get?

In [21]:
monetary_benefits_cases.groupby(["charge_num"])["monetary_benefits"].mean().sum()

2354293800.0

How many of these cases occurred in each year?

In [22]:
monetary_benefits_cases_by_year = monetary_benefits_cases.groupby("fiscal_year")["charge_num"].nunique().reset_index()
monetary_benefits_cases_by_year.rename(columns={"charge_num": "monetary_benefits_cases"}, inplace=True)
monetary_benefits_cases_by_year

Unnamed: 0,fiscal_year,monetary_benefits_cases
0,FY2010,1008
1,FY2011,19415
2,FY2012,18256
3,FY2013,17455
4,FY2014,16445
5,FY2015,15325
6,FY2016,14883
7,FY2017,9411


How does this break down by basis?

In [23]:
monetary_benefits_cases_by_basis = monetary_benefits_cases.groupby("basis")["charge_num"].nunique().reset_index()
monetary_benefits_cases_by_basis.rename(columns={"charge_num": "monetary_benefits_cases"}, inplace=True)
monetary_benefits_cases_by_basis.sort_values("monetary_benefits_cases", ascending=False).head()

Unnamed: 0,basis,monetary_benefits_cases
74,Retaliation,44311
60,Race-Black/African American,26514
76,Sex-Female,24606
0,Age,22668
50,Other Disability,12473


## Combine the data by year.

In [24]:
cases_by_year_dfs = [all_cases_by_year, meritorious_cases_by_year, relief_cases_by_year, monetary_benefits_cases_by_year]
cases_by_year = reduce(lambda left, right: pd.merge(left, right, on="fiscal_year"), cases_by_year_dfs)
cases_by_year

Unnamed: 0,fiscal_year,all_cases,meritorious_cases,relief_cases,monetary_benefits_cases
0,FY2010,104514,885,8988,1008
1,FY2011,146123,4369,15442,19415
2,FY2012,146294,3411,14164,18256
3,FY2013,139852,2888,12912,17455
4,FY2014,134106,2414,11762,16445
5,FY2015,130871,1478,10571,15325
6,FY2016,132519,1482,9785,14883
7,FY2017,122824,327,5953,9411


Calculate the proportion of all cases that fall into each category each fiscal year.

In [25]:
cases_by_year["pct_all_cases_meritorious"] = cases_by_year["meritorious_cases"] / cases_by_year["all_cases"]
cases_by_year["pct_all_cases_relief"] = cases_by_year["relief_cases"] / cases_by_year["all_cases"]
cases_by_year["pct_all_cases_monetary_benefits"] = cases_by_year["monetary_benefits_cases"] / cases_by_year["all_cases"]
cases_by_year = cases_by_year[["fiscal_year", "all_cases", "meritorious_cases", "pct_all_cases_meritorious", "relief_cases", "pct_all_cases_relief", "monetary_benefits_cases", "pct_all_cases_monetary_benefits"]]
cases_by_year

Unnamed: 0,fiscal_year,all_cases,meritorious_cases,pct_all_cases_meritorious,relief_cases,pct_all_cases_relief,monetary_benefits_cases,pct_all_cases_monetary_benefits
0,FY2010,104514,885,0.01,8988,0.09,1008,0.01
1,FY2011,146123,4369,0.03,15442,0.11,19415,0.13
2,FY2012,146294,3411,0.02,14164,0.1,18256,0.12
3,FY2013,139852,2888,0.02,12912,0.09,17455,0.12
4,FY2014,134106,2414,0.02,11762,0.09,16445,0.12
5,FY2015,130871,1478,0.01,10571,0.08,15325,0.12
6,FY2016,132519,1482,0.01,9785,0.07,14883,0.11
7,FY2017,122824,327,0.0,5953,0.05,9411,0.08


## Combine the data by basis.

In [26]:
cases_by_basis_dfs = [all_cases_by_basis, meritorious_cases_by_basis, relief_cases_by_basis, monetary_benefits_cases_by_basis]
cases_by_basis = reduce(lambda left, right: pd.merge(left, right, on="basis"), cases_by_basis_dfs)
cases_by_basis.sort_values("all_cases", ascending=False).head()

Unnamed: 0,basis,all_cases,meritorious_cases,relief_cases,monetary_benefits_cases
74,Retaliation,412515,6700,32550,44311
60,Race-Black/African American,279225,3095,21067,26514
0,Age,231539,2989,17551,22668
76,Sex-Female,203779,3840,18731,24606
50,Other Disability,109168,2188,10079,12473


Calculate the proportion of all cases that fall into each category by basis.

In [27]:
cases_by_basis["pct_all_cases_meritorious"] = cases_by_basis["meritorious_cases"] / cases_by_basis["all_cases"]
cases_by_basis["pct_all_cases_relief"] = cases_by_basis["relief_cases"] / cases_by_basis["all_cases"]
cases_by_basis["pct_all_cases_monetary_benefits"] = cases_by_basis["monetary_benefits_cases"] / cases_by_basis["all_cases"]
cases_by_basis = cases_by_basis[["basis", "all_cases", "meritorious_cases", "pct_all_cases_meritorious", "relief_cases", "pct_all_cases_relief", "monetary_benefits_cases", "pct_all_cases_monetary_benefits"]]
cases_by_basis.sort_values("all_cases", ascending=False).head()

Unnamed: 0,basis,all_cases,meritorious_cases,pct_all_cases_meritorious,relief_cases,pct_all_cases_relief,monetary_benefits_cases,pct_all_cases_monetary_benefits
74,Retaliation,412515,6700,0.02,32550,0.08,44311,0.11
60,Race-Black/African American,279225,3095,0.01,21067,0.08,26514,0.09
0,Age,231539,2989,0.01,17551,0.08,22668,0.1
76,Sex-Female,203779,3840,0.02,18731,0.09,24606,0.12
50,Other Disability,109168,2188,0.02,10079,0.09,12473,0.11


## What sorts of discriminatory bases were most likely to result in an outcome where the EEOC found merit in the complaint, the complainant got some form of relief and in which the complainant saw any monetary benefits and how does that compare with the overall number of those violations (minimum 100 cases)?

In [28]:
cases_by_basis[cases_by_basis["all_cases"] >= 100].sort_values("pct_all_cases_meritorious", ascending=False).head()

Unnamed: 0,basis,all_cases,meritorious_cases,pct_all_cases_meritorious,relief_cases,pct_all_cases_relief,monetary_benefits_cases,pct_all_cases_monetary_benefits
28,Genetic Information Discrimination,1853,101,0.05,158,0.09,160,0.09
67,Religion-7th Day Adventist,1205,62,0.05,152,0.13,138,0.11
48,Other,22499,928,0.04,2473,0.11,1743,0.08
21,Drug Addiction,1365,54,0.04,127,0.09,157,0.12
77,Sex-Gender Identity/Transgender,2082,78,0.04,172,0.08,213,0.1


In [29]:
cases_by_basis[cases_by_basis["all_cases"] >= 100].sort_values("pct_all_cases_relief", ascending=False).head()

Unnamed: 0,basis,all_cases,meritorious_cases,pct_all_cases_meritorious,relief_cases,pct_all_cases_relief,monetary_benefits_cases,pct_all_cases_monetary_benefits
39,Missing Digits/Limbs,2360,34,0.01,320,0.14,339,0.14
17,Cystic Fibrosis,101,2,0.02,13,0.13,18,0.18
12,Cerebral Palsy,807,25,0.03,102,0.13,111,0.14
67,Religion-7th Day Adventist,1205,62,0.05,152,0.13,138,0.11
79,Sex-Pregnancy,41019,944,0.02,4977,0.12,6227,0.15


In [30]:
cases_by_basis[cases_by_basis["all_cases"] >= 100].sort_values("pct_all_cases_monetary_benefits", ascending=False).head()

Unnamed: 0,basis,all_cases,meritorious_cases,pct_all_cases_meritorious,relief_cases,pct_all_cases_relief,monetary_benefits_cases,pct_all_cases_monetary_benefits
17,Cystic Fibrosis,101,2,0.02,13,0.13,18,0.18
79,Sex-Pregnancy,41019,944,0.02,4977,0.12,6227,0.15
11,Cancer,9749,284,0.03,1103,0.11,1422,0.15
65,Regarded As Disabled,30638,923,0.03,3253,0.11,4431,0.14
39,Missing Digits/Limbs,2360,34,0.01,320,0.14,339,0.14


Export the tables.

In [31]:
cases_by_year.to_csv("data/cases_by_year.csv", index=False)
cases_by_basis.to_csv("data/cases_by_basis.csv", index=False)