In [1]:
import json
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 100)
pd.options.display.float_format = "{:,.2f}".format

## Import and format the data

Import charge data for fiscal years 2011-2017.

In [2]:
charges = pd.read_csv("data/raw/charges_11_17.txt", sep="\t", skiprows=1,
                      dtype={1: str},
                      names=["fiscal_year", "charge_num", "state", "num_employees_code",
                             "num_employees", "naics_code", "naics_desc", "type_code",
                             "type", "birth_date", "sex", "date_received", "date_closed",
                             "closure_code", "closure_action", "monetary_benefits", "statute_code",
                             "statute", "basis_code", "basis", "issue_code", "issue",
                             "court_filing_date", "civil_action_num", "court", "resolution_date",
                             "case_type", "litigation_monetary_benefits"])
charges.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3443509 entries, 0 to 3443508
Data columns (total 28 columns):
fiscal_year                     object
charge_num                      object
state                           object
num_employees_code              object
num_employees                   object
naics_code                      float64
naics_desc                      object
type_code                       object
type                            object
birth_date                      object
sex                             object
date_received                   object
date_closed                     object
closure_code                    object
closure_action                  object
monetary_benefits               float64
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
court_filing_date          

Convert the date columns.

In [3]:
charges["birth_date"] = pd.to_datetime(charges["birth_date"], errors="coerce", format="%m/%d/%Y")
charges["date_received"] = pd.to_datetime(charges["date_received"], errors="coerce", format="%m/%d/%Y")
charges["date_closed"] = pd.to_datetime(charges["date_closed"], errors="coerce", format="%m/%d/%Y")
charges["court_filing_date"] = pd.to_datetime(charges["court_filing_date"], errors="coerce", format="%m/%d/%Y")
charges["resolution_date"] = pd.to_datetime(charges["resolution_date"], errors="coerce", format="%m/%d/%Y")
charges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3443509 entries, 0 to 3443508
Data columns (total 28 columns):
fiscal_year                     object
charge_num                      object
state                           object
num_employees_code              object
num_employees                   object
naics_code                      float64
naics_desc                      object
type_code                       object
type                            object
birth_date                      datetime64[ns]
sex                             object
date_received                   datetime64[ns]
date_closed                     datetime64[ns]
closure_code                    object
closure_action                  object
monetary_benefits               float64
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
cou

Import charge data for fiscal year 2010.

In [4]:
charges_10 = pd.read_csv("data/raw/Genre_2010.txt", sep="\t", skiprows=1,
                         dtype={0: str},
                         names=["charge_num", "state", "num_employees_code",
                                "num_employees", "naics_code", "naics_desc", "type_code",
                                "type", "birth_date", "sex", "date_received", "date_fepa_sent_to_eeoc",
                                "date_closed", "closure_code", "closure_action", "monetary_benefits",
                                "statute_code", "statute", "basis_code", "basis", "issue_code", "issue",
                                "court_filing_date", "civil_action_num", "court", "resolution_date",
                                "litigation_monetary_benefits", "case_type"])
charges_10.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343863 entries, 0 to 343862
Data columns (total 28 columns):
charge_num                      343863 non-null object
state                           343801 non-null object
num_employees_code              328035 non-null object
num_employees                   328035 non-null object
naics_code                      187222 non-null float64
naics_desc                      185282 non-null object
type_code                       343829 non-null object
type                            343829 non-null object
birth_date                      310000 non-null object
sex                             343453 non-null object
date_received                   343863 non-null object
date_fepa_sent_to_eeoc          20188 non-null object
date_closed                     230050 non-null object
closure_code                    230050 non-null object
closure_action                  230050 non-null object
monetary_benefits               37964 non-null object
statute_co

Convert the date columns.

In [5]:
charges_10["birth_date"] = pd.to_datetime(charges_10["birth_date"], errors="coerce", format="%m/%d/%y")
charges_10["date_received"] = pd.to_datetime(charges_10["date_received"], errors="coerce", format="%m/%d/%y")
charges_10["date_closed"] = pd.to_datetime(charges_10["date_closed"], errors="coerce", format="%m/%d/%y")
charges_10["court_filing_date"] = pd.to_datetime(charges_10["court_filing_date"], errors="coerce", format="%m/%d/%y")
charges_10["resolution_date"] = pd.to_datetime(charges_10["resolution_date"], errors="coerce", format="%m/%d/%y")
charges_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343863 entries, 0 to 343862
Data columns (total 28 columns):
charge_num                      343863 non-null object
state                           343801 non-null object
num_employees_code              328035 non-null object
num_employees                   328035 non-null object
naics_code                      187222 non-null float64
naics_desc                      185282 non-null object
type_code                       343829 non-null object
type                            343829 non-null object
birth_date                      310000 non-null datetime64[ns]
sex                             343453 non-null object
date_received                   343863 non-null datetime64[ns]
date_fepa_sent_to_eeoc          20188 non-null object
date_closed                     230050 non-null datetime64[ns]
closure_code                    230050 non-null object
closure_action                  230050 non-null object
monetary_benefits               37964 no

The columns in the 2010 charge data differ from those of the 2011 through 2017 data. We need to delete, add, rename and reorder the columns before concatenating the data.

In [6]:
charges_10.drop("date_fepa_sent_to_eeoc", axis=1, inplace=True)
charges_10["fiscal_year"] = "FY2010"
charges_10 = charges_10[["fiscal_year", "charge_num", "state", "num_employees_code", "num_employees", "naics_code", "naics_desc", "type_code",
                         "type", "birth_date", "sex", "date_received", "date_closed", "closure_code", "closure_action", "monetary_benefits",
                         "statute_code", "statute", "basis_code", "basis", "issue_code", "issue", "court_filing_date", "civil_action_num",
                         "court", "resolution_date", "case_type", "litigation_monetary_benefits"]]
charges_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343863 entries, 0 to 343862
Data columns (total 28 columns):
fiscal_year                     343863 non-null object
charge_num                      343863 non-null object
state                           343801 non-null object
num_employees_code              328035 non-null object
num_employees                   328035 non-null object
naics_code                      187222 non-null float64
naics_desc                      185282 non-null object
type_code                       343829 non-null object
type                            343829 non-null object
birth_date                      310000 non-null datetime64[ns]
sex                             343453 non-null object
date_received                   343863 non-null datetime64[ns]
date_closed                     230050 non-null datetime64[ns]
closure_code                    230050 non-null object
closure_action                  230050 non-null object
monetary_benefits               37964 n

Concatenate the two sets of charge data.

In [7]:
charges = pd.concat([charges, charges_10], ignore_index=True)
charges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3787372 entries, 0 to 3787371
Data columns (total 28 columns):
fiscal_year                     object
charge_num                      object
state                           object
num_employees_code              object
num_employees                   object
naics_code                      float64
naics_desc                      object
type_code                       object
type                            object
birth_date                      datetime64[ns]
sex                             object
date_received                   datetime64[ns]
date_closed                     datetime64[ns]
closure_code                    object
closure_action                  object
monetary_benefits               object
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
cour

Convert the columns to their proper data types.

In [8]:
charges = charges.astype(dtype={"fiscal_year": "category", "charge_num": str, "state": "category",
                            "num_employees_code": "category", "num_employees": "category",
                            "naics_code": "category", "naics_desc": "category",
                            "type_code": "category", "type": "category", "sex": "category",
                            "closure_code": "category", "closure_action": "category",
                            "statute_code": "category", "statute": "category",
                            "basis_code": "category", "basis": "category", "issue_code": "category",
                            "issue": "category", "court": "category", "case_type": "category"})
charges["monetary_benefits"] = pd.to_numeric(charges["monetary_benefits"], errors="coerce", downcast="float")
charges["litigation_monetary_benefits"] = pd.to_numeric(charges["litigation_monetary_benefits"], errors="coerce", downcast="float")
charges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3787372 entries, 0 to 3787371
Data columns (total 28 columns):
fiscal_year                     category
charge_num                      object
state                           category
num_employees_code              category
num_employees                   category
naics_code                      category
naics_desc                      category
type_code                       category
type                            category
birth_date                      datetime64[ns]
sex                             category
date_received                   datetime64[ns]
date_closed                     datetime64[ns]
closure_code                    category
closure_action                  category
monetary_benefits               float32
statute_code                    category
statute                         category
basis_code                      category
basis                           category
issue_code                      category
issue      

## How many alleged violations are we dealing with?

In [9]:
charges["charge_num"].count()

3787372

## And how many cases are we dealing with?

In [10]:
charges["charge_num"].nunique()

1056978

## In how many cases did the EEOC find merit to the complaint?

In [11]:
meritorious_outcomes = ["Case Settled By Legal Unit", "Conciliation Failure", "Successful Conciliation"]
meritorious_charges = charges[charges["closure_action"].isin(meritorious_outcomes)]
meritorious_charges["charge_num"].nunique()

17253

## And in how many cases did a worker see any monetary benefits?

In [12]:
monetary_benefits_charges = charges[charges["monetary_benefits"] > 0]

In [13]:
monetary_benefits_charges["charge_num"].nunique()

112198

In [14]:
monetary_benefits_charges.groupby(["charge_num"])["monetary_benefits"].mean().sum()

2354293800.0

## What sorts of discriminatory bases were most likely to result in an outcome where the complainant saw any monetary benefits and how does that compare with the overall number of those violations?

### Calculate the number of cases grouped by basis.

In [15]:
all_cases = charges.groupby(["charge_num", "basis"])["issue"].count().reset_index()
all_cases.rename(columns={"issue": "num_issues"}, inplace=True)
all_cases.sort_values("charge_num").head()

Unnamed: 0,charge_num,basis,num_issues
0,3288411.72,Age,1
1,3288411.72,Retaliation,2
2,3593445.5595,Retaliation,2
3,3593445.5595,Sex-Female,2
4,3848821.6365,Retaliation,1


In [16]:
all_cases_by_basis = all_cases.groupby("basis").count().reset_index()
all_cases_by_basis.rename(columns={"charge_num": "num_cases"}, inplace=True)
all_cases_by_basis.drop("num_issues", axis=1, inplace=True)
all_cases_by_basis["percent_of_cases"] = all_cases_by_basis["num_cases"] / charges["charge_num"].nunique()
all_cases_by_basis["rank"] = all_cases_by_basis["percent_of_cases"].rank(method="min", ascending=False).astype(int)
all_cases_by_basis.sort_values("num_cases", ascending=False).head(1)

Unnamed: 0,basis,num_cases,percent_of_cases,rank
74,Retaliation,412515,0.39,1


### Calculate the number of cases, grouped by basis, that resulted in an outcome where the complainant saw monetary benefits.

In [17]:
monetary_benefits_cases = monetary_benefits_charges.groupby(["charge_num", "basis"])["issue"].count().reset_index()
monetary_benefits_cases.rename(columns={"issue": "num_issues"}, inplace=True)
monetary_benefits_cases.sort_values("charge_num").head()

Unnamed: 0,charge_num,basis,num_issues
0,3097456,Retaliation,1
1,3832804,Race-Black/African American,1
2,3832804,Sex-Male,1
3,3855513,Race-Black/African American,3
4,3855513,Sex-Male,3


In [18]:
monetary_benefits_by_basis = monetary_benefits_cases.groupby("basis").count().reset_index()
monetary_benefits_by_basis.rename(columns={"charge_num": "num_cases"}, inplace=True)
monetary_benefits_by_basis.drop("num_issues", axis=1, inplace=True)
monetary_benefits_by_basis["percent_of_cases"] = monetary_benefits_by_basis["num_cases"] / monetary_benefits_charges["charge_num"].nunique()
monetary_benefits_by_basis["rank"] = monetary_benefits_by_basis["percent_of_cases"].rank(method="min", ascending=False).astype(int)
monetary_benefits_by_basis.sort_values("num_cases", ascending=False).head(1)

Unnamed: 0,basis,num_cases,percent_of_cases,rank
74,Retaliation,44311,0.39,1


### Join the tables.

In [19]:
cases_by_basis_and_benefits = all_cases_by_basis.merge(monetary_benefits_by_basis, how="left", on="basis", suffixes=["", "_monetary_benefits"])
cases_by_basis_and_benefits.sort_values("rank", ascending=True).head()

Unnamed: 0,basis,num_cases,percent_of_cases,rank,num_cases_monetary_benefits,percent_of_cases_monetary_benefits,rank_monetary_benefits
74,Retaliation,412515,0.39,1,44311,0.39,1
60,Race-Black/African American,279225,0.26,2,26514,0.24,2
0,Age,231539,0.22,3,22668,0.2,4
76,Sex-Female,203779,0.19,4,24606,0.22,3
50,Other Disability,109168,0.1,5,12473,0.11,5


In [20]:
cases_by_basis_and_benefits.to_csv("data/cases_by_basis_and_benefits.csv", index=False)