In [1]:
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 10000)

## Import and format EEOC employment discrimination data

The employment discrimination data spans 25 years. Data between 1992 and 5/20/2011 is formatted slightly differently than data from the latter part of 2011 through 2017 so we must import the text files separately.

First, let's import the 1992-2011 data.

In [2]:
complaints_92_11 = pd.read_csv("data/complaints_92_11.txt", sep="\t")
complaints_92_11.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4919234 entries, 0 to 4919233
Data columns (total 28 columns):
Unique ID                       object
State Code                      object
No of Employees Code            object
No of Employees                 object
NAICS Code                      object
NAICS Description               object
Institution Type Code           object
Institution Type                object
CP Date of Birth                object
CP Sex                          object
Date First Office               object
Date FEPA Sent to EEOC          object
Closure Date                    object
Closure Code                    object
Closure Type                    object
Monetary Benefits               object
Statute Code                    object
Statute                         object
Basis Code                      object
Basis                           object
Issue Code                      object
Issue                           object
Court Filing Date            

Now, let's import the 2011-2017 data.

In [3]:
complaints_11_17 = pd.read_csv("data/complaints_11_17.txt", sep="\t")
complaints_11_17.rename(columns={"(SELECTSUM(NVL(BACKPAY,0)+NVL(FRONTPAY,0)+NVL(INTEREST,0)+NVL(LIQUIDATED_DAMAGES,0)+NVL(NON_PEC_COMP_DAMAGES,0)+NVL(PEC_COMP_DAMAGES,0)+NVL(PUNITIVE_DAMAGES,0)+NVL(COSTS_AND_FEES,0))FROMLEG_BENEFITLBWHERELB.LEGCASE_CASE_SEQ=AL7.CASE_SEQ)":"litigation_monetary_benefits"}, inplace=True)
complaints_11_17.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3443516 entries, 0 to 3443515
Data columns (total 28 columns):
FISCAL_YEAR                     object
CHARGE_INQUIRY_SEQ              object
STATE_CODE                      object
NUMBER_OF_EMPLOYEES_CODE        object
NUMBER_OF_EMPLOYEES             object
NAICS_CODE                      object
NAICS_DESCRIPTION               object
INSTITUTION_TYPE_CODE           object
INSTITUTION_TYPE                object
DATE_OF_BIRTH                   object
SEX_CODE                        object
DATE_FIRST_OFFICE               object
CLOSURE_DATE                    object
CLOSURE_CODE                    object
CLOSURE_ACTION                  object
TOTAL_BENEFIT_AMOUNT            object
STATUTE_CODE                    object
STATUTE                         object
BASIS_CODE                      object
BASIS                           object
ISSUE_CODE                      object
ISSUE                           object
COURT_FILING_DATE            

Eliminate capital letters and spaces in column headers.

In [4]:
complaints_92_11.columns = map(str.lower, complaints_92_11.columns)
complaints_92_11.columns = complaints_92_11.columns.str.replace(" ","_")
complaints_11_17.columns = map(str.lower, complaints_11_17.columns)
complaints_11_17.columns = complaints_11_17.columns.str.replace(" ","_")

Drop repetitive header rows.

In [5]:
complaints_92_11 = complaints_92_11[complaints_92_11["unique_id"] != "Unique ID"]
complaints_11_17 = complaints_11_17[complaints_11_17["charge_inquiry_seq"] != "CHARGE_INQUIRY_SEQ"]

Convert the dataframes' columns to their appropriate data types.

In [6]:
complaints_92_11["monetary_benefits"] = pd.to_numeric(complaints_92_11["monetary_benefits"], errors="coerce", downcast='float')
complaints_92_11["litigation_monetary_benefits"] = pd.to_numeric(complaints_92_11["litigation_monetary_benefits"], errors="coerce", downcast='float')
complaints_11_17["total_benefit_amount"] = pd.to_numeric(complaints_11_17["total_benefit_amount"], errors="coerce", downcast='float')
complaints_11_17["litigation_monetary_benefits"] = pd.to_numeric(complaints_11_17["litigation_monetary_benefits"], errors="coerce", downcast='float')

In [7]:
complaints_92_11.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4919215 entries, 0 to 4919233
Data columns (total 28 columns):
unique_id                       object
state_code                      object
no_of_employees_code            object
no_of_employees                 object
naics_code                      object
naics_description               object
institution_type_code           object
institution_type                object
cp_date_of_birth                object
cp_sex                          object
date_first_office               object
date_fepa_sent_to_eeoc          object
closure_date                    object
closure_code                    object
closure_type                    object
monetary_benefits               float32
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
court_filing_date           

In [8]:
complaints_11_17.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3443510 entries, 0 to 3443515
Data columns (total 28 columns):
fiscal_year                     object
charge_inquiry_seq              object
state_code                      object
number_of_employees_code        object
number_of_employees             object
naics_code                      object
naics_description               object
institution_type_code           object
institution_type                object
date_of_birth                   object
sex_code                        object
date_first_office               object
closure_date                    object
closure_code                    object
closure_action                  object
total_benefit_amount            float32
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
court_filing_date           

We'll want to run groupby queries in our analysis. Python ignores NaN values in such queries so we need to fill in those values with a placeholder.

In [9]:
complaints_92_11.fillna(0, inplace=True)
complaints_92_11.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4919215 entries, 0 to 4919233
Data columns (total 28 columns):
unique_id                       object
state_code                      object
no_of_employees_code            object
no_of_employees                 object
naics_code                      object
naics_description               object
institution_type_code           object
institution_type                object
cp_date_of_birth                object
cp_sex                          object
date_first_office               object
date_fepa_sent_to_eeoc          object
closure_date                    object
closure_code                    object
closure_type                    object
monetary_benefits               float32
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
court_filing_date           

In [10]:
complaints_11_17.fillna(0, inplace=True)
complaints_11_17.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3443510 entries, 0 to 3443515
Data columns (total 28 columns):
fiscal_year                     object
charge_inquiry_seq              object
state_code                      object
number_of_employees_code        object
number_of_employees             object
naics_code                      object
naics_description               object
institution_type_code           object
institution_type                object
date_of_birth                   object
sex_code                        object
date_first_office               object
closure_date                    object
closure_code                    object
closure_action                  object
total_benefit_amount            float32
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
court_filing_date           

## How many charges and cases are filed by year?

Now, let's analyze the data. We can start by seeing how many charges are filed by year.

In [11]:
charges_by_year_11_17 = complaints_11_17.groupby("fiscal_year").size().reset_index(name="charges")
charges_by_year_11_17

Unnamed: 0,fiscal_year,charges
0,0,3
1,,1
2,FY2011,480833
3,FY2012,485451
4,FY2013,486186
5,FY2014,496966
6,FY2015,491092
7,FY2016,514604
8,FY2017,488374


How does that compare with the earlier years of data? Because this set of data doesn't have a handy fiscal year column, we need to convert the "date_first_office" column to datetime, extract the year and group on that.

In [12]:
complaints_92_11["date_first_office"] = pd.to_datetime(complaints_92_11["date_first_office"], errors="coerce", infer_datetime_format=True)
complaints_92_11["year"] = complaints_92_11["date_first_office"].dt.year
charges_by_year_92_11 = complaints_92_11.groupby("year").size().reset_index(name="charges")

And how many cases are filed by year?

In [13]:
cases_by_year_11_17 = complaints_11_17.groupby("fiscal_year").agg({"charge_inquiry_seq": pd.Series.nunique})
cases_by_year_11_17

Unnamed: 0_level_0,charge_inquiry_seq
fiscal_year,Unnamed: 1_level_1
0,1
,1
FY2011,163274
FY2012,166765
FY2013,160906
FY2014,161383
FY2015,155320
FY2016,150822
FY2017,173986


And what about for the earlier years of data?

In [14]:
cases_by_year_92_11 = complaints_92_11.groupby("year").agg({"unique_id": pd.Series.nunique})
cases_by_year_92_11

Unnamed: 0_level_0,unique_id
year,Unnamed: 1_level_1
1970,1
1991,17719
1992,78825
1993,93383
1994,99302
1995,88865
1996,89468
1997,91382
1998,86738
1999,86215


## How are different types of cases resolved?

Next, we'd like to know how different types of cases ("bases") are resolved. To do so, we'll need to group by the basis and closure action.

In [15]:
resolution_by_basis_11_17 = pd.pivot_table(complaints_11_17, index=["basis", "closure_action"], values="charge_inquiry_seq", aggfunc=len)
resolution_by_basis_92_11 = pd.pivot_table(complaints_92_11, index=["basis", "closure_type"], values="unique_id", aggfunc=len)
resolution_by_basis_11_17.rename(columns={"charge_inquiry_seq": "charges_11_17"}, inplace=True)
resolution_by_basis_92_11.rename(columns={"unique_id": "charges_92_11"}, inplace=True)

In [16]:
resolution_by_basis_11_17.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,charges_11_17
basis,closure_action,Unnamed: 2_level_1
0,0,4
Age,0,11210
Age,ADEA Sect. 7(D) Closure,1134
Age,Administrative Closure,2799
Age,CP Failed To Cooperate,623


In [17]:
resolution_by_basis_92_11.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,charges_92_11
basis,closure_type,Unnamed: 2_level_1
0,0,4
0,Conciliation Failure,4
0,NRTS Issued At CP Request,25
0,No Cause Finding Issued,1
0,Settlement With Benefits,9


Now we'll reset the indexes, calculate the total number of charges per basis and calculate the percentage of each basis that receives a partcular closure action.

In [18]:
resolution_by_basis_11_17.reset_index(inplace=True)
resolution_by_basis_11_17["total_by_basis"] = resolution_by_basis_11_17["charges_11_17"].groupby(resolution_by_basis_11_17["basis"]).transform(sum)
resolution_by_basis_11_17["percent_of_charges"] = resolution_by_basis_11_17["charges_11_17"] / resolution_by_basis_11_17["total_by_basis"]
resolution_by_basis_11_17["rank"] = resolution_by_basis_11_17.groupby(["basis"])["percent_of_charges"].rank(method="min", ascending=False).astype(int)
resolution_by_basis_92_11.reset_index(inplace=True)
resolution_by_basis_92_11["total_by_basis"] = resolution_by_basis_92_11["charges_92_11"].groupby(resolution_by_basis_92_11["basis"]).transform(sum)
resolution_by_basis_92_11["percent_of_charges"] = resolution_by_basis_92_11["charges_92_11"] / resolution_by_basis_92_11["total_by_basis"]
resolution_by_basis_92_11["rank"] = resolution_by_basis_92_11.groupby(["basis"])["percent_of_charges"].rank(method="min", ascending=False).astype(int)

In [19]:
resolution_by_basis_11_17.head(25)

Unnamed: 0,basis,closure_action,charges_11_17,total_by_basis,percent_of_charges,rank
0,0,0,4,4,1.0,1
1,Age,0,11210,346135,0.032386,6
2,Age,ADEA Sect. 7(D) Closure,1134,346135,0.003276,13
3,Age,Administrative Closure,2799,346135,0.008086,10
4,Age,CP Failed To Cooperate,623,346135,0.0018,14
5,Age,CP Failed To Respond To 30-Day Letter,422,346135,0.001219,15
6,Age,CP Filed Suit,3273,346135,0.009456,9
7,Age,CP Refused Full Relief,20,346135,5.8e-05,22
8,Age,CP Withdrawal - No Ben.,9463,346135,0.027339,7
9,Age,Case Settled By Legal Unit,44,346135,0.000127,18


In [20]:
resolution_by_basis_92_11.head(25)

Unnamed: 0,basis,closure_type,charges_92_11,total_by_basis,percent_of_charges,rank
0,0,0,4,43,0.093023,3
1,0,Conciliation Failure,4,43,0.093023,3
2,0,NRTS Issued At CP Request,25,43,0.581395,1
3,0,No Cause Finding Issued,1,43,0.023256,5
4,0,Settlement With Benefits,9,43,0.209302,2
5,Abortion,NRTS Issued At CP Request,2,2,1.0,1
6,Age,0,32434,573302,0.056574,4
7,Age,ADEA Sect. 7(D) Closure,10740,573302,0.018734,9
8,Age,Administrative Closure,8940,573302,0.015594,10
9,Age,CP Failed To Cooperate,2460,573302,0.004291,13


How do the bases of complaints differ, if at all, based on the complainants gender?

In [31]:
female_complaints_11_17 = complaints_11_17.loc[complaints_11_17["sex_code"] == "F", ["charge_inquiry_seq"]]
female_complaints_11_17.head()

Unnamed: 0,charge_inquiry_seq
1,4423978
2,4554737
4,4537825
8,4493406
10,4583038


In [None]:
#female_complaints_92_11 = complaints_92_11.loc[complaints_92_11["cp_sex"] == "F", ["unique_id"]]
#women_charges_92_11 = complaints_92_11.merge(female_complaints_92_11, on="unique_id", how="inner")

In [30]:
women_charges_11_17 = complaints_11_17.merge(female_complaints_11_17, on="charge_inquiry_seq", how="inner")

Unnamed: 0,charge_inquiry_seq
1,4423978
2,4554737
4,4537825
8,4493406
10,4583038


Now we'll export the data to Excel for further analysis.

In [131]:
writer = pd.ExcelWriter("data/employment_discrimination.xlsx")
resolution_by_basis_11_17.to_excel(writer, "resolution_by_basis_11_17", startcol=0, index=False)
resolution_by_basis_92_11.to_excel(writer, "resolution_by_basis_92_11", startcol=0, index=False)
writer.save()

How are charges resolved by state?

In [16]:
resolution_by_state_11_17 = pd.pivot_table(complaints_11_17, index=["state_code", "closure_action"], values="charge_inquiry_seq", aggfunc=len)
resolution_by_state_92_11 = pd.pivot_table(complaints_92_11, index=["state_code", "closure_type"], values="unique_id", aggfunc=len)
resolution_by_state_11_17.rename(columns={"charge_inquiry_seq": "charges_11_17"}, inplace=True)
resolution_by_state_92_11.rename(columns={"unique_id": "charges_92_11"}, inplace=True)

Save our query outputs to Excel.

In [18]:
writer = pd.ExcelWriter("data/employment_discrimination.xlsx")
resolution_by_basis_11_17.to_excel(writer, "resolution_by_basis", startcol=0, index=True)
resolution_by_basis_92_11.to_excel(writer, "resolution_by_basis", startcol=3, index=True)
resolution_by_state_11_17.to_excel(writer, "resolution_by_state", startcol=0, index=True)
resolution_by_state_92_11.to_excel(writer, "resolution_by_state", startcol=3, index=True)
writer.save()

What's going on with legislative bodies?

In [37]:
legislative_bodies_11_17 = complaints_11_17[complaints_11_17["naics_description"].str.contains("legislative", case=False, na=False)]
legislative_bodies_11_17.to_excel("data/legislative_bodies_11_17.xlsx")

In [46]:
legislative_bodies_11_17.groupby("state_code")["state_code"].count().sort_values(ascending=False)

state_code
OH    545
VA    527
NC    437
LA    432
CO    427
FL    426
AZ    344
CA    267
NY    262
TX    205
AR    200
MI    191
PA    182
IL    180
WA    162
MO    145
KS    130
OK    126
IA    115
NJ    106
MD    100
OR     95
AK     91
NM     76
MN     71
GA     66
TN     62
VT     51
MA     51
IN     48
NV     39
SC     37
WV     37
DC     37
CT     35
ME     32
KY     28
NE     26
NH     20
AL     19
MS     19
WI     19
WY     17
ND     14
DE     13
PR     13
UT     13
HI     11
RI      9
SD      2
Name: state_code, dtype: int64