In [1]:
import pandas as pd
pd.set_option("display.max_columns", 50)

## Format 2000-2010 data

Concatenate 2000 through Feb. 2010 complainant data from the EEOC.

In [2]:
!sh process_eeoc.sh

Import complainant data.

In [3]:
complainants = pd.read_csv("data/raw/complainants_00_09.txt", sep="\t", skiprows=1,
                           names=["complainant_id", "date_received", "date_closed",
                                  "receiving_office", "accountability_office", "eeoc_or_fepa",
                                  "processing_category", "statute", "basis", "issue", "city",
                                  "county", "state", "naics", "num_employees", "type", "race",
                                  "sex", "nat_origin", "birth_date"],
                           dtype={"receiving_office": "category", "accountability_office": "category",
                                  "eeoc_or_fepa": "category","processing_category": "category", "statute": "category",
                                  "basis": "category", "issue": "category","city": "category", "county": "category",
                                  "state": "category", "naics": "category", "num_employees": "category",
                                  "type": "category", "race": "category", "sex": "category", "nat_origin": "category"})
complainants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 925611 entries, 0 to 925610
Data columns (total 20 columns):
complainant_id           925611 non-null object
date_received            925611 non-null object
date_closed              836105 non-null object
receiving_office         925611 non-null category
accountability_office    925611 non-null category
eeoc_or_fepa             925611 non-null category
processing_category      921323 non-null category
statute                  925530 non-null category
basis                    925274 non-null category
issue                    925573 non-null category
city                     925536 non-null category
county                   913908 non-null category
state                    925356 non-null category
naics                    620933 non-null category
num_employees            900728 non-null category
type                     925463 non-null category
race                     838595 non-null category
sex                      925236 non-null cate

Convert the date columns.

In [4]:
complainants["date_received"] = pd.to_datetime(complainants["date_received"], errors="coerce", format="%m/%d/%y")
complainants["date_closed"] = pd.to_datetime(complainants["date_closed"], errors="coerce", format="%m/%d/%y")
complainants["birth_date"] = pd.to_datetime(complainants["birth_date"], errors="coerce", format="%m/%d/%y")
complainants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 925611 entries, 0 to 925610
Data columns (total 20 columns):
complainant_id           925611 non-null object
date_received            925611 non-null datetime64[ns]
date_closed              836105 non-null datetime64[ns]
receiving_office         925611 non-null category
accountability_office    925611 non-null category
eeoc_or_fepa             925611 non-null category
processing_category      921323 non-null category
statute                  925530 non-null category
basis                    925274 non-null category
issue                    925573 non-null category
city                     925536 non-null category
county                   913908 non-null category
state                    925356 non-null category
naics                    620933 non-null category
num_employees            900728 non-null category
type                     925463 non-null category
race                     838595 non-null category
sex                      9252

Because the complainant data only contains records through the end of February 2010, we will remove records with a date_received after December 31, 2009 in favor of 2010 charge data.

In [5]:
complainants.sort_values("date_received", ascending=False).head(1)

Unnamed: 0,complainant_id,date_received,date_closed,receiving_office,accountability_office,eeoc_or_fepa,processing_category,statute,basis,issue,city,county,state,naics,num_employees,type,race,sex,nat_origin,birth_date
81864,6184056.122,2010-02-28,NaT,Intake Information Group,Phoenix District Office,EEOC,A,A,/OA,/D2/S5,Salt Lake City,Salt Lake,UT,,501+ Employees,Private Employer,American Indian or Alaska Native,F,American (U.S.) National Origin,2055-11-29


## Format 2010-2017 data

Import charge data for fiscal year 2011-2017.

In [6]:
charges = pd.read_csv("data/raw/charges_11_17.txt", sep="\t", skiprows=1,
                      names=["fiscal_year", "charge_num", "state", "num_employees_code",
                             "num_employees", "naics_code", "naics_desc", "type_code",
                             "type", "birth_date", "sex", "date_received", "date_closed",
                             "closure_code", "closure_action", "monetary_benefits", "statute_code",
                             "statute", "basis_code", "basis", "issue_code", "issue",
                             "court_filing_date", "civil_action_num", "court", "resolution_date",
                             "case_type", "litigation_monetary_benefits"],
                      dtype={"fiscal_year": "category", "state": "category",
                            "num_employees_code": "category", "num_employees": "category",
                            "naics_code": "category", "naics_desc": "category",
                            "type_code": "category", "type": "category", "sex": "category",
                            "closure_code": "category", "closure_action": "category",
                            "statute_code": "category", "statute": "category",
                            "basis_code": "category", "basis": "category", "issue_code": "category",
                            "issue": "category", "court": "category", "case_type": "category"})
charges.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3443509 entries, 0 to 3443508
Data columns (total 28 columns):
fiscal_year                     category
charge_num                      object
state                           category
num_employees_code              category
num_employees                   category
naics_code                      category
naics_desc                      category
type_code                       category
type                            category
birth_date                      object
sex                             category
date_received                   object
date_closed                     object
closure_code                    category
closure_action                  category
monetary_benefits               float64
statute_code                    category
statute                         category
basis_code                      category
basis                           category
issue_code                      category
issue                           cat

Import charge data for fiscal year 2010.

In [7]:
charges_10 = pd.read_csv("data/raw/Genre_2010.txt", sep="\t", skiprows=1,
                         names=["charge_num", "state", "num_employees_code",
                                "num_employees", "naics_code", "naics_desc", "type_code",
                                "type", "birth_date", "sex", "date_received", "date_fepa_sent_to_eeoc",
                                "date_closed", "closure_code", "closure_action", "monetary_benefits",
                                "statute_code", "statute", "basis_code", "basis", "issue_code", "issue",
                                "court_filing_date", "civil_action_num", "court", "resolution_date",
                                "litigation_monetary_benefits", "case_type"],
                         dtype={"state": "category", "num_employees_code": "category",
                                "num_employees": "category", "naics_code": "category",
                                "naics_desc": "category", "type_code": "category",
                                "type": "category", "sex": "category",
                                "closure_code": "category", "closure_action": "category",
                                "statute_code": "category", "statute": "category",
                                "basis_code": "category", "basis": "category",
                                "issue_code": "category", "issue": "category",
                                "court": "category", "case_type": "category"})
charges_10.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343863 entries, 0 to 343862
Data columns (total 28 columns):
charge_num                      343863 non-null object
state                           343801 non-null category
num_employees_code              328035 non-null category
num_employees                   328035 non-null category
naics_code                      187222 non-null category
naics_desc                      185282 non-null category
type_code                       343829 non-null category
type                            343829 non-null category
birth_date                      310000 non-null object
sex                             343453 non-null category
date_received                   343863 non-null object
date_fepa_sent_to_eeoc          20188 non-null object
date_closed                     230050 non-null object
closure_code                    230050 non-null category
closure_action                  230050 non-null category
monetary_benefits               37964 non-nul

The names and order of columns in the 2010 charge data differ from those of the 2011 through 2017 data. We need to delete, add, rename and reorder the columns before concatenating the data.

In [8]:
charges_10.drop("date_fepa_sent_to_eeoc", axis=1, inplace=True)
charges_10["fiscal_year"] = ""
charges_10 = charges_10[["fiscal_year", "charge_num", "state", "num_employees_code", "num_employees", "naics_code", "naics_desc", "type_code",
                         "type", "birth_date", "sex", "date_received", "date_closed", "closure_code", "closure_action", "monetary_benefits",
                         "statute_code", "statute", "basis_code", "basis", "issue_code", "issue", "court_filing_date", "civil_action_num",
                         "court", "resolution_date", "case_type", "litigation_monetary_benefits"]]
charges_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343863 entries, 0 to 343862
Data columns (total 28 columns):
fiscal_year                     343863 non-null object
charge_num                      343863 non-null object
state                           343801 non-null category
num_employees_code              328035 non-null category
num_employees                   328035 non-null category
naics_code                      187222 non-null category
naics_desc                      185282 non-null category
type_code                       343829 non-null category
type                            343829 non-null category
birth_date                      310000 non-null object
sex                             343453 non-null category
date_received                   343863 non-null object
date_closed                     230050 non-null object
closure_code                    230050 non-null category
closure_action                  230050 non-null category
monetary_benefits               37964 non-nu

Concatenate the two sets of charge data.

In [9]:
charges = pd.concat([charges, charges_10], ignore_index=True)
charges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3787372 entries, 0 to 3787371
Data columns (total 28 columns):
fiscal_year                     object
charge_num                      object
state                           object
num_employees_code              object
num_employees                   category
naics_code                      object
naics_desc                      object
type_code                       object
type                            object
birth_date                      object
sex                             category
date_received                   object
date_closed                     object
closure_code                    object
closure_action                  object
monetary_benefits               object
statute_code                    object
statute                         category
basis_code                      object
basis                           object
issue_code                      category
issue                           category
court_filing_date  

Convert the date columns.

In [10]:
charges["birth_date"] = pd.to_datetime(charges["birth_date"], errors="coerce", format="%m/%d/%y")
charges["date_received"] = pd.to_datetime(charges["date_received"], errors="coerce", format="%m/%d/%y")
charges["date_closed"] = pd.to_datetime(charges["date_closed"], errors="coerce", format="%m/%d/%y")
charges["court_filing_date"] = pd.to_datetime(charges["court_filing_date"], errors="coerce", format="%m/%d/%y")
charges["resolution_date"] = pd.to_datetime(charges["resolution_date"], errors="coerce", format="%m/%d/%y")
charges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3787372 entries, 0 to 3787371
Data columns (total 28 columns):
fiscal_year                     object
charge_num                      object
state                           object
num_employees_code              object
num_employees                   category
naics_code                      object
naics_desc                      object
type_code                       object
type                            object
birth_date                      datetime64[ns]
sex                             category
date_received                   datetime64[ns]
date_closed                     datetime64[ns]
closure_code                    object
closure_action                  object
monetary_benefits               object
statute_code                    object
statute                         category
basis_code                      object
basis                           object
issue_code                      category
issue                           cate

In [11]:
charges_10["birth_date"] = pd.to_datetime(charges_10["birth_date"], errors="coerce", format="%m/%d/%y")
charges_10["date_received"] = pd.to_datetime(charges_10["date_received"], errors="coerce", format="%m/%d/%y")
charges_10["date_closed"] = pd.to_datetime(charges_10["date_closed"], errors="coerce", format="%m/%d/%y")
charges_10["court_filing_date"] = pd.to_datetime(charges_10["court_filing_date"], errors="coerce", format="%m/%d/%y")
charges_10["resolution_date"] = pd.to_datetime(charges_10["resolution_date"], errors="coerce", format="%m/%d/%y")
charges_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343863 entries, 0 to 343862
Data columns (total 28 columns):
fiscal_year                     343863 non-null object
charge_num                      343863 non-null object
state                           343801 non-null category
num_employees_code              328035 non-null category
num_employees                   328035 non-null category
naics_code                      187222 non-null category
naics_desc                      185282 non-null category
type_code                       343829 non-null category
type                            343829 non-null category
birth_date                      310000 non-null datetime64[ns]
sex                             343453 non-null category
date_received                   343863 non-null datetime64[ns]
date_closed                     230050 non-null datetime64[ns]
closure_code                    230050 non-null category
closure_action                  230050 non-null category
monetary_benefits   