In [1]:
import pandas as pd
pd.set_option("display.max_columns", 50)

## Format 2000-2010 data

Concatenate data from the EEOC.

In [2]:
!sh process_eeoc.sh

Import complainant data for fiscal year 2000 through Feb. 28, 2010.

In [3]:
complainants = pd.read_csv("data/raw/complainants_00_09.txt", sep="\t", skiprows=1,
                           names=["complainant_id", "date_received", "date_closed",
                                  "receiving_office", "accountability_office", "eeoc_or_fepa",
                                  "processing_category", "statute", "basis", "issue", "city",
                                  "county", "state", "naics", "num_employees", "type", "race",
                                  "sex", "nat_origin", "birth_date"],
                           dtype={"receiving_office": "category", "accountability_office": "category",
                                  "eeoc_or_fepa": "category","processing_category": "category", "statute": "category",
                                  "basis": "category", "issue": "category","city": "category", "county": "category",
                                  "state": "category", "naics": "category", "num_employees": "category",
                                  "type": "category", "race": "category", "sex": "category", "nat_origin": "category"})
complainants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 925611 entries, 0 to 925610
Data columns (total 20 columns):
complainant_id           925611 non-null object
date_received            925611 non-null object
date_closed              836105 non-null object
receiving_office         925611 non-null category
accountability_office    925611 non-null category
eeoc_or_fepa             925611 non-null category
processing_category      921323 non-null category
statute                  925530 non-null category
basis                    925274 non-null category
issue                    925573 non-null category
city                     925536 non-null category
county                   913908 non-null category
state                    925356 non-null category
naics                    620933 non-null category
num_employees            900728 non-null category
type                     925463 non-null category
race                     838595 non-null category
sex                      925236 non-null cate

Convert the date columns.

In [4]:
complainants["date_received"] = pd.to_datetime(complainants["date_received"], errors="coerce", format="%m/%d/%y")
complainants["date_closed"] = pd.to_datetime(complainants["date_closed"], errors="coerce", format="%m/%d/%y")
complainants["birth_date"] = pd.to_datetime(complainants["birth_date"], errors="coerce", format="%m/%d/%y")
complainants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 925611 entries, 0 to 925610
Data columns (total 20 columns):
complainant_id           925611 non-null object
date_received            925611 non-null datetime64[ns]
date_closed              836105 non-null datetime64[ns]
receiving_office         925611 non-null category
accountability_office    925611 non-null category
eeoc_or_fepa             925611 non-null category
processing_category      921323 non-null category
statute                  925530 non-null category
basis                    925274 non-null category
issue                    925573 non-null category
city                     925536 non-null category
county                   913908 non-null category
state                    925356 non-null category
naics                    620933 non-null category
num_employees            900728 non-null category
type                     925463 non-null category
race                     838595 non-null category
sex                      9252

Because the complainant data only contains records through the end of February 2010, we will remove records with a date_received after September 30, 2009 in favor of FY 2010 charge data.

In [5]:
complainants = complainants[complainants["date_received"] <= "2009-09-30"]
complainants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 886211 entries, 0 to 925610
Data columns (total 20 columns):
complainant_id           886211 non-null object
date_received            886211 non-null datetime64[ns]
date_closed              828250 non-null datetime64[ns]
receiving_office         886211 non-null category
accountability_office    886211 non-null category
eeoc_or_fepa             886211 non-null category
processing_category      883209 non-null category
statute                  886130 non-null category
basis                    885874 non-null category
issue                    886173 non-null category
city                     886137 non-null category
county                   874893 non-null category
state                    885963 non-null category
naics                    600591 non-null category
num_employees            863215 non-null category
type                     886065 non-null category
race                     805058 non-null category
sex                      8858

## Format 2010-2017 data

Import charge data for fiscal years 2011-2017.

In [6]:
charges = pd.read_csv("data/raw/charges_11_17.txt", sep="\t", skiprows=1,
                      names=["fiscal_year", "charge_num", "state", "num_employees_code",
                             "num_employees", "naics_code", "naics_desc", "type_code",
                             "type", "birth_date", "sex", "date_received", "date_closed",
                             "closure_code", "closure_action", "monetary_benefits", "statute_code",
                             "statute", "basis_code", "basis", "issue_code", "issue",
                             "court_filing_date", "civil_action_num", "court", "resolution_date",
                             "case_type", "litigation_monetary_benefits"])
charges.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3443509 entries, 0 to 3443508
Data columns (total 28 columns):
fiscal_year                     object
charge_num                      object
state                           object
num_employees_code              object
num_employees                   object
naics_code                      float64
naics_desc                      object
type_code                       object
type                            object
birth_date                      object
sex                             object
date_received                   object
date_closed                     object
closure_code                    object
closure_action                  object
monetary_benefits               float64
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
court_filing_date          

Convert the date columns.

In [7]:
charges["birth_date"] = pd.to_datetime(charges["birth_date"], errors="coerce", format="%m/%d/%Y")
charges["date_received"] = pd.to_datetime(charges["date_received"], errors="coerce", format="%m/%d/%Y")
charges["date_closed"] = pd.to_datetime(charges["date_closed"], errors="coerce", format="%m/%d/%Y")
charges["court_filing_date"] = pd.to_datetime(charges["court_filing_date"], errors="coerce", format="%m/%d/%Y")
charges["resolution_date"] = pd.to_datetime(charges["resolution_date"], errors="coerce", format="%m/%d/%Y")
charges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3443509 entries, 0 to 3443508
Data columns (total 28 columns):
fiscal_year                     object
charge_num                      object
state                           object
num_employees_code              object
num_employees                   object
naics_code                      float64
naics_desc                      object
type_code                       object
type                            object
birth_date                      datetime64[ns]
sex                             object
date_received                   datetime64[ns]
date_closed                     datetime64[ns]
closure_code                    object
closure_action                  object
monetary_benefits               float64
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
cou

Import charge data for fiscal year 2010.

In [8]:
charges_10 = pd.read_csv("data/raw/Genre_2010.txt", sep="\t", skiprows=1,
                         names=["charge_num", "state", "num_employees_code",
                                "num_employees", "naics_code", "naics_desc", "type_code",
                                "type", "birth_date", "sex", "date_received", "date_fepa_sent_to_eeoc",
                                "date_closed", "closure_code", "closure_action", "monetary_benefits",
                                "statute_code", "statute", "basis_code", "basis", "issue_code", "issue",
                                "court_filing_date", "civil_action_num", "court", "resolution_date",
                                "litigation_monetary_benefits", "case_type"])
charges_10.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343863 entries, 0 to 343862
Data columns (total 28 columns):
charge_num                      343863 non-null object
state                           343801 non-null object
num_employees_code              328035 non-null object
num_employees                   328035 non-null object
naics_code                      187222 non-null float64
naics_desc                      185282 non-null object
type_code                       343829 non-null object
type                            343829 non-null object
birth_date                      310000 non-null object
sex                             343453 non-null object
date_received                   343863 non-null object
date_fepa_sent_to_eeoc          20188 non-null object
date_closed                     230050 non-null object
closure_code                    230050 non-null object
closure_action                  230050 non-null object
monetary_benefits               37964 non-null object
statute_co

Convert the date columns.

In [9]:
charges_10["birth_date"] = pd.to_datetime(charges_10["birth_date"], errors="coerce", format="%m/%d/%y")
charges_10["date_received"] = pd.to_datetime(charges_10["date_received"], errors="coerce", format="%m/%d/%y")
charges_10["date_closed"] = pd.to_datetime(charges_10["date_closed"], errors="coerce", format="%m/%d/%y")
charges_10["court_filing_date"] = pd.to_datetime(charges_10["court_filing_date"], errors="coerce", format="%m/%d/%y")
charges_10["resolution_date"] = pd.to_datetime(charges_10["resolution_date"], errors="coerce", format="%m/%d/%y")
charges_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343863 entries, 0 to 343862
Data columns (total 28 columns):
charge_num                      343863 non-null object
state                           343801 non-null object
num_employees_code              328035 non-null object
num_employees                   328035 non-null object
naics_code                      187222 non-null float64
naics_desc                      185282 non-null object
type_code                       343829 non-null object
type                            343829 non-null object
birth_date                      310000 non-null datetime64[ns]
sex                             343453 non-null object
date_received                   343863 non-null datetime64[ns]
date_fepa_sent_to_eeoc          20188 non-null object
date_closed                     230050 non-null datetime64[ns]
closure_code                    230050 non-null object
closure_action                  230050 non-null object
monetary_benefits               37964 no

The columns in the 2010 charge data differ from those of the 2011 through 2017 data. We need to delete, add, rename and reorder the columns before concatenating the data.

In [10]:
charges_10.drop("date_fepa_sent_to_eeoc", axis=1, inplace=True)
charges_10["fiscal_year"] = ""
charges_10 = charges_10[["fiscal_year", "charge_num", "state", "num_employees_code", "num_employees", "naics_code", "naics_desc", "type_code",
                         "type", "birth_date", "sex", "date_received", "date_closed", "closure_code", "closure_action", "monetary_benefits",
                         "statute_code", "statute", "basis_code", "basis", "issue_code", "issue", "court_filing_date", "civil_action_num",
                         "court", "resolution_date", "case_type", "litigation_monetary_benefits"]]
charges_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343863 entries, 0 to 343862
Data columns (total 28 columns):
fiscal_year                     343863 non-null object
charge_num                      343863 non-null object
state                           343801 non-null object
num_employees_code              328035 non-null object
num_employees                   328035 non-null object
naics_code                      187222 non-null float64
naics_desc                      185282 non-null object
type_code                       343829 non-null object
type                            343829 non-null object
birth_date                      310000 non-null datetime64[ns]
sex                             343453 non-null object
date_received                   343863 non-null datetime64[ns]
date_closed                     230050 non-null datetime64[ns]
closure_code                    230050 non-null object
closure_action                  230050 non-null object
monetary_benefits               37964 n

Concatenate the two sets of charge data.

In [11]:
charges = pd.concat([charges, charges_10], ignore_index=True)
charges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3787372 entries, 0 to 3787371
Data columns (total 28 columns):
fiscal_year                     object
charge_num                      object
state                           object
num_employees_code              object
num_employees                   object
naics_code                      float64
naics_desc                      object
type_code                       object
type                            object
birth_date                      datetime64[ns]
sex                             object
date_received                   datetime64[ns]
date_closed                     datetime64[ns]
closure_code                    object
closure_action                  object
monetary_benefits               object
statute_code                    object
statute                         object
basis_code                      object
basis                           object
issue_code                      object
issue                           object
cour

Convert the columns to their proper data types.

In [12]:
charges = charges.astype(dtype={"fiscal_year": "category", "state": "category",
                            "num_employees_code": "category", "num_employees": "category",
                            "naics_code": "category", "naics_desc": "category",
                            "type_code": "category", "type": "category", "sex": "category",
                            "closure_code": "category", "closure_action": "category",
                            "statute_code": "category", "statute": "category",
                            "basis_code": "category", "basis": "category", "issue_code": "category",
                            "issue": "category", "court": "category", "case_type": "category"})
charges["monetary_benefits"] = pd.to_numeric(charges["monetary_benefits"], errors="coerce", downcast="float")
charges["litigation_monetary_benefits"] = pd.to_numeric(charges["litigation_monetary_benefits"], errors="coerce", downcast="float")
charges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3787372 entries, 0 to 3787371
Data columns (total 28 columns):
fiscal_year                     category
charge_num                      object
state                           category
num_employees_code              category
num_employees                   category
naics_code                      category
naics_desc                      category
type_code                       category
type                            category
birth_date                      datetime64[ns]
sex                             category
date_received                   datetime64[ns]
date_closed                     datetime64[ns]
closure_code                    category
closure_action                  category
monetary_benefits               float32
statute_code                    category
statute                         category
basis_code                      category
basis                           category
issue_code                      category
issue      