In [34]:
import pandas as pd

### Data Columns

The data extract did not come with columns, so those are defined below.

In [35]:
columns = [
    'Act-Pk',
    'ActivityNumber',
    'ActivityType',
    'activitySubType',
    'agency',
    'activityDate',
    'Per_Role',
    'DOB',
    'Age',
    'Gender',
    'Race',	
    'highest_violationSection',
    'highest_violationType',
    'highest_chargeCategory',
    'highest_chargeDescription',
    'highest_codeBcsClass',
    'highest_codeBcsCategory',
    'highest_chargeLevel',	
    'macrDisposition',
    'macrIndicationLevel',
    'macrJuvenileLevel',
    'macrStatus1',
    'codeBcsCategory',
    'codeBcsClass',
    'codeUcr',
    'CrimeCategory',
    'CrimeDescription',
    'Loc_Role',
    'city',
    'zipCode',
    'censusBlock',
    'censusTract',
    'BCSCode',
    'MACRCode',
    'BCSDesc',
    'MACRDesc',
    'macrStatus',
    'dispositionCode',
    'bcsClassCode',
    'reportYear',
    'reportMonth',
    'BCAArea',
]

### Merging the two extracts

We had to send multiple data requests to get everything we need. The first new pull we got Race/Ethnicity but didn't get BCAArea, the second pull we got BCAArea but didn't get Race/Ethnicity. Anything with 9_20 at the end was the second pull.

They also came in separate files, so we have to merge them all together.

The cell below uses the one without Race/Ethnicity as the left table, and joins the other as the right table.

The order was the same, so we can just exclude the column that was missing in both.

In [36]:
import pathlib

base_path = pathlib.Path("C:/Users/eli/OneDrive - San Diego Association of Governments/Documents - SANDAG QA_QC/Service Requests/2022/2022-69 Arrest Report QC/data/")

ldf = pd.concat([
    pd.read_csv(base_path / "Arrest Bulletin Data 2016 09_20.txt", delimiter='\t', dtype='string[pyarrow]',header=None, names=columns),
    pd.read_csv(base_path / 'Arrest Bulletin Data 2017 09_20.txt', delimiter='\t', dtype='string[pyarrow]',header=None, names=columns),
    pd.read_csv(base_path / 'Arrest Bulletin Data 2018 09_20.txt', delimiter='\t', dtype='string[pyarrow]',header=None, names=columns),
    pd.read_csv(base_path / 'Arrest Bulletin Data 2019 09_20.txt', delimiter='\t', dtype='string[pyarrow]',header=None, names=columns),
    pd.read_csv(base_path / 'Arrest Bulletin Data 2020 09_20.txt', delimiter='\t', dtype='string[pyarrow]',header=None, names=columns),
    pd.read_csv(base_path / 'Arrest Bulletin Data 2021 09_20.txt', delimiter='\t', dtype='string[pyarrow]',header=None, names=columns),
])
ldf = ldf.drop("Race", axis=1)

In [37]:
rdf = pd.concat([
    pd.read_csv(base_path / 'Arrest Bulletin Data 2016_17.txt', delimiter='\t', dtype='string[pyarrow]',header=None, names=columns),
    pd.read_csv(base_path / 'Arrest Bulletin Data 2018_19.txt', delimiter='\t', dtype='string[pyarrow]',header=None, names=columns),
    pd.read_csv(base_path / 'Arrest Bulletin Data 2020.txt', delimiter='\t', dtype='string[pyarrow]',header=None, names=columns),
    pd.read_csv(base_path / 'Arrest Bulletin Data 2021.txt', delimiter='\t', dtype='string[pyarrow]',header=None, names=columns),
])
rdf = rdf.drop("BCAArea", axis=1)

In [38]:
ldf.columns

Index(['Act-Pk', 'ActivityNumber', 'ActivityType', 'activitySubType', 'agency',
       'activityDate', 'Per_Role', 'DOB', 'Age', 'Gender',
       'highest_violationSection', 'highest_violationType',
       'highest_chargeCategory', 'highest_chargeDescription',
       'highest_codeBcsClass', 'highest_codeBcsCategory',
       'highest_chargeLevel', 'macrDisposition', 'macrIndicationLevel',
       'macrJuvenileLevel', 'macrStatus1', 'codeBcsCategory', 'codeBcsClass',
       'codeUcr', 'CrimeCategory', 'CrimeDescription', 'Loc_Role', 'city',
       'zipCode', 'censusBlock', 'censusTract', 'BCSCode', 'MACRCode',
       'BCSDesc', 'MACRDesc', 'macrStatus', 'dispositionCode', 'bcsClassCode',
       'reportYear', 'reportMonth', 'BCAArea'],
      dtype='object')

In [39]:
ldf.head()

Unnamed: 0,Act-Pk,ActivityNumber,ActivityType,activitySubType,agency,activityDate,Per_Role,DOB,Age,Gender,...,BCSCode,MACRCode,BCSDesc,MACRDesc,macrStatus,dispositionCode,bcsClassCode,reportYear,reportMonth,BCAArea
0,12162665,'16-0008601',ARREST,ADULT,HARBOR POLICE,2016-01-01 00:01:00.000,SUSPECT,1961/12/03,54,FEMALE,...,M02,BATTERY,ASSAULT AND BATTERY,3.0,3.0,397.0,2016.0,1.0,,
1,12162512,'15-0609601',ARREST,ADULT,HARBOR POLICE,2016-01-01 00:01:00.000,SUSPECT,1961/12/03,54,FEMALE,...,M02,BATTERY,ASSAULT AND BATTERY,,,,,,,
2,12125654,'1610003101',ARREST,ADULT,SHERIFF,2016-01-01 00:05:00.000,SUSPECT,1966/06/02,49,MALE,...,F19,PROHIBITED WEAPONS,WEAPONS,3.0,4.0,844.0,2016.0,1.0,,
3,12141169,'1600000101',ARREST,ADULT,ESCONDIDO,2016-01-01 00:05:00.000,SUSPECT,1964/05/29,51,MALE,...,M18,TAMPERING WITH VEHICLE,VANDALISM,3.0,3.0,66.0,2016.0,1.0,,
4,12125886,'1610000001',ARREST,ADULT,SHERIFF,2016-01-01 00:06:00.000,SUSPECT,1958/01/25,57,MALE,...,M14,DRUNK,DRUNK: PROT/CUS,3.0,3.0,46.0,2016.0,1.0,CITY OF LEMON GROVE,


In [40]:
rdf.columns

Index(['Act-Pk', 'ActivityNumber', 'ActivityType', 'activitySubType', 'agency',
       'activityDate', 'Per_Role', 'DOB', 'Age', 'Gender', 'Race',
       'highest_violationSection', 'highest_violationType',
       'highest_chargeCategory', 'highest_chargeDescription',
       'highest_codeBcsClass', 'highest_codeBcsCategory',
       'highest_chargeLevel', 'macrDisposition', 'macrIndicationLevel',
       'macrJuvenileLevel', 'macrStatus1', 'codeBcsCategory', 'codeBcsClass',
       'codeUcr', 'CrimeCategory', 'CrimeDescription', 'Loc_Role', 'city',
       'zipCode', 'censusBlock', 'censusTract', 'BCSCode', 'MACRCode',
       'BCSDesc', 'MACRDesc', 'macrStatus', 'dispositionCode', 'bcsClassCode',
       'reportYear', 'reportMonth'],
      dtype='object')

In [41]:
rdf.head()

Unnamed: 0,Act-Pk,ActivityNumber,ActivityType,activitySubType,agency,activityDate,Per_Role,DOB,Age,Gender,...,censusTract,BCSCode,MACRCode,BCSDesc,MACRDesc,macrStatus,dispositionCode,bcsClassCode,reportYear,reportMonth
0,12162665,'16-0008601',ARREST,ADULT,HARBOR POLICE,2016-01-01 00:01:00.000,SUSPECT,1961/12/03,54,FEMALE,...,7100.0,397,M02,BATTERY,ASSAULT AND BATTERY,3.0,3.0,397.0,2016.0,1.0
1,12162512,'15-0609601',ARREST,ADULT,HARBOR POLICE,2016-01-01 00:01:00.000,SUSPECT,1961/12/03,54,FEMALE,...,7100.0,397,M02,BATTERY,ASSAULT AND BATTERY,,,,,
2,12125654,'1610003101',ARREST,ADULT,SHERIFF,2016-01-01 00:05:00.000,SUSPECT,1966/06/02,49,MALE,...,,844,F19,PROHIBITED WEAPONS,WEAPONS,3.0,4.0,844.0,2016.0,1.0
3,12141169,'1600000101',ARREST,ADULT,ESCONDIDO,2016-01-01 00:05:00.000,SUSPECT,1964/05/29,51,MALE,...,20308.0,66,M18,TAMPERING WITH VEHICLE,VANDALISM,3.0,3.0,66.0,2016.0,1.0
4,12125886,'1610000001',ARREST,ADULT,SHERIFF,2016-01-01 00:06:00.000,SUSPECT,1958/01/25,57,MALE,...,14400.0,46,M14,DRUNK,DRUNK: PROT/CUS,3.0,3.0,46.0,2016.0,1.0


So it's not a perfect merge, but as shown below only one record would really be affected.

We could manually pull that record and manually make sure we record the first with Hispanic and the second with other.

In [42]:
# Count all occurances of this compound key, and only show anything > 1
ldf.groupby(['Act-Pk', 'ActivityNumber', 'activityDate', 'Gender', 'DOB'])['Act-Pk'].size().sort_values(ascending=False)[lambda x: x>1]

Act-Pk    ActivityNumber  activityDate             Gender     DOB       
19666800  '19-1105'       2019-05-17 00:00:00.000  NONBINARY  1972/08/29    2
19069112  '16-0978'       2016-05-15 00:00:00.000  MALE       1990/06/29    2
20315656  '16-2868'       2016-12-31 15:28:00.000  MALE       1996/07/30    2
Name: Act-Pk, dtype: int64

In [43]:
print(
    'Races of 19069112 duplicate:',
    rdf.groupby(['Act-Pk', 'ActivityNumber', 'activityDate', 'Gender', 'DOB'])['Race'].first().loc[('19069112', "'16-0978'", '2016-05-15 00:00:00.000', 'MALE', '1990/06/29')],
    rdf.groupby(['Act-Pk', 'ActivityNumber', 'activityDate', 'Gender', 'DOB'])['Race'].last().loc[('19069112', "'16-0978'", '2016-05-15 00:00:00.000', 'MALE', '1990/06/29')],
)

print(
    'Races of 19666800 duplicate:',
    rdf.groupby(['Act-Pk', 'ActivityNumber', 'activityDate', 'Gender', 'DOB'])['Race'].first().loc[('19666800', "'19-1105'", '2019-05-17 00:00:00.000', 'NONBINARY', '1972/08/29')],
    rdf.groupby(['Act-Pk', 'ActivityNumber', 'activityDate', 'Gender', 'DOB'])['Race'].last().loc[('19666800', "'19-1105'", '2019-05-17 00:00:00.000', 'NONBINARY', '1972/08/29')],
)

print(
    'Races of 20315656 duplicate:',
    rdf.groupby(['Act-Pk', 'ActivityNumber', 'activityDate', 'Gender', 'DOB'])['Race'].first().loc[('20315656', "'16-2868'", '2016-12-31 15:28:00.000', 'MALE', '1996/07/30')],
    rdf.groupby(['Act-Pk', 'ActivityNumber', 'activityDate', 'Gender', 'DOB'])['Race'].last().loc[('20315656', "'16-2868'", '2016-12-31 15:28:00.000', 'MALE', '1996/07/30')],
)



Races of 19069112 duplicate: HISPANIC OTHER
Races of 19666800 duplicate: OTHER OTHER
Races of 20315656 duplicate: WHITE WHITE


Cell below actually performs the merge.

In [44]:
# Drop the 3 duplicated keys on the right table.
# So we only have a single one for the merge.
rdf = rdf[~rdf.duplicated(['Act-Pk', 'ActivityNumber', 'activityDate', 'Gender', 'DOB'])]

In [45]:
# Merge the right dataframe with Race/Ethnicity onto the left dataframe without it, using the key we discussed above.
ldf['Race'] = ldf.merge(
    rdf,
    on=['Act-Pk', 'ActivityNumber', 'activityDate', 'Gender', 'DOB'],
)['Race']

In [46]:
# The number of individual arrests reported for each year.
ldf['reportYear'].value_counts()

08    41057
10    41015
01    40660
03    40431
09    40293
07    39624
02    39161
05    38414
04    37921
06    37777
11    36499
12    35249
Name: reportYear, dtype: Int64

In [47]:
# This was tried earlier to fix an issue, but may be unecessary.
# I *think* MS SQL reads "NULL" as NULL on import, so I manually coded anything that was pandas <NA> into the string "NULL"

from tqdm.auto import tqdm
tqdm.pandas()

for column in tqdm([
        'highest_codeBcsClass',
        'highest_codeBcsCategory',
        'BCSDesc',
        'macrStatus',
        'highest_chargeLevel',
        'censusBlock',
        'censusTract',
        'MACRCode',
        'MACRDesc',
        'macrStatus',
        'Gender',
        'codeUcr',
        'CrimeCategory',
        'CrimeDescription',
        'highest_chargeCategory',
        'BCAArea',
        'Race',
    ]):
    ldf[column] = ldf[column].progress_apply(lambda x: 'NULL' if pd.isna(x) else x)


100%|██████████| 1314449/1314449 [00:01<00:00, 779639.69it/s]
100%|██████████| 1314449/1314449 [00:01<00:00, 825932.77it/s]
100%|██████████| 1314449/1314449 [00:01<00:00, 789447.23it/s]
100%|██████████| 1314449/1314449 [00:01<00:00, 790760.37it/s]
100%|██████████| 1314449/1314449 [00:01<00:00, 880786.95it/s]
100%|██████████| 1314449/1314449 [00:01<00:00, 868946.21it/s]
100%|██████████| 1314449/1314449 [00:01<00:00, 778378.59it/s]
100%|██████████| 1314449/1314449 [00:01<00:00, 696109.63it/s]
100%|██████████| 1314449/1314449 [00:01<00:00, 707675.00it/s]
100%|██████████| 1314449/1314449 [00:02<00:00, 636566.79it/s]
100%|██████████| 1314449/1314449 [00:02<00:00, 621839.52it/s]
100%|██████████| 1314449/1314449 [00:01<00:00, 815108.44it/s]
100%|██████████| 1314449/1314449 [00:01<00:00, 773772.93it/s]
100%|██████████| 1314449/1314449 [00:01<00:00, 850259.31it/s]
100%|██████████| 1314449/1314449 [00:01<00:00, 709846.93it/s] 
100%|██████████| 1314449/1314449 [00:01<00:00, 925093.41it/s]
100%|██

In [48]:
ldf['highest_codeBcsClass'] = ldf['highest_codeBcsClass'].str.strip("'")

In [49]:
ldf['MACRCode'] = ldf['MACRCode'].str.strip()

In [50]:
ldf['Race'] = ldf['Race'].str.strip()

In [51]:
local = pathlib.Path("./data/")
ldf.to_csv(local / 'CJ_extract_2021.csv', header=None, index=None, sep='\t')
ldf.to_csv(local / 'CJ_extract_2021_header.csv', index=None, sep=',')

In [52]:
ldf.iloc[5-1, 0:16-1]

Act-Pk                                                                12125886
ActivityNumber                                                    '1610000001'
ActivityType                                                            ARREST
activitySubType                                                          ADULT
agency                                                                 SHERIFF
activityDate                                           2016-01-01 00:06:00.000
Per_Role                                                               SUSPECT
DOB                                                                 1958/01/25
Age                                                                         57
Gender                                                                    MALE
highest_violationSection                                                    PC
highest_violationType                                       OTHER PART 2 CRIME
highest_chargeCategory       DRUNK IN PUBLIC: ALCOHO

In [53]:
dfc = pd.read_csv(local / 'CJ_extract_2021_header.csv', keep_default_na=False)

In [54]:
dfc

Unnamed: 0,Act-Pk,ActivityNumber,ActivityType,activitySubType,agency,activityDate,Per_Role,DOB,Age,Gender,...,MACRCode,BCSDesc,MACRDesc,macrStatus,dispositionCode,bcsClassCode,reportYear,reportMonth,BCAArea,Race
0,12162665,'16-0008601',ARREST,ADULT,HARBOR POLICE,2016-01-01 00:01:00.000,SUSPECT,1961/12/03,54,FEMALE,...,BATTERY,ASSAULT AND BATTERY,3,3,397,2016,01,,,WHITE
1,12162512,'15-0609601',ARREST,ADULT,HARBOR POLICE,2016-01-01 00:01:00.000,SUSPECT,1961/12/03,54,FEMALE,...,BATTERY,ASSAULT AND BATTERY,,,,,,,,WHITE
2,12125654,'1610003101',ARREST,ADULT,SHERIFF,2016-01-01 00:05:00.000,SUSPECT,1966/06/02,49,MALE,...,PROHIBITED WEAPONS,WEAPONS,3,4,844,2016,01,,,BLACK
3,12141169,'1600000101',ARREST,ADULT,ESCONDIDO,2016-01-01 00:05:00.000,SUSPECT,1964/05/29,51,MALE,...,TAMPERING WITH VEHICLE,VANDALISM,3,3,066,2016,01,,,WHITE
4,12125886,'1610000001',ARREST,ADULT,SHERIFF,2016-01-01 00:06:00.000,SUSPECT,1958/01/25,57,MALE,...,DRUNK,DRUNK: PROT/CUS,3,3,046,2016,01,CITY OF LEMON GROVE,,HISPANIC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1314444,20826658,'2200003901',ARREST,ADULT,SAN DIEGO,2021-12-31 23:50:00.000,SUSPECT,1998/06/12,23,MALE,...,DRUNK,DRUNK: PROT/CUS,3,1,046,2021,12,,,OTHER
1314445,20831732,'2101749101',ARREST,ADULT,OCEANSIDE,2021-12-31 23:50:00.000,SUSPECT,1974/07/21,47,MALE,...,VISITING-SCHED I-II,OTHER DRUGS,1,3,818,2021,12,,,WHITE
1314446,20837514,'22-0000201',ARREST,ADULT,HARBOR POLICE,2021-12-31 23:54:00.000,SUSPECT,1993/09/06,28,MALE,...,,,,,,,,,,HISPANIC
1314447,20825713,'2115682501',ARREST,ADULT,SHERIFF,2021-12-31 23:57:00.000,SUSPECT,1995/11/20,26,MALE,...,THREATEN/ASSAULT AN OFFICER,ASSAULT,2,4,345,2021,12,,,HISPANIC


In [55]:
dfc[['highest_codeBcsClass', 'highest_codeBcsCategory', 'BCSDesc']]

Unnamed: 0,highest_codeBcsClass,highest_codeBcsCategory,BCSDesc
0,4,MISDEMEANOR,ASSAULT AND BATTERY
1,4,MISDEMEANOR,ASSAULT AND BATTERY
2,D,FELONY,WEAPONS
3,M,MISDEMEANOR,VANDALISM
4,A,MISDEMEANOR,DRUNK: PROT/CUS
...,...,...,...
1314444,A,MISDEMEANOR,DRUNK: PROT/CUS
1314445,N,MISDEMEANOR,OTHER DRUGS
1314446,A,OTHER,
1314447,4,FELONY,ASSAULT


In [56]:
rdf.iloc[:, 10]

0              WHITE
1              WHITE
2              BLACK
3              WHITE
4           HISPANIC
             ...    
183008         WHITE
183009         WHITE
183010         WHITE
183011         WHITE
183012    VIETNAMESE
Name: Race, Length: 1648557, dtype: string

In [57]:
groups = list(dfc[
    [
        'codeUcr',
        'CrimeCategory',
        'CrimeDescription',
        'highest_chargeCategory',

    ]
].groupby([
        'codeUcr',
        'CrimeCategory',
        'CrimeDescription',
        'highest_chargeCategory',
]).groups)


In [58]:
dfc['agency'].unique()

array(['HARBOR POLICE', 'SHERIFF', 'ESCONDIDO', 'SAN DIEGO',
       'CHULA VISTA', 'OCEANSIDE', 'CARLSBAD', 'NATIONAL CITY', 'LA MESA',
       'EL CAJON', 'SYCUAN TRIBAL POLICE DEPARTMENT', 'CORONADO',
       'SD STATE UNIV.', 'SW COMM COL PD', 'MTS POLICE', 'SD COMM COL PD',
       'DISTRICT ATTORNEY'], dtype=object)

In [59]:
groups = list(dfc[
    [
        'highest_codeBcsClass',
        'highest_codeBcsCategory',
        'BCSDesc',
    ]
].groupby([
        'highest_codeBcsClass',
        'highest_codeBcsCategory',
        'BCSDesc',
]).groups)


In [60]:
groups

[('1', 'FELONY', 'MANSLAUGHTER NONVEH'),
 ('1', 'FELONY', 'WILLFUL HOMICIDE'),
 ('1', 'OTHER', 'WILLFUL HOMICIDE'),
 ('2', 'FELONY', 'FORCIBLE RAPE'),
 ('2H', 'FELONY', 'OTHER SEX'),
 ('2N', 'FELONY', 'ANNOY CHILD'),
 ('2N', 'FELONY', 'FORCIBLE RAPE'),
 ('2N', 'FELONY', 'LEWD CONDUCT'),
 ('2N', 'FELONY', 'LEWD OR LASCIVIOUS'),
 ('2N', 'FELONY', 'OTHER SEX'),
 ('3', 'FELONY', 'KIDNAPPING'),
 ('3', 'FELONY', 'OTHER FELONY'),
 ('3', 'FELONY', 'ROBBERY'),
 ('4', 'FELONY', 'ASSAULT'),
 ('4', 'FELONY', 'ASSAULT AND BATTERY'),
 ('4', 'FELONY', 'ESCAPE'),
 ('4', 'FELONY', 'LEWD OR LASCIVIOUS'),
 ('4', 'FELONY', 'MANSLAUGHTER NONVEH'),
 ('4', 'FELONY', 'OTHER FELONY'),
 ('4', 'FELONY', 'OTHER SEX'),
 ('4', 'MISDEMEANOR', 'ASSAULT AND BATTERY'),
 ('4', 'MISDEMEANOR', 'OTHER MISDEMEANOR'),
 ('4', 'MISDEMEANOR', 'PETTY THEFT'),
 ('4', 'OTHER', 'NULL'),
 ('5', 'FELONY', 'BURGLARY'),
 ('5', 'MISDEMEANOR', 'PETTY THEFT'),
 ('6', 'FELONY', 'THEFT'),
 ('6', 'MISDEMEANOR', 'JOY RIDING'),
 ('6', 'MISDEME

In [61]:
groups = list(dfc[
    [
    'MACRCode',
    'MACRDesc',
    'macrStatus1',
    ]
].groupby([
    'MACRCode',
    'MACRDesc',
    'macrStatus1',
]).groups)
groups

[('ABANDONMENT', '2', 'C'),
 ('ABANDONMENT', '3', 'C'),
 ('ABDUCTION FOR DEFILEMENT', '2', 'S'),
 ('ABORTION - SOLICITING WOMEN', '2', 'C'),
 ('ACCESSORY', '1', 'A'),
 ('ACCESSORY', '2', 'A'),
 ('ACCESSORY', '3', 'A'),
 ('ACCESSORY', 'NULL', 'A'),
 ('ADULT INFL MINOR SCHED I-II', '2', 'N'),
 ('ADULT USE MINOR SCHED III-IV-V', '2', 'N'),
 ('ADW ON PEACE OFFICER', '2', '4'),
 ('ADW ON PEACE OFFICER', '3', '4'),
 ('ADW ON PEACE OFFICER', 'NULL', '4'),
 ('ADW ON PEACE OFFICER W/FIREARM', '2', '4'),
 ('ADW ON PEACE OFFICER W/FIREARM', '3', '4'),
 ('ADW ON PEACE OFFICER W/FIREARM', 'NULL', '4'),
 ('ALTERING MARKS ON GUN', '2', 'D'),
 ('ALTERING MARKS ON GUN', '3', 'D'),
 ('ANNOY CHILD', '2', 'S'),
 ('ANNOY CHILD', '3', 'S'),
 ('ANNOY/MOLEST CHILD', '1', 'S'),
 ('ANNOY/MOLEST CHILD', '2', '2N'),
 ('ANNOY/MOLEST CHILD', '2', 'C'),
 ('ANNOY/MOLEST CHILD', '2', 'S'),
 ('ANNOY/MOLEST CHILD', '3', 'C'),
 ('ANNOY/MOLEST CHILD', '3', 'S'),
 ('ANNOY/MOLEST CHILD', 'NULL', 'S'),
 ('ANNOY/MOLEST CHILD 

In [62]:
pd.Series([x for x in ''.join([x for x in dfc['highest_codeBcsClass']])]).value_counts()

A    739535
Z    221238
N    128071
F     90814
4     64328
L     45934
U     22967
6     19891
D     10300
C      7447
5      7360
M      6639
7      6348
3      5428
S      4106
Y       986
8       658
2       600
E       433
1       417
G        41
I        25
H         5
dtype: int64

In [63]:
max([len(x) for x in dfc.iloc[:, 14]])

4

In [64]:
dfc.sample(100).to_csv(local / 'sample_2.csv', index=None)

In [65]:
dfc['Race'].value_counts(dropna=False)

WHITE               555038
HISPANIC            409845
BLACK               160245
OTHER               104135
OTHER ASIAN          29916
HAWAIIAN             28930
FILIPINO             10329
INDIAN                4075
VIETNAMESE            3009
CHINESE               1906
PACIFIC ISLANDER      1573
SAMOAN                1300
LAOTIAN                826
KOREAN                 712
ASIAN INDIAN           632
JAPANESE               585
CAMBODIAN              531
MIDDLE EASTERN         483
GUAMANIAN              371
EAST AFRICAN             8
Name: Race, dtype: int64

In [66]:
sum(dfc['Race'].isna())

0

In [67]:
a = {
    'ASIAN INDIAN',
    'BLACK',
    'CAMBODIAN',
    'CHINESE',
    'EAST AFRICAN',
    'FILIPINO',
    'GUAMANIAN',
    'HAWAIIAN',
    'HISPANIC',
    'INDIAN',
    'JAPANESE',
    'KOREAN',
    'LAOTIAN',
    'MIDDLE EASTERN',
    'OTHER',
    'OTHER ASIAN',
    'PACIFIC ISLANDER',
    'SAMOAN',
    'VIETNAMESE',
    'WHITE',
    'Not Applicable',
    'Missing',
}
b = set(dfc['Race'].unique())

b.difference(a)

set()

In [68]:
len(dfc.columns)

42