In [1]:
# Data cleaning for CT Pretrial Detainees
# (Springboard Capstone 1)
# 10 November 2018, Misty M. Giles

import pandas as pd

# Read data in and update the columns to lower case and replace spaces with _s.

df = pd.read_csv('Accused_Pre-Trial_Inmates_in_Correctional_Facilities.csv', 
                 parse_dates=['DOWNLOAD DATE', 'LATEST ADMISSION DATE'])
df.columns = ['download_date', 'identifier', 'latest_admission_date', 'race', 
              'gender', 'age', 'bond_amount', 'offense', 'facility', 'detainer']

# Check .info() to see if the dtypes are appropriate.
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2677123 entries, 0 to 2677122
Data columns (total 10 columns):
download_date            datetime64[ns]
identifier               object
latest_admission_date    datetime64[ns]
race                     object
gender                   object
age                      int64
bond_amount              int64
offense                  object
facility                 object
detainer                 object
dtypes: datetime64[ns](2), int64(2), object(6)
memory usage: 204.2+ MB
None


In [2]:
# Check the first rows to ensure that the data is expected.
print(df.head())

  download_date identifier latest_admission_date      race gender  age  \
0    2016-07-01   ZZEBBEWZ            2016-06-17     WHITE      M   63   
1    2016-07-01   ZZEBBRZC            2016-06-30     BLACK      M   42   
2    2016-07-01   ZZEBCBHC            2016-06-06     WHITE      M   60   
3    2016-07-01   ZZEBCCWL            2016-04-25  HISPANIC      M   44   
4    2016-07-01   ZZEBCHBZ            2016-06-21  HISPANIC      M   58   

   bond_amount                                   offense     facility detainer  
0         5000  ASSAULT, THIRD DEGREE                 AM  CORRIGAN CI     NONE  
1          500  FAILURE TO APPEAR, SECOND DEGREE      AM  HARTFORD CC     NONE  
2        40000  EVADING RESPONSIBILITY                 M  HARTFORD CC     NONE  
3       125000  CRIMINAL POSSESSION OF A PISTOL       DF  HARTFORD CC     NONE  
4         2500  BREACH OF PEACE                       BM  CORRIGAN CI     NONE  


In [3]:
# Remove the commas from the bond_amount column and change to int.
df['bond_amount'] = df['bond_amount'].replace(',', '', regex=True).astype(int)

In [4]:
# Change race, gender, facility, and detainer to category type and verify.
df.race = df.race.astype('category')
df.gender = df.gender.astype('category')
df.facility = df.facility.astype('category')
df.detainer = df.detainer.astype('category')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2677123 entries, 0 to 2677122
Data columns (total 10 columns):
download_date            datetime64[ns]
identifier               object
latest_admission_date    datetime64[ns]
race                     category
gender                   category
age                      int64
bond_amount              int32
offense                  object
facility                 category
detainer                 category
dtypes: category(4), datetime64[ns](2), int32(1), int64(1), object(2)
memory usage: 122.6+ MB
None


In [5]:
# Check the numbers of the categories.
print(df.race.nunique())
print(df.gender.nunique())
print(df.facility.nunique())
print(df.detainer.nunique())

5
2
50
9


In [6]:
# Check the expected values of the categories.
# .tolist() used to show entire list as needed.
print(df.race.values.unique())
print(df.gender.values.unique())
print(df.facility.values.unique().tolist())
print(df.detainer.values.unique().tolist())

[WHITE, BLACK, HISPANIC, ASIAN, AMER IND]
Categories (5, object): [WHITE, BLACK, HISPANIC, ASIAN, AMER IND]
[M, F]
Categories (2, object): [M, F]
['CORRIGAN CI', 'HARTFORD CC', 'YORK CI', 'NEW HAVEN CC', 'BRIDGEPORT CC', 'GARNER', 'MACDOUGALL', 'WALKER RC', 'NORTHERN CI', 'CHESHIRE CI', 'OSBORN CI', 'MANSON YI', 'MH-WHITING', 'FEDERAL MARSHAL', 'D KIMBALL HSP', 'ROBINSON CI', 'ST MARYS HOSP', 'MH-CVH', 'ENFIELD CI', 'HARTFORD HOSP', 'WILLARD-CYBULSKI CI', 'UCONN HOSP', 'ST VINC HOSP', 'LAWRENCE HOSP', 'WATERBRY HOSP', 'MEDICAL HOSP', 'YALE/N.H.HOSP', 'JOHNSON HOSP', 'MH-YALE/N.H.', 'MH-ST MARY', 'BROOKLYN CI', 'DANBURY HOSP', 'BACKUS HOSP', 'RADGOWSKI', 'ST FRAN HOSP', 'PO-GARIBALDI', 'BRIDGEPORT HOSP', 'PO-WILLIAMS', 'PO-CARRINGTON', '4YV', 'ST RAPH HOSP', 'MH-HTFD HOSP', 'CONSIGN @ NY', 'MH-ST FRANCIS', 'MH-UCONN', 'PO-FERRARO', 'NORWALK HOSP', 'MH-ST RAPHAEL', 'MID-STATE HOSP', 'MER-WALL HOSP']
['NONE', 'SPECIAL PAROLE', 'FEDERAL', 'STATE OF CT', 'OTHER STATE', 'IMMIGRATION', 'DO NO

In [7]:
# Next 5 blocks:
# Split data into 6-month blocks to accomodate GitHub's limitations.
# (First block includes about two weeks before the 6 months, 
#  15 June 2016 through 31 December 2016.  last block contains
#  about 6 weeks less than 6 months, 1 July 2018 through 11 November 2018.)

detainees_2016_b = df[df.download_date < '2017-01-01']
detainees_2016_b.columns.values
print(detainees_2016_b.info())
detainees_2016_b.to_csv('detainees_2016_b.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458767 entries, 0 to 458767
Data columns (total 10 columns):
download_date            458767 non-null datetime64[ns]
identifier               458767 non-null object
latest_admission_date    458767 non-null datetime64[ns]
race                     458767 non-null category
gender                   458767 non-null category
age                      458767 non-null int64
bond_amount              458767 non-null int32
offense                  458767 non-null object
facility                 458767 non-null category
detainer                 458767 non-null category
dtypes: category(4), datetime64[ns](2), int32(1), int64(1), object(2)
memory usage: 24.5+ MB
None


In [8]:
detainees_2017_a = df.loc[(df['download_date'] >= '2017-01-01') & (df['download_date'] < '2017-07-01')]
detainees_2017_a.columns.values
print(detainees_2017_a.info())
detainees_2017_a.to_csv('detainees_2017_a.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 589433 entries, 458429 to 1048199
Data columns (total 10 columns):
download_date            589433 non-null datetime64[ns]
identifier               589433 non-null object
latest_admission_date    589433 non-null datetime64[ns]
race                     589433 non-null category
gender                   589433 non-null category
age                      589433 non-null int64
bond_amount              589433 non-null int32
offense                  589433 non-null object
facility                 589433 non-null category
detainer                 589433 non-null category
dtypes: category(4), datetime64[ns](2), int32(1), int64(1), object(2)
memory usage: 31.5+ MB
None


In [9]:
detainees_2017_b = df.loc[(df['download_date'] >= '2017-07-01') & (df['download_date'] < '2018-01-01')]
detainees_2017_b.columns.values
print(detainees_2017_b.info())
detainees_2017_b.to_csv('detainees_2017_b.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 567229 entries, 1048200 to 1615428
Data columns (total 10 columns):
download_date            567229 non-null datetime64[ns]
identifier               567229 non-null object
latest_admission_date    567229 non-null datetime64[ns]
race                     567229 non-null category
gender                   567229 non-null category
age                      567229 non-null int64
bond_amount              567229 non-null int32
offense                  567229 non-null object
facility                 567229 non-null category
detainer                 567229 non-null category
dtypes: category(4), datetime64[ns](2), int32(1), int64(1), object(2)
memory usage: 30.3+ MB
None


In [10]:
detainees_2018_a = df.loc[(df['download_date'] >= '2018-01-01') & (df['download_date'] < '2018-07-01')]
detainees_2018_a.columns.values
print(detainees_2018_a.info())
detainees_2018_a.to_csv('detainees_2018_a.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 586973 entries, 1615429 to 2202401
Data columns (total 10 columns):
download_date            586973 non-null datetime64[ns]
identifier               586973 non-null object
latest_admission_date    586973 non-null datetime64[ns]
race                     586973 non-null category
gender                   586973 non-null category
age                      586973 non-null int64
bond_amount              586973 non-null int32
offense                  586973 non-null object
facility                 586973 non-null category
detainer                 586973 non-null category
dtypes: category(4), datetime64[ns](2), int32(1), int64(1), object(2)
memory usage: 31.4+ MB
None


In [11]:
detainees_2018_b = df.loc[(df['download_date'] >= '2018-07-01') & (df['download_date'] < '2019-01-01')]
detainees_2018_b.columns.values
print(detainees_2018_b.info())
detainees_2018_b.to_csv('detainees_2018_b.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 474721 entries, 2202402 to 2677122
Data columns (total 10 columns):
download_date            474721 non-null datetime64[ns]
identifier               474721 non-null object
latest_admission_date    474721 non-null datetime64[ns]
race                     474721 non-null category
gender                   474721 non-null category
age                      474721 non-null int64
bond_amount              474721 non-null int32
offense                  474721 non-null object
facility                 474721 non-null category
detainer                 474721 non-null category
dtypes: category(4), datetime64[ns](2), int32(1), int64(1), object(2)
memory usage: 25.4+ MB
None


In [12]:
mis_2016_b_2017_a = detainees_2016_b.offense.loc[detainees_2016_b.offense.str.endswith('M')].count() + detainees_2017_a.offense.loc[detainees_2017_a.offense.str.endswith('M')].count()
fel_2016_b_2017_a = detainees_2016_b.offense.loc[detainees_2016_b.offense.str.endswith('F')].count() + detainees_2017_a.offense.loc[detainees_2017_a.offense.str.endswith('F')].count()
# unk_2016_b_2017_a = detainees_2016_b.offense.loc[detainees_2016_b.offense.str.endswith(('M','F'))].count()

print('Misdemeanors (2016_b and 2017_a) = ' + str(mis_2016_b_2017_a))
print('Felonies (2016_b and 2017_a) = ' + str(fel_2016_b_2017_a))
# print(str(unk_2016_b_2017_a))

Misdemeanors (2016_b and 2017_a) = 134382
Felonies (2016_b and 2017_a) = 674382


In [None]:
print(1048200 - (59537 + 74845) - (294689 + 379693))

In [None]:
print(pd.Series(detainees_2016_b.identifier).nunique())
print(pd.Series(detainees_2017_a.identifier).nunique())

In [35]:
print(detainees_2016_b.offense.loc[detainees_2016_b.offense.str.endswith('M')])
print(detainees_2016_b.offense.loc[detainees_2016_b.offense.str.endswith(('AM','BM','CM','DM','UM',' M'))].count())

0         ASSAULT, THIRD DEGREE                 AM
1         FAILURE TO APPEAR, SECOND DEGREE      AM
2         EVADING RESPONSIBILITY                 M
4         BREACH OF PEACE                       BM
28        RECKLESS ENDANGERMENT, FIRST DEGREE   AM
30        THREATENING                           AM
38        THREATENING                           AM
39        ASSAULT, THIRD DEGREE                 AM
48        THREATENING                           AM
60        CRIM VIOL RES ORDER                   AM
69        CRIMINAL TRESPASS, THIRD DEGREE       CM
74        INTERFERING WITH AN OFFICER           AM
85        DISORDERLY CONDUCT                    CM
86        LARCENY, SIXTH DEGREE                 CM
94        THREATENING                           AM
95        CRIMINAL IMPERSONATION                BM
99        ASSAULT, THIRD DEGREE                 AM
106       ASSAULT, THIRD DEGREE                 AM
111       ASSAULT 3RD DEGREE, VICTIM OVER 59    AM
112       INTERFERING WITH AN O

In [23]:
list(set(detainees_2016_b.offense.loc[detainees_2016_b.offense.str.endswith('M')]) - set(detainees_2016_b.offense.loc[detainees_2016_b.offense.str.endswith(('AM','BM','CM','UM',' M'))]))

['VOYEURISM', 'APPLICATION TO PURCHASE A FIREARM']

In [26]:
print(detainees_2016_b.identifier.loc[detainees_2016_b.offense.str.endswith('RM')])

24875     ZZHWHJCW
28253     ZZHWHJCW
31634     ZZHWHJCW
35019     ZZHWHJCW
38394     ZZHWHJCW
41768     ZZHWHJCW
45126     ZZHWHJCW
48470     ZZHWHJCW
51796     ZZHWHJCW
55108     ZZHWHJCW
58421     ZZHWHJCW
61729     ZZHWHJCW
65042     ZZHWHJCW
68363     ZZHWHJCW
71676     ZZHWHJCW
74986     ZZHWHJCW
78294     ZZHWHJCW
81602     ZZHWHJCW
84920     ZZHWHJCW
88228     ZZHWHJCW
91540     ZZHWHJCW
94849     ZZHWHJCW
98136     ZZHWHJCW
101430    ZZHWHJCW
104723    ZZHWHJCW
108015    ZZHWHJCW
111329    ZZHWHJCW
114631    ZZHWHJCW
117939    ZZHWHJCW
121255    ZZHWHJCW
            ...   
360859    ZZHWHJCW
364052    ZZHWHJCW
369653    ZZHWHJCW
370704    ZZHWHJCW
373620    ZZHWHJCW
378442    ZZHWHJCW
380432    ZZHWHJCW
383746    ZZHWHJCW
388637    ZZHWHJCW
390088    ZZHWHJCW
395116    ZZHWHJCW
397581    ZZHWHJCW
400967    ZZHWHJCW
406693    ZZHWHJCW
407141    ZZHWHJCW
412888    ZZHWHJCW
414630    ZZHWHJCW
418225    ZZHWHJCW
421336    ZZHWHJCW
423299    ZZHWHJCW
427409    ZZHWHJCW
430755    ZZ

In [28]:
print(detainees_2016_b.offense.loc[detainees_2016_b.offense.str.endswith('F')])
print(detainees_2016_b.offense.loc[detainees_2016_b.offense.str.endswith(('AF','BF','CF','DF','UF',' F'))].count())

3         CRIMINAL POSSESSION OF A PISTOL       DF
6         CARRYING WEAPONS WITHOUT A PERMIT      F
7         STRANGULATION 2ND DEGREE              DF
9         ASSAULT, FIRST DEGREE                 BF
10        SALE OF CONTROLLED SUBSTANCE           F
11        ARSON, FIRST DEGREE                   AF
12        CRIM VIOL OF PROTECTIVE ORDER         DF
13        EMPLOYING MINOR IN OBSCENE PERFORMNCE AF
14        ASSAULT, SECOND DEGREE                DF
16        ROBBERY, FIRST DEGREE                 BF
17        SEXUAL ASSAULT, FIRST DEGREE           F
18        SALE OF NARC/AMPHET BY NON-DEPENDENT   F
19        ROBBERY, FIRST DEGREE                 BF
20        ROBBERY, THIRD DEGREE                 DF
22        SALE OF NARC/AMPHET BY NON-DEPENDENT   F
24        SALE OF NARC/AMPHET BY NON-DEPENDENT   F
25        BURGLARY, THIRD DEGREE                DF
26        CRIM VIOL OF PROTECTIVE ORDER         DF
27        ORGANIZED RETAIL THEFT                 F
29        ASSAULT ON POLICE OR 

In [None]:
# Do a filter for the next box.  Regex != 'F' or 'M'

In [34]:
print(detainees_2016_b.offense.loc[detainees_2016_b.offense.str.endswith(('A','B','C','D','E','G','H','I','J','K','L','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',' '))])
print(detainees_2016_b.offense.loc[detainees_2016_b.offense.str.endswith(('A','B','C','D','E','G','H','I','J','K','L','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',' '))].count())

5            VIOLATION OF PROBATION OR COND DISCHG
8             SALE OF HEROIN, COC BY NON-DEPENDENT
15           VIOLATION OF PROBATION OR COND DISCHG
21                         POSSESSION OF NARCOTICS
23           VIOLATION OF PROBATION OR COND DISCHG
31                                CRIMINAL ATTEMPT
37           VIOLATION OF PROBATION OR COND DISCHG
40           VIOLATION OF PROBATION OR COND DISCHG
42           VIOLATION OF PROBATION OR COND DISCHG
52           VIOLATION OF PROBATION OR COND DISCHG
55             OPERATE UNDER INFLU OF LIQ OR DRUGS
58           VIOLATION OF PROBATION OR COND DISCHG
62           VIOLATION OF PROBATION OR COND DISCHG
64           VIOLATION OF PROBATION OR COND DISCHG
65                         POSSESSION OF NARCOTICS
70           VIOLATION OF PROBATION OR COND DISCHG
71           VIOLATION OF PROBATION OR COND DISCHG
72                                CRIMINAL ATTEMPT
82                         POSSESSION OF NARCOTICS
83           VIOLATION OF PROBA

In [37]:
print(list(detainees_2016_b.offense.loc[detainees_2016_b.offense.str.contains('CAPITAL')]))

['CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                         F', 'CAPITAL FELONY                

In [50]:
print((detainees_2016_b[['identifier', 'offense', 'latest_admission_date']].groupby(['identifier', 'offense']).count()))

                                                     latest_admission_date
identifier offense                                                        
ZZEBBBJW   INJURY OR RISK OF INJURY TO MINOR      F                     26
ZZEBBBSZ   THREATENING                           AM                      3
ZZEBBEWZ   ASSAULT, THIRD DEGREE                 AM                     13
ZZEBBHER   CRIMINAL TRESPASS, FIRST DEGREE       AM                     34
ZZEBBHWJ   CRIM VIOL OF PROTECTIVE ORDER         DF                     99
ZZEBBHWR   CRIM VIOL OF PROTECTIVE ORDER         DF                     24
ZZEBBRZC   FAILURE TO APPEAR, SECOND DEGREE      AM                      1
ZZEBCBHC   EVADING RESPONSIBILITY                 M                      5
ZZEBCCWL   CRIMINAL POSSESSION OF A PISTOL       DF                    126
ZZEBCHBZ   BREACH OF PEACE                       BM                      1
ZZEBCHHE   VIOLATION OF PROBATION OR COND DISCHG                        20
ZZEBCJZJ   FORGERY, FIRST

In [53]:
detainees_2016_07 = df.loc[(df['download_date'] >= '2016-07-01') & (df['download_date'] < '2016-08-01')]
detainees_2016_07.columns.values
print(detainees_2016_07.info())
detainees_2016_07.to_csv('detainees_2016_07.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103403 entries, 0 to 103403
Data columns (total 10 columns):
download_date            103403 non-null datetime64[ns]
identifier               103403 non-null object
latest_admission_date    103403 non-null datetime64[ns]
race                     103403 non-null category
gender                   103403 non-null category
age                      103403 non-null int64
bond_amount              103403 non-null int32
offense                  103403 non-null object
facility                 103403 non-null category
detainer                 103403 non-null category
dtypes: category(4), datetime64[ns](2), int32(1), int64(1), object(2)
memory usage: 5.5+ MB
None


In [54]:
detainees_2018_07 = df.loc[(df['download_date'] >= '2018-07-01') & (df['download_date'] < '2018-08-01')]
detainees_2018_07.columns.values
print(detainees_2018_07.info())
detainees_2018_07.to_csv('detainees_2018_07.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108431 entries, 2202402 to 2310832
Data columns (total 10 columns):
download_date            108431 non-null datetime64[ns]
identifier               108431 non-null object
latest_admission_date    108431 non-null datetime64[ns]
race                     108431 non-null category
gender                   108431 non-null category
age                      108431 non-null int64
bond_amount              108431 non-null int32
offense                  108431 non-null object
facility                 108431 non-null category
detainer                 108431 non-null category
dtypes: category(4), datetime64[ns](2), int32(1), int64(1), object(2)
memory usage: 5.8+ MB
None
