In [1]:
# Data cleaning for CT Pretrial Detainees
# (Springboard Capstone 1)
# 2018, Misty M. Giles

%matplotlib inline
from collections import Counter
from matplotlib import pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd

In [2]:
file = 'Accused_Pre-Trial_Inmates_in_Correctional_Facilities.csv'
#file = 'detainees_2016_07.csv'

# Read the raw CSV, parsing the two date columns up front so they arrive as
# datetimes rather than plain strings.
df = pd.read_csv(file, parse_dates=['DOWNLOAD DATE', 'LATEST ADMISSION DATE'])

# Replace the raw headers with lower-case, underscore-separated names.
# The assignment is positional: this list must stay in the same order as the
# columns in the CSV, and must have exactly one name per column.
df.columns = ['download_date', 'identifier', 'latest_admission_date', 'race', 
              'gender', 'age', 'bond_amount', 'offense', 'facility', 'detainer']

# Check the first rows to ensure that the data is expected.
print(df.head())

# Check .info() to see if the dtypes are appropriate.
# .info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" after the report.
df.info()

  download_date identifier latest_admission_date      race gender  age  \
0    2016-07-01   ZZEBBEWZ            2016-06-17     WHITE      M   63   
1    2016-07-01   ZZEBBRZC            2016-06-30     BLACK      M   42   
2    2016-07-01   ZZEBCBHC            2016-06-06     WHITE      M   60   
3    2016-07-01   ZZEBCCWL            2016-04-25  HISPANIC      M   44   
4    2016-07-01   ZZEBCHBZ            2016-06-21  HISPANIC      M   58   

   bond_amount                                   offense     facility detainer  
0         5000  ASSAULT, THIRD DEGREE                 AM  CORRIGAN CI     NONE  
1          500  FAILURE TO APPEAR, SECOND DEGREE      AM  HARTFORD CC     NONE  
2        40000  EVADING RESPONSIBILITY                 M  HARTFORD CC     NONE  
3       125000  CRIMINAL POSSESSION OF A PISTOL       DF  HARTFORD CC     NONE  
4         2500  BREACH OF PEACE                       BM  CORRIGAN CI     NONE  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2677123 entries, 0 

In [20]:
# Change race, gender, facility, and detainer to category type and verify.
for column in ('race', 'gender', 'facility', 'detainer'):
    df[column] = df[column].astype('category')

# Two-character endings that mark an offense string as carrying a class code.
# (Presumably misdemeanor/felony classes -- TODO confirm against the data dictionary.)
OFFENSE_CLASS_SUFFIXES = ('AM', 'BM', 'CM', 'DM', 'UM', ' M',
                          'AF', 'BF', 'CF', 'DF', 'UF', ' F')

# Separate the last two characters of the offense to get the offense class, if classed.
# The slice is taken from the END of the string (off[-2:]) so that it always
# agrees with the endswith() test, instead of relying on every offense string
# being padded to one fixed width.  Unclassed offenses get NaN.
# NOTE: the ' M' and ' F' endings keep their leading space in the stored value.
df['offense_class'] = [off[-2:] if off.endswith(OFFENSE_CLASS_SUFFIXES) else np.nan
                       for off in df.offense]
df.offense_class = df.offense_class.astype('category')

# .info() prints its own report and returns None; calling it directly avoids
# the stray "None" that print(df.info()) would add.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2677123 entries, 0 to 2677122
Data columns (total 12 columns):
download_date            datetime64[ns]
identifier               object
latest_admission_date    datetime64[ns]
race                     category
gender                   category
age                      int64
bond_amount              int64
offense                  object
facility                 category
detainer                 category
offense_class            category
days                     timedelta64[ns]
dtypes: category(5), datetime64[ns](2), int64(2), object(2), timedelta64[ns](1)
memory usage: 155.7+ MB
None


In [21]:
# How many distinct values does each categorical column hold?
for column in ('race', 'gender', 'facility', 'detainer', 'offense_class'):
    print(df[column].nunique())

# Eyeball the actual values to confirm they are what we expect.
# race and gender are short enough to print in their native form...
for column in ('race', 'gender'):
    print(df[column].values.unique())

# ...while the longer ones go through .tolist() so the whole list is shown.
for column in ('facility', 'detainer', 'offense_class'):
    print(df[column].values.unique().tolist())

# Non-null count per column.
print(df.count())

5
2
50
9
11
[WHITE, BLACK, HISPANIC, ASIAN, AMER IND]
Categories (5, object): [WHITE, BLACK, HISPANIC, ASIAN, AMER IND]
[M, F]
Categories (2, object): [M, F]
['CORRIGAN CI', 'HARTFORD CC', 'YORK CI', 'NEW HAVEN CC', 'BRIDGEPORT CC', 'GARNER', 'MACDOUGALL', 'WALKER RC', 'NORTHERN CI', 'CHESHIRE CI', 'OSBORN CI', 'MANSON YI', 'MH-WHITING', 'FEDERAL MARSHAL', 'D KIMBALL HSP', 'ROBINSON CI', 'ST MARYS HOSP', 'MH-CVH', 'ENFIELD CI', 'HARTFORD HOSP', 'WILLARD-CYBULSKI CI', 'UCONN HOSP', 'ST VINC HOSP', 'LAWRENCE HOSP', 'WATERBRY HOSP', 'MEDICAL HOSP', 'YALE/N.H.HOSP', 'JOHNSON HOSP', 'MH-YALE/N.H.', 'MH-ST MARY', 'BROOKLYN CI', 'DANBURY HOSP', 'BACKUS HOSP', 'RADGOWSKI', 'ST FRAN HOSP', 'PO-GARIBALDI', 'BRIDGEPORT HOSP', 'PO-WILLIAMS', 'PO-CARRINGTON', '4YV', 'ST RAPH HOSP', 'MH-HTFD HOSP', 'CONSIGN @ NY', 'MH-ST FRANCIS', 'MH-UCONN', 'PO-FERRARO', 'NORWALK HOSP', 'MH-ST RAPHAEL', 'MID-STATE HOSP', 'MER-WALL HOSP']
['NONE', 'SPECIAL PAROLE', 'FEDERAL', 'STATE OF CT', 'OTHER STATE', 'IMMIGRAT

In [22]:
# Elapsed time on each row: download date minus latest admission date.
downloaded_on = df['download_date']
admitted_on = df['latest_admission_date']
df['days'] = downloaded_on - admitted_on

In [23]:
# Largest, smallest, and median bond amount across every row of df.
bond_amounts = df['bond_amount']
for summary_value in (bond_amounts.max(), bond_amounts.min(), bond_amounts.median()):
    print(summary_value)

#print(df[['identifier', 'bond_amount', 'offense', 'detainer']].loc[df.offense.str.endswith('M')].sort_values(by='bond_amount', ascending=False).drop_duplicates().to_string())

9200000
1
90000.0


In [7]:
#df[['identifier', 'bond_amount', 'offense', 'latest_admission_date', 'download_date', 'detainer']].loc[df.identifier == 'ZZSRRWCL']

In [8]:
#df[['identifier', 'bond_amount', 'offense', 'latest_admission_date', 'download_date', 'detainer']].loc[df.identifier == 'ZZSEHRWS']

In [10]:
#color = cm.hsv(np.linspace(.9,.1, 100))
#df.identifier.value_counts()[:80].plot(title='Days per Identifier, whole period',
#                                kind='bar', color=color, rot=80);

In [11]:
#identifiers = df.groupby(['identifier', 'latest_admission_date'])
#identifiers = df.groupby(['identifier', 'latest_admission_date', 'offense', 'offense_class'])
#identifiers.offense.nunique()
#identifiers.bond_amount.nunique()
#identifiers.index.names

In [12]:
# This block was eliminated with the grouped_df construct.

#test = pd.DataFrame(identifiers.download_date.max())
#test['bond_amount'] = identifiers.bond_amount.median() # Won't work for the <$100 placeholders.  Filter?
#test['bond_changes'] = identifiers.bond_amount.nunique()
#test['days'] = identifiers.days.max()
#test['race'] = identifiers.race.nunique() 
#test['gender'] = NEED MODE.  COUNTER()?
#test['age'] = identifiers.age.min()  # Age at arrest, going on trust here.
#test['offense'] = NEED THE OFFENSE AS OF THE DOWNLOAD DATE
#test['offense_class'] = NEED LAST VALUE
#test['facility'] = DO I NEED THIS AT ALL?
#test['detainer'] = NEED LAST VALUE

#'race', 
#              'gender', 'age', 'bond_amount', 'offense', 'facility', 'detainer'


# Need to groupby identifier, latest_admission_date.  Then take the max download_date and fill the columns from that row.
# Example: ZZZWJZEB
# Also need to remove latest_admission_date < 2015(?) and remove those still awaiting trial on 2018-11-11.

#identifiers.index

In [13]:
#test.info()
#test

In [24]:
# For each (identifier, latest_admission_date) pair, find the index label of
# the row with the greatest download_date, then pull exactly those rows from df.
admissions = df.groupby(['identifier', 'latest_admission_date'])
latest_row_labels = admissions['download_date'].idxmax()
grouped_df = df.loc[latest_row_labels]

In [25]:
# Report dtypes and non-null counts for the one-row-per-admission frame,
# then order it by identifier and show the result.
grouped_df.info()
grouped_df = grouped_df.sort_values('identifier')
grouped_df

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40147 entries, 2517248 to 730145
Data columns (total 12 columns):
download_date            40147 non-null datetime64[ns]
identifier               40147 non-null object
latest_admission_date    40147 non-null datetime64[ns]
race                     40147 non-null category
gender                   40147 non-null category
age                      40147 non-null int64
bond_amount              40147 non-null int64
offense                  40147 non-null object
facility                 40147 non-null category
detainer                 40147 non-null category
offense_class            28808 non-null category
days                     40147 non-null timedelta64[ns]
dtypes: category(5), datetime64[ns](2), int64(2), object(2), timedelta64[ns](1)
memory usage: 2.6+ MB


Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days
2517248,2018-09-27,ZZEBBBCS,2018-08-21,WHITE,M,47,900000,"FAILURE TO APPEAR, FIRST DEGREE DF",NORTHERN CI,NONE,DF,37 days
248828,2016-10-21,ZZEBBBJW,2016-08-24,WHITE,M,44,15000,INJURY OR RISK OF INJURY TO MINOR F,NEW HAVEN CC,NONE,F,58 days
1090690,2017-07-13,ZZEBBBJW,2017-07-06,WHITE,M,44,100000,CRIM VIOL OF PROTECTIVE ORDER AM,HARTFORD CC,NONE,AM,7 days
2676743,2018-11-11,ZZEBBBJW,2018-09-27,WHITE,M,46,50000,VIOLATION OF PROBATION OR COND DISCHG,HARTFORD CC,NONE,,45 days
215673,2016-10-11,ZZEBBBSZ,2016-10-07,BLACK,F,57,2500,THREATENING AM,YORK CI,NONE,AM,4 days
1296934,2017-09-13,ZZEBBBZJ,2017-09-12,BLACK,M,43,20000,THREATENING AM,BRIDGEPORT CC,NONE,AM,1 days
2032338,2018-05-09,ZZEBBBZJ,2018-04-24,BLACK,M,44,150000,THREATENING AM,BRIDGEPORT CC,NONE,AM,15 days
655646,2017-03-01,ZZEBBBZL,2017-02-14,BLACK,M,44,1000,PROHIB ACTS RE: DRUG PARAPHERNALIA,BRIDGEPORT CC,NONE,,15 days
2109469,2018-06-01,ZZEBBCJC,2018-05-21,BLACK,M,44,5000,"BURGLARY, THIRD DEGREE DF",BRIDGEPORT CC,NONE,DF,11 days
2675707,2018-11-11,ZZEBBCRW,2018-10-10,WHITE,M,55,20000,"CRIMINAL TRESPASS, FIRST DEGREE AM",BRIDGEPORT CC,NONE,AM,32 days


In [36]:
# Keep only the admissions whose offense carries a class code.
has_offense_class = grouped_df['offense_class'].notnull()
classed = grouped_df.loc[has_offense_class]

classed.info()
classed.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28808 entries, 2517248 to 730145
Data columns (total 12 columns):
download_date            28808 non-null datetime64[ns]
identifier               28808 non-null object
latest_admission_date    28808 non-null datetime64[ns]
race                     28808 non-null category
gender                   28808 non-null category
age                      28808 non-null int64
bond_amount              28808 non-null int64
offense                  28808 non-null object
facility                 28808 non-null category
detainer                 28808 non-null category
offense_class            28808 non-null category
days                     28808 non-null timedelta64[ns]
dtypes: category(5), datetime64[ns](2), int64(2), object(2), timedelta64[ns](1)
memory usage: 1.9+ MB


Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days
2517248,2018-09-27,ZZEBBBCS,2018-08-21,WHITE,M,47,900000,"FAILURE TO APPEAR, FIRST DEGREE DF",NORTHERN CI,NONE,DF,37 days
248828,2016-10-21,ZZEBBBJW,2016-08-24,WHITE,M,44,15000,INJURY OR RISK OF INJURY TO MINOR F,NEW HAVEN CC,NONE,F,58 days
1090690,2017-07-13,ZZEBBBJW,2017-07-06,WHITE,M,44,100000,CRIM VIOL OF PROTECTIVE ORDER AM,HARTFORD CC,NONE,AM,7 days
215673,2016-10-11,ZZEBBBSZ,2016-10-07,BLACK,F,57,2500,THREATENING AM,YORK CI,NONE,AM,4 days
1296934,2017-09-13,ZZEBBBZJ,2017-09-12,BLACK,M,43,20000,THREATENING AM,BRIDGEPORT CC,NONE,AM,1 days


In [37]:
# Keep only the admissions whose offense has no class code.
lacks_offense_class = grouped_df['offense_class'].isnull()
not_classed = grouped_df.loc[lacks_offense_class]

not_classed.info()
not_classed.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11339 entries, 2676743 to 1858043
Data columns (total 12 columns):
download_date            11339 non-null datetime64[ns]
identifier               11339 non-null object
latest_admission_date    11339 non-null datetime64[ns]
race                     11339 non-null category
gender                   11339 non-null category
age                      11339 non-null int64
bond_amount              11339 non-null int64
offense                  11339 non-null object
facility                 11339 non-null category
detainer                 11339 non-null category
offense_class            0 non-null category
days                     11339 non-null timedelta64[ns]
dtypes: category(5), datetime64[ns](2), int64(2), object(2), timedelta64[ns](1)
memory usage: 768.0+ KB


Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days
2676743,2018-11-11,ZZEBBBJW,2018-09-27,WHITE,M,46,50000,VIOLATION OF PROBATION OR COND DISCHG,HARTFORD CC,NONE,,45 days
655646,2017-03-01,ZZEBBBZL,2017-02-14,BLACK,M,44,1000,PROHIB ACTS RE: DRUG PARAPHERNALIA,BRIDGEPORT CC,NONE,,15 days
2075254,2018-05-22,ZZEBBEWB,2018-03-16,BLACK,M,48,25000,VIOLATION OF PROBATION OR COND DISCHG,BRIDGEPORT CC,NONE,,67 days
2441831,2018-09-06,ZZEBBEWB,2018-08-16,BLACK,M,48,3050,PROHIB ACTS RE: DRUG PARAPHERNALIA,BRIDGEPORT CC,NONE,,21 days
1081131,2017-07-11,ZZEBBHJE,2017-07-10,BLACK,M,50,900,VIOLATION OF PROBATION OR COND DISCHG,BRIDGEPORT CC,NONE,,1 days


In [38]:
# Bond statistics for the classed offenses, including how many bonds fall
# below 100.
classed_bonds = classed['bond_amount']
low_bond_count = classed_bonds.loc[classed_bonds < 100].count()

print(classed_bonds.max(), 'max bond')
print(classed_bonds.min(), 'min bond')
print(low_bond_count, 'out of', len(classed), 'have bond < 100')
print(classed_bonds.median(), 'median bond')

7000000 max bond
1 min bond
93 out of 28808 have bond < 100
25400.0 median bond
