In [1]:
# Data cleaning for CT Pretrial Detainees
# (Springboard Capstone 1)
# 2018, Misty M. Giles

import numpy as np
import pandas as pd

In [2]:
# Accused...csv is the full 260MB file downloaded on 12 November 2018.  Replace 
# with a smaller file if you want to re-run this notebook in a reasonable
# time.
file = 'Accused_Pre-Trial_Inmates_in_Correctional_Facilities.csv'
#file = 'detainees_2016_07.csv'

# Read data in.  Update the columns to lower case and replace spaces with _s.
df = pd.read_csv(file, parse_dates=['DOWNLOAD DATE', 'LATEST ADMISSION DATE'])
df.columns = ['download_date', 'identifier', 'latest_admission_date', 'race', 
              'gender', 'age', 'bond_amount', 'offense', 'facility', 'detainer']

# Check the first rows to ensure that the data is expected.
df.head()

# Check .info() to see if the dtypes are appropriate.
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2677123 entries, 0 to 2677122
Data columns (total 10 columns):
download_date            datetime64[ns]
identifier               object
latest_admission_date    datetime64[ns]
race                     object
gender                   object
age                      int64
bond_amount              int64
offense                  object
facility                 object
detainer                 object
dtypes: datetime64[ns](2), int64(2), object(6)
memory usage: 204.2+ MB
None


In [3]:
# They weren't.  Change race, gender, facility, and detainer to category type and verify.
# Categorical variables chosen to speed up the processing slightly.
df.race = df.race.astype('category')
df.gender = df.gender.astype('category')
df.facility = df.facility.astype('category')
df.detainer = df.detainer.astype('category')

# Two more variables are needed: one to tell us felony or misdemeanor and the classification
# (if available) and one to tell us the number of days detained per arrest.  Setup is below.

# Separate the last two letters of the offense to get the offense class, if classed.
# If offense isn't classed, use np.nan to easily separate them out later.
df['offense_class'] = [offense[38:] if offense.endswith(('AM','BM','CM','DM','UM',' M', 
                       'AF','BF','CF','DF','UF',' F')) else np.nan for offense in df.offense]
# Set offense_class to categorical.
df.offense_class = df.offense_class.astype('category')
# Do the time math for the number of days each person spent per arrest.
df['days'] = df.download_date - df.latest_admission_date

# Verify 12 columns (original 10 + 2 above) and variable types.
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2677123 entries, 0 to 2677122
Data columns (total 12 columns):
download_date            datetime64[ns]
identifier               object
latest_admission_date    datetime64[ns]
race                     category
gender                   category
age                      int64
bond_amount              int64
offense                  object
facility                 category
detainer                 category
offense_class            category
days                     timedelta64[ns]
dtypes: category(5), datetime64[ns](2), int64(2), object(2), timedelta64[ns](1)
memory usage: 155.7+ MB
None


In [4]:
# Check the numbers of the categories against the number of expected values. 
print(df.race.nunique())
print(df.gender.nunique())
print(df.facility.nunique())
print(df.detainer.nunique())
print(df.offense_class.nunique())

# Check the expected values of the categories.  .tolist() used to show entire list as needed.
print(df.race.values.unique())
print(df.gender.values.unique())
print(df.facility.values.unique().tolist())
print(df.detainer.values.unique().tolist())
print(df.offense_class.values.unique().tolist())

# Check for any empty values.  Offense_class should contain some NaNs; other columns should not.
print(df.count())

5
2
50
9
11
[WHITE, BLACK, HISPANIC, ASIAN, AMER IND]
Categories (5, object): [WHITE, BLACK, HISPANIC, ASIAN, AMER IND]
[M, F]
Categories (2, object): [M, F]
['CORRIGAN CI', 'HARTFORD CC', 'YORK CI', 'NEW HAVEN CC', 'BRIDGEPORT CC', 'GARNER', 'MACDOUGALL', 'WALKER RC', 'NORTHERN CI', 'CHESHIRE CI', 'OSBORN CI', 'MANSON YI', 'MH-WHITING', 'FEDERAL MARSHAL', 'D KIMBALL HSP', 'ROBINSON CI', 'ST MARYS HOSP', 'MH-CVH', 'ENFIELD CI', 'HARTFORD HOSP', 'WILLARD-CYBULSKI CI', 'UCONN HOSP', 'ST VINC HOSP', 'LAWRENCE HOSP', 'WATERBRY HOSP', 'MEDICAL HOSP', 'YALE/N.H.HOSP', 'JOHNSON HOSP', 'MH-YALE/N.H.', 'MH-ST MARY', 'BROOKLYN CI', 'DANBURY HOSP', 'BACKUS HOSP', 'RADGOWSKI', 'ST FRAN HOSP', 'PO-GARIBALDI', 'BRIDGEPORT HOSP', 'PO-WILLIAMS', 'PO-CARRINGTON', '4YV', 'ST RAPH HOSP', 'MH-HTFD HOSP', 'CONSIGN @ NY', 'MH-ST FRANCIS', 'MH-UCONN', 'PO-FERRARO', 'NORWALK HOSP', 'MH-ST RAPHAEL', 'MID-STATE HOSP', 'MER-WALL HOSP']
['NONE', 'SPECIAL PAROLE', 'FEDERAL', 'STATE OF CT', 'OTHER STATE', 'IMMIGRAT

In [5]:
# Check the bond amounts.  
print(df.bond_amount.max())
print(df.bond_amount.min())
print(df.bond_amount.median())

#print(df[['identifier', 'bond_amount', 'offense', 'detainer']].loc[df.offense.str.endswith('M')].sort_values(by='bond_amount', ascending=False).drop_duplicates().to_string())

9200000
1
90000.0


In [6]:
#df[['identifier', 'bond_amount', 'offense', 'latest_admission_date', 'download_date', 'detainer']].loc[df.identifier == 'ZZSRRWCL']

In [7]:
#df[['identifier', 'bond_amount', 'offense', 'latest_admission_date', 'download_date', 'detainer']].loc[df.identifier == 'ZZSEHRWS']

In [8]:
# This block was eliminated with the grouped_df construct.  Still here for notes.  Remove for archive.

#test = pd.DataFrame(identifiers.download_date.max())
#test['bond_amount'] = identifiers.bond_amount.median() # Won't work for the <$100 placeholders.  Filter?
#test['bond_changes'] = identifiers.bond_amount.nunique()
#test['days'] = identifiers.days.max()
#test['race'] = identifiers.race.nunique() 
#test['gender'] = NEED MODE.  COUNTER()?
#test['age'] = identifiers.age.min()  # Age at arrest, going on trust here.
#test['offense'] = NEED THE OFFENSE AS OF THE DOWNLOAD DATE
#test['offense_class'] = NEED LAST VALUE
#test['facility'] = DO I NEED THIS AT ALL?
#test['detainer'] = NEED LAST VALUE


# Need to groupby identifier, latest_admission_date.  Then take the max download_date and fill the columns from that row.
# Example: ZZZWJZEB
# Also need to remove latest_admission_date < 2015(?) and remove those still awaiting trial on 2018-11-11.

#identifiers.index

In [9]:
# For each admission per identifier, get the last row of the data.  This 
# row (essentially, max(download_date)) contains the information from the day that 
# the detainee receives a verdict, is bailed out, or had charges dropped.  
# This row was chosen purposely as the least likely to have errors.  Bond amounts can 
# change during negotiations or as charges are changed or dropped.  This is a simplification
# to help meet deadlines.
grouped_df = df.loc[df.groupby(['identifier', 'latest_admission_date'])['download_date'].idxmax()]

In [10]:
# Verify the number of detainees (repeat offenders count per arrest) and that
# the datatypes/counts are still as expected.  Offense_class is only value that should
# contain NaNs.
grouped_df.info()
grouped_df = grouped_df.sort_values(by='identifier')
grouped_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40147 entries, 2517248 to 730145
Data columns (total 12 columns):
download_date            40147 non-null datetime64[ns]
identifier               40147 non-null object
latest_admission_date    40147 non-null datetime64[ns]
race                     40147 non-null category
gender                   40147 non-null category
age                      40147 non-null int64
bond_amount              40147 non-null int64
offense                  40147 non-null object
facility                 40147 non-null category
detainer                 40147 non-null category
offense_class            28808 non-null category
days                     40147 non-null timedelta64[ns]
dtypes: category(5), datetime64[ns](2), int64(2), object(2), timedelta64[ns](1)
memory usage: 2.6+ MB


Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days
2517248,2018-09-27,ZZEBBBCS,2018-08-21,WHITE,M,47,900000,"FAILURE TO APPEAR, FIRST DEGREE DF",NORTHERN CI,NONE,DF,37 days
248828,2016-10-21,ZZEBBBJW,2016-08-24,WHITE,M,44,15000,INJURY OR RISK OF INJURY TO MINOR F,NEW HAVEN CC,NONE,F,58 days
1090690,2017-07-13,ZZEBBBJW,2017-07-06,WHITE,M,44,100000,CRIM VIOL OF PROTECTIVE ORDER AM,HARTFORD CC,NONE,AM,7 days
2676743,2018-11-11,ZZEBBBJW,2018-09-27,WHITE,M,46,50000,VIOLATION OF PROBATION OR COND DISCHG,HARTFORD CC,NONE,,45 days
215673,2016-10-11,ZZEBBBSZ,2016-10-07,BLACK,F,57,2500,THREATENING AM,YORK CI,NONE,AM,4 days


In [11]:
# Separate out the arrests where felony/misdemeanor is stated.  Again, print
# out the info.  All values should be equal.  No NaNs expected here.
classed = grouped_df.loc[grouped_df['offense_class'].notnull()]
classed.info()
classed.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28808 entries, 2517248 to 730145
Data columns (total 12 columns):
download_date            28808 non-null datetime64[ns]
identifier               28808 non-null object
latest_admission_date    28808 non-null datetime64[ns]
race                     28808 non-null category
gender                   28808 non-null category
age                      28808 non-null int64
bond_amount              28808 non-null int64
offense                  28808 non-null object
facility                 28808 non-null category
detainer                 28808 non-null category
offense_class            28808 non-null category
days                     28808 non-null timedelta64[ns]
dtypes: category(5), datetime64[ns](2), int64(2), object(2), timedelta64[ns](1)
memory usage: 1.9+ MB


Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days
2517248,2018-09-27,ZZEBBBCS,2018-08-21,WHITE,M,47,900000,"FAILURE TO APPEAR, FIRST DEGREE DF",NORTHERN CI,NONE,DF,37 days
248828,2016-10-21,ZZEBBBJW,2016-08-24,WHITE,M,44,15000,INJURY OR RISK OF INJURY TO MINOR F,NEW HAVEN CC,NONE,F,58 days
1090690,2017-07-13,ZZEBBBJW,2017-07-06,WHITE,M,44,100000,CRIM VIOL OF PROTECTIVE ORDER AM,HARTFORD CC,NONE,AM,7 days
215673,2016-10-11,ZZEBBBSZ,2016-10-07,BLACK,F,57,2500,THREATENING AM,YORK CI,NONE,AM,4 days
1296934,2017-09-13,ZZEBBBZJ,2017-09-12,BLACK,M,43,20000,THREATENING AM,BRIDGEPORT CC,NONE,AM,1 days


In [12]:
# Separate out the arrests without felony/misdemeanor classifications.
# These are probably not useful for this project, but we'll retain them 
# just in case.
not_classed = grouped_df.loc[grouped_df['offense_class'].isnull()]
not_classed.info()
not_classed.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11339 entries, 2676743 to 1858043
Data columns (total 12 columns):
download_date            11339 non-null datetime64[ns]
identifier               11339 non-null object
latest_admission_date    11339 non-null datetime64[ns]
race                     11339 non-null category
gender                   11339 non-null category
age                      11339 non-null int64
bond_amount              11339 non-null int64
offense                  11339 non-null object
facility                 11339 non-null category
detainer                 11339 non-null category
offense_class            0 non-null category
days                     11339 non-null timedelta64[ns]
dtypes: category(5), datetime64[ns](2), int64(2), object(2), timedelta64[ns](1)
memory usage: 768.0+ KB


Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days
2676743,2018-11-11,ZZEBBBJW,2018-09-27,WHITE,M,46,50000,VIOLATION OF PROBATION OR COND DISCHG,HARTFORD CC,NONE,,45 days
655646,2017-03-01,ZZEBBBZL,2017-02-14,BLACK,M,44,1000,PROHIB ACTS RE: DRUG PARAPHERNALIA,BRIDGEPORT CC,NONE,,15 days
2075254,2018-05-22,ZZEBBEWB,2018-03-16,BLACK,M,48,25000,VIOLATION OF PROBATION OR COND DISCHG,BRIDGEPORT CC,NONE,,67 days
2441831,2018-09-06,ZZEBBEWB,2018-08-16,BLACK,M,48,3050,PROHIB ACTS RE: DRUG PARAPHERNALIA,BRIDGEPORT CC,NONE,,21 days
1081131,2017-07-11,ZZEBBHJE,2017-07-10,BLACK,M,50,900,VIOLATION OF PROBATION OR COND DISCHG,BRIDGEPORT CC,NONE,,1 days


In [13]:
# Just for fun, check over the basic stats and see what we'll be 
# working with later.  CT says that bonds under 100 can be 
# considered placeholders, but I don't want to drop them yet.
print(classed.bond_amount.max(), 'max bond')
print(classed.bond_amount.min(), 'min bond')
print(classed.bond_amount.loc[classed.bond_amount < 100].count(), 'out of', len(classed), 'have bond < 100')
print(classed.bond_amount.median(), 'median bond')

7000000 max bond
1 min bond
93 out of 28808 have bond < 100
25400.0 median bond


In [14]:
# Save files to csv for unit 7, since it's a new notebook.
classed.to_csv('detainees_classed_offenses.csv', index=False)
not_classed.to_csv('detainees_unclassed_offenses.csv', index=False)

In [15]:
not_classed.offense.nunique()
not_classed.offense.unique().tolist()
print(not_classed['offense'].loc[not_classed['offense'] == 'VIOLATION OF PROBATION OR COND DISCHG'].count())

6757


In [16]:
print(len(classed)/classed.identifier.nunique())
print(classed.identifier.nunique())
#print(classed.latest_admission_date.loc[classed.latest_admission_date < '2014-01-01'])

1.2711468031593347
22663


In [17]:
print(len(df)/df.identifier.nunique())
print(df.identifier.nunique())
print(grouped_df.identifier.nunique())
print(not_classed.identifier.nunique())

90.6546679760252
29531
29531
9808


In [22]:
sort_date = df.loc[df.groupby(['identifier', 'latest_admission_date'])['download_date'].agg('count')]
sort_date['counts'] = sort_date.index
sort_date = sort_date.reset_index(drop=True)
sort_date = sort_date.loc[sort_date.latest_admission_date < '2014-01-01']

sort_date.info()
sort_date.loc[sort_date.counts > 300]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3221 entries, 13 to 40135
Data columns (total 13 columns):
download_date            3221 non-null datetime64[ns]
identifier               3221 non-null object
latest_admission_date    3221 non-null datetime64[ns]
race                     3221 non-null category
gender                   3221 non-null category
age                      3221 non-null int64
bond_amount              3221 non-null int64
offense                  3221 non-null object
facility                 3221 non-null category
detainer                 3221 non-null category
offense_class            2517 non-null category
days                     3221 non-null timedelta64[ns]
counts                   3221 non-null int64
dtypes: category(5), datetime64[ns](2), int64(3), object(2), timedelta64[ns](1)
memory usage: 246.2+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days,counts
551,2016-07-01,ZZHELRRH,2010-06-28,BLACK,M,30,5000,INTERFERING WITH AN OFFICER AM,BRIDGEPORT CC,SPECIAL PAROLE,AM,2195 days,624
948,2016-07-01,ZZHHZSCJ,2013-11-13,BLACK,M,30,3500000,MURDER AF,NEW HAVEN CC,NONE,AF,961 days,774
949,2016-07-01,ZZHCZSZZ,2012-03-13,BLACK,M,25,2105000,CONSPIRACY,NORTHERN CI,NONE,,1571 days,583
1004,2016-07-01,ZZHJCCWL,2013-06-25,WHITE,M,37,990000,MURDER AF,GARNER,NONE,AF,1102 days,799
2298,2016-07-01,ZZHCEZEH,2013-12-23,HISPANIC,M,25,231110,FELONY MURDER AF,BRIDGEPORT CC,NONE,AF,921 days,487
2411,2016-07-01,ZZHJCCWL,2013-06-25,WHITE,M,37,990000,MURDER AF,GARNER,NONE,AF,1102 days,799
2504,2016-07-01,ZZHCBHCR,2010-08-17,BLACK,M,21,300000,ALTERING OR REMOVING IDENT NUMBER F,MANSON YI,SPECIAL PAROLE,F,2145 days,446
3769,2016-07-01,ZZHJCCWL,2013-06-25,WHITE,M,37,990000,MURDER AF,GARNER,NONE,AF,1102 days,799
5534,2016-07-01,ZZHCBELC,2011-06-03,WHITE,M,31,250000,"SEXUAL ASSAULT, THIRD DEGREE DF",MACDOUGALL,NONE,DF,1855 days,444
6487,2016-07-01,ZZHCEZEH,2013-12-23,HISPANIC,M,25,231110,FELONY MURDER AF,BRIDGEPORT CC,NONE,AF,921 days,487
