In [38]:
# Analysis for CT Pretrial Detainees
# (Springboard Capstone 1)
# 2019, Misty M. Giles

# Set up plotting  
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

# ???  Am I using this?
from datetime import datetime as date

# Import standard packages
import numpy as np
import pandas as pd
import scipy.stats as stats

# Import misc tools
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Import verification tools
from sklearn.cross_validation import cross_val_score

# Import preprocessing tools
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

# Import models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier


# Select the file created in CT_csv (unit 5 data wrangling assignment)
file = 'detainees_classed_offenses.csv'

In [39]:
# Read in the file
df = pd.read_csv(file, parse_dates=['download_date', 'latest_admission_date'])

# Change from previous notebooks to drop the single case of 'UM' at the 
# beginning -- inclusion prevents use of 'stratify'.)
df['offense_class'] = [offense[-2:] if offense.endswith(('AM','BM','CM','DM',' M', 
                       'AF','BF','CF','DF',' F')) else np.nan for offense in df.offense]

# Days that CT says a detainee has been in the system.  This doesn't account for
# some detainees.  There are 730 days in the dataset, and the state says that entrance 
# dates over a year before could be original entrance date but are definitely not to be
# trusted.  The days column will be capped at 1,095 (730 + 365).  
df.days = [int(time[:-23]) for time in df.days]
df.days = [time if time <= 1095 else 1095 for time in df.days]

# Check that everything worked.  There should be 0 null values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28808 entries, 0 to 28807
Data columns (total 13 columns):
download_date            28808 non-null datetime64[ns]
identifier               28808 non-null object
latest_admission_date    28808 non-null datetime64[ns]
race                     28808 non-null object
gender                   28808 non-null object
age                      28808 non-null int64
bond_amount              28808 non-null int64
offense                  28808 non-null object
facility                 28808 non-null object
detainer                 28808 non-null object
offense_class            28807 non-null object
days                     28808 non-null int64
counts                   28808 non-null int64
dtypes: datetime64[ns](2), int64(4), object(7)
memory usage: 2.9+ MB


In [40]:
# Only working with the 94% under 80k.
#df = df.loc[df.bond_amount<=80000]
#print(df.info())

# Separate out the two datasets, misdemeanor (primary) and felony.
misdemeanor_df = df.loc[df['offense_class'].str.endswith('M',na=False)].reset_index(
    drop=True).rename(index=str, columns={'offense_class_ M':'offense_class_M'})

# Change race, gender, facility, and detainer to category type and verify.
misdemeanor_df.race = misdemeanor_df.race.astype('category')
misdemeanor_df.gender = misdemeanor_df.gender.astype('category')
misdemeanor_df.facility = misdemeanor_df.facility.astype('category')
misdemeanor_df.detainer = misdemeanor_df.detainer.astype('category')
misdemeanor_df.offense_class = misdemeanor_df.offense_class.astype('category')

# Divide the misdemeanor data into pre-enactment and post.
mis_pre_df = misdemeanor_df.loc[misdemeanor_df.download_date < '2017-07-01']
mis_post_df = misdemeanor_df.loc[misdemeanor_df.download_date >= '2017-07-01']
mis_post_df = mis_post_df.loc[mis_post_df.download_date < '2018-07-01']
mis_outofbounds_df = misdemeanor_df.loc[misdemeanor_df.download_date >= '2018-07-01']
mis_df = pd.concat([mis_pre_df, mis_post_df])

# Check that everything worked.  There should be 0 null values.
misdemeanor_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9663 entries, 0 to 9662
Data columns (total 13 columns):
download_date            9663 non-null datetime64[ns]
identifier               9663 non-null object
latest_admission_date    9663 non-null datetime64[ns]
race                     9663 non-null category
gender                   9663 non-null category
age                      9663 non-null int64
bond_amount              9663 non-null int64
offense                  9663 non-null object
facility                 9663 non-null category
detainer                 9663 non-null category
offense_class            9663 non-null category
days                     9663 non-null int64
counts                   9663 non-null int64
dtypes: category(5), datetime64[ns](2), int64(4), object(2)
memory usage: 1.0+ MB


### Gender categorization:
1. Define data and target from the pre-enactment dataframe
2. Split, preprocess, and fit
3. Test the accuracy on pre-enactment test data
4. See what happens with the post-enactment data

In [41]:
# Define data and target
X = mis_pre_df[['bond_amount','days','age']]
y = mis_pre_df.gender.ravel()

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3,stratify=y)

In [42]:
# Define data and target
X = mis_pre_df[['bond_amount','days','age']]
y = mis_pre_df.gender.ravel()

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3,stratify=y)

# Rescale the data
scaler = MinMaxScaler()

# Classifier
clf = LogisticRegression()

pl = Pipeline([
        ('scaler', scaler),
        ('clf', clf)
    ])

pl.fit(X_train,y_train)
print("Accuracy, gender (training): %0.4f" % (pl.score(X_train,y_train)))

# Print the accuracy
scores = cross_val_score(pl, X_test, y_test, cv=5) 
print("Accuracy, gender (test): %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

Accuracy, gender (training): 0.8273
Accuracy, gender (test): 0.8269 (+/- 0.0035)


In [43]:
# Define data and target
X = mis_pre_df[['bond_amount','days','age']]
y = mis_pre_df.gender.ravel()

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3,stratify=y)

# Rescale the data
scaler = StandardScaler()

# Classifier
clf = LogisticRegression()

pl = Pipeline([
        ('scaler', scaler),
        ('clf', clf)
    ])

pl.fit(X_train,y_train)
print("Accuracy, gender (training): %0.4f" % (pl.score(X_train,y_train)))

# Print the accuracy
scores = cross_val_score(pl, X_test, y_test, cv=5) 
print("Accuracy, gender (test): %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

Accuracy, gender (training): 0.8273
Accuracy, gender (test): 0.8269 (+/- 0.0035)


In [44]:
# Define data and target
X = mis_pre_df[['bond_amount','days','age']]
y = mis_pre_df.gender.ravel()

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3,stratify=y)

# Rescale the data
scaler = Normalizer()

# Classifier
clf = LogisticRegression()

pl = Pipeline([
        ('scaler', scaler),
        ('clf', clf)
    ])

pl.fit(X_train,y_train)
print("Accuracy, gender (training): %0.4f" % (pl.score(X_train,y_train)))

# Print the accuracy
scores = cross_val_score(pl, X_test, y_test, cv=5) 
print("Accuracy, gender (test): %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

Accuracy, gender (training): 0.8273
Accuracy, gender (test): 0.8278 (+/- 0.0030)


### Offense class and race q&d below.  The numbers are too similar in the gender for me to figure out if I'm doing something wrong.

In [45]:
# Define data and target
X = mis_pre_df[['bond_amount','days','age']]
y = mis_pre_df.offense_class.ravel()

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3,stratify=y)

# Rescale the data
scaler = MinMaxScaler()

# Classifier
clf = LogisticRegression()

pl = Pipeline([
        ('scaler', scaler),
        ('clf', clf)
    ])

pl.fit(X_train,y_train)
print("Accuracy, offense class (training): %0.4f" % (pl.score(X_train,y_train)))

# Print the accuracy
scores = cross_val_score(pl, X_test, y_test, cv=5) 
print("Accuracy, offense class (test): %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

Accuracy, offense class (training): 0.7438
Accuracy, offense class (test): 0.7438 (+/- 0.0065)


In [46]:
# Define data and target
X = mis_pre_df[['bond_amount','days','age']]
y = mis_pre_df.race.ravel()

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3,stratify=y)

# Rescale the data
scaler = MinMaxScaler()

# Classifier
clf = LogisticRegression()

pl = Pipeline([
        ('scaler', scaler),
        ('clf', clf)
    ])

pl.fit(X_train,y_train)
print("Accuracy, race (training): %0.4f" % (pl.score(X_train,y_train)))

# Print the accuracy
scores = cross_val_score(pl, X_test, y_test, cv=5) 
print("Accuracy, race (test): %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

Accuracy, race (training): 0.4270
Accuracy, race (test): 0.4267 (+/- 0.0072)
