In [1]:
# Inferential Statistics for CT Pretrial Detainees
# (Springboard Capstone 1)
# 2019, Misty M. Giles

# Import everything.  
%matplotlib inline
from datetime import datetime as date
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

# Select the file created in CT_csv (unit 5 data wrangling assignment)
file = 'detainees_classed_offenses.csv'

In [2]:
# Read in the file
df = pd.read_csv(file, parse_dates=['download_date', 'latest_admission_date'])

# This corrects a typo that's being fixed in CT_csv.
df['offense_class'] = [offense[-2:] if offense.endswith(('AM','BM','CM','DM','UM',' M', 
                       'AF','BF','CF','DF','UF',' F')) else np.nan for offense in df.offense]

# Days that CT says a detainee has been in the system.  This doesn't account for
# some detainees.  There are 730 days in the dataset, and the state says that entrance 
# dates over a year before could be original entrance date but are definitely not to be
# trusted.  The days column will be capped at 1,095 (730 + 365).  This is definitely going
#  to need further investigation.
df.days = [int(time[:-23]) for time in df.days]
df.days = [time if time <= 1095 else 1095 for time in df.days]

# Create age ranges of decades.  For some reason, decades can't be used as a column header.
df['tens'] = [int((age // 10) * 10) for age in df.age] 

# Check that everything worked.  In this EDA, there should be 0 null values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28808 entries, 0 to 28807
Data columns (total 14 columns):
download_date            28808 non-null datetime64[ns]
identifier               28808 non-null object
latest_admission_date    28808 non-null datetime64[ns]
race                     28808 non-null object
gender                   28808 non-null object
age                      28808 non-null int64
bond_amount              28808 non-null int64
offense                  28808 non-null object
facility                 28808 non-null object
detainer                 28808 non-null object
offense_class            28808 non-null object
days                     28808 non-null int64
counts                   28808 non-null int64
tens                     28808 non-null int64
dtypes: datetime64[ns](2), int64(5), object(7)
memory usage: 3.1+ MB


In [3]:
# Separate out the two datasets, misdemeanor (primary) and felony.
felony_df = df.loc[df['offense_class'].str.endswith('F')]
misdemeanor_df = df.loc[df['offense_class'].str.endswith('M')]

# Divide the misdemeanor data into pre-enactment and post.
date_pre = pd.date_range(start='7/1/2016', end='6/30/2017', freq='D').tolist()
date_post = pd.date_range(start='7/1/2017', end='6/30/2018', freq='D').tolist()
date_end = pd.date_range(start='7/1/2018', end='11/30/2018', freq='D').tolist()
mis_pre_df = misdemeanor_df.loc[misdemeanor_df.download_date < '2017-07-01']
mis_post_df = misdemeanor_df.loc[misdemeanor_df.download_date >= '2017-07-01']
mis_post_df = mis_post_df.loc[mis_post_df.download_date < '2018-07-01']
mis_outofbounds_df = misdemeanor_df.loc[misdemeanor_df.download_date >= '2018-07-01']
mis_df = pd.concat([mis_pre_df, mis_post_df])

# Bin edges to more easily see age differences.
bin_edges = [10, 20, 30, 40, 50, 60, 70, 80, 90] 

# Bin edges for non-age plots.  Sqrt to avoid 'bin bias.'
bins_sqrt = int(np.sqrt(len(misdemeanor_df.bond_amount))) 

# Set up an ecdf function to quickly see distributions.
def ecdf(data):
    '''Compute continuous distribution function for one column'''
    n = len(data)
    x = np.sort(data)
    y = np.arange(1, n+1) / n
    return x, y

# Set up a function to generate bootstrap replicates
def bs_reps(data, func, size=1):
    '''Draw boostrap replicates'''
    # Initialize empty array
    bs_replicates = np.empty(size)
    # Generate replicates
    for i in range(size):
        bs_replicates[i] = func(np.random.choice(data, size=len(data)))
    return bs_replicates

# Set up a function to permute data sets
def permutation_sample(data1, data2):
    '''Generate a permuted sample from two data sets'''
    # Concatenate the data sets
    data = np.concatenate((data1, data2))
    # Permute the concatenated array
    permuted_data = np.random.permutation(data)
    # Split the permuted data back into two data sets
    perm_sample_1 = permuted_data[:len(data1)]
    perm_sample_2 = permuted_data[len(data1):]
    return perm_sample_1, perm_sample_2

# Set up a function to generate permutation replicates
def draw_perm_reps(data1, data2, func, size=1):
    '''Draw permutation replicates'''
    # Initialize empty array of proper size
    perm_replicates = np.empty(size)
    for i in range(size):
        # Generate permutation sample
        perm_sample_1, perm_sample_2 = permutation_sample(data1, data2)
        # Compute the test statistic
        perm_replicates[i] = func(perm_sample_1, perm_sample_2)
    return perm_replicates

# Set up a function to calculate differences of means
def diff_of_means(data1, data2):
    '''Calculate difference of means of two arrays'''
    diff = np.mean(data1) - np.mean(data2)
    return diff

def num_means(column):
    '''Get the statistics, mean, and CI for a column'''
    # Get statsitics for column
    m = np.mean(column) # mean of sample
    n = len(column) # sample size
    s = np.std(column) # stdev of sample
    sem = s / np.sqrt(n) # sample error
    
    # Get several bootstrap replicates for column
    bs_replicates = bs_reps(column, np.mean, 10000)
    
    # Get confidence interval
    bootstrap_ci = np.percentile(bs_replicates, [2.5, 97.5])
    
    return bs_replicates, bootstrap_ci

def ptest(column1, column2, test_col):
    '''Get p-value for bootstraps'''
    reps1, ci1 = num_means(column1)
    reps2, ci2 = num_means(column2)
    
    #pval_bs = np.sum((reps1 >= np.mean(test_col)) + (reps2 >= np.mean(test_col))) / (len(reps1) + len(reps2))
    pval_bs1 = np.sum((reps1 >= np.mean(test_col))) / len(reps1)
    pval_bs2 = np.sum((reps2 >= np.mean(test_col))) / len(reps2)
    
    # Print the test results and stats
    print('Bootstrap p-value 1 = ', '%.4f' % pval_bs1)
    print('Bootstrap p-value 2 = ', '%.4f' % pval_bs2)
    print('Mean of observed data     = ', np.mean(test_col))
    print('Bootstrap mean of column1 = ', np.mean(reps1))
    print('Bootstrap mean of column2 = ', np.mean(reps2))
    print('Bootstrap CI column1:\t', ci1[0], ',', ci1[1])
    print('Bootstrap CI column2:\t', ci2[0], ',', ci2[1])

# And for reference, the number of bins in non-age plots:
bins_sqrt

# Seed for repeatability
np.random.seed(42)

In [4]:
mis_pre_df.head()

Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days,counts,tens
3,2016-10-11,ZZEBBBSZ,2016-10-07,BLACK,F,57,2500,THREATENING AM,YORK CI,NONE,AM,4,3,50
13,2016-07-13,ZZEBBEWZ,2016-06-17,WHITE,M,63,5000,"ASSAULT, THIRD DEGREE AM",CORRIGAN CI,NONE,AM,26,13,60
14,2017-04-17,ZZEBBHER,2017-01-30,WHITE,M,50,19050,"CRIMINAL TRESPASS, FIRST DEGREE AM",HARTFORD CC,NONE,AM,77,77,50
15,2016-12-06,ZZEBBHER,2016-11-09,WHITE,M,50,500,"CRIMINAL TRESPASS, FIRST DEGREE AM",HARTFORD CC,NONE,AM,27,18,50
16,2017-01-05,ZZEBBHER,2016-12-15,WHITE,M,50,10250,"CRIMINAL TRESPASS, FIRST DEGREE AM",HARTFORD CC,NONE,AM,21,21,50


In [5]:
mis_post_df.head()

Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days,counts,tens
2,2017-07-13,ZZEBBBJW,2017-07-06,WHITE,M,44,100000,CRIM VIOL OF PROTECTIVE ORDER AM,HARTFORD CC,NONE,AM,7,7,40
4,2017-09-13,ZZEBBBZJ,2017-09-12,BLACK,M,43,20000,THREATENING AM,BRIDGEPORT CC,NONE,AM,1,1,40
5,2018-05-09,ZZEBBBZJ,2018-04-24,BLACK,M,44,150000,THREATENING AM,BRIDGEPORT CC,NONE,AM,15,15,40
8,2018-01-02,ZZEBBCRZ,2017-12-29,WHITE,M,47,2500,"ASSAULT, THIRD DEGREE AM",NEW HAVEN CC,NONE,AM,4,4,40
9,2018-03-22,ZZEBBCRZ,2018-03-21,HISPANIC,M,47,2500,CRIM VIOL OF PROTECTIVE ORDER AM,NEW HAVEN CC,NONE,AM,1,1,40


In [6]:
mis_outofbounds_df.head()

Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days,counts,tens
7,2018-11-11,ZZEBBCRW,2018-10-10,WHITE,M,55,20000,"CRIMINAL TRESPASS, FIRST DEGREE AM",BRIDGEPORT CC,NONE,AM,32,32,50
11,2018-07-23,ZZEBBEBS,2018-07-09,HISPANIC,M,46,5000,"FAILURE TO APPEAR, SECOND DEGREE AM",HARTFORD CC,NONE,AM,14,14,40
27,2018-09-24,ZZEBBSBS,2018-08-15,BLACK,M,56,2100,DISORDERLY CONDUCT CM,HARTFORD CC,NONE,CM,40,40,50
28,2018-11-11,ZZEBBSLH,2018-05-14,WHITE,M,56,41000,ILL OP MV WO IGNITION DEVICE CM,CORRIGAN CI,NONE,CM,181,179,50
35,2018-08-16,ZZEBCBSZ,2018-07-18,HISPANIC,M,53,20000,"CRIMINAL TRESPASS, FIRST DEGREE AM",BRIDGEPORT CC,NONE,AM,29,29,50


In [7]:
mis_pre_df.shape, mis_post_df.shape, mis_outofbounds_df.shape, mis_df.shape

((3889, 14), (3821, 14), (1954, 14), (7710, 14))

In [8]:
misdemeanor_df.shape

(9664, 14)

In [9]:
mis_pre_df.describe()

Unnamed: 0,age,bond_amount,days,counts,tens
count,3889.0,3889.0,3889.0,3889.0,3889.0
mean,36.168167,21478.43,59.048084,27.860375,31.766521
std,11.362481,45217.67,154.314135,35.266749,11.58885
min,18.0,1.0,1.0,1.0,10.0
25%,27.0,2500.0,4.0,3.0,20.0
50%,34.0,7500.0,22.0,16.0,30.0
75%,45.0,25000.0,54.0,38.0,40.0
max,75.0,1025000.0,1095.0,287.0,70.0


Okay, now that everything is set up, I'm going to start looking for the variables of interest: gender, race, age, offense_class, days, and bond_amount.  Are any of these numeric variables normal?

In [10]:
chisq, pval = stats.normaltest(mis_pre_df.age)
if pval < .05:
    print('Normal distribution IS NOT a good fit for age.  (p-value = ', pval, ')')
else:
    print('Normal distribution IS a good fit for age.  (p-value = ', pval, ')')
    
chisq, pval = stats.normaltest(mis_pre_df.days)
if pval < .05:
    print('Normal distribution IS NOT a good fit for days.  (p-value = ', pval, ')')
else:
    print('Normal distribution IS a good fit for days.  (p-value = ', pval, ')')
    
chisq, pval = stats.normaltest(mis_pre_df.bond_amount)
if pval < .05:
    print('Normal distribution IS NOT a good fit for bond.  (p-value = ', pval, ')')
else:
    print('Normal distribution IS a good fit for bond.  (p-value = ', pval, ')')

Normal distribution IS NOT a good fit for age.  (p-value =  2.557558769554361e-57 )
Normal distribution IS NOT a good fit for days.  (p-value =  0.0 )
Normal distribution IS NOT a good fit for bond.  (p-value =  0.0 )


##### First set of tests: gender

Below we'll look at the bond and days means and see who is expected to stay longer and pay more.

For both variables:
* H0: male == female
* Ha: male != female 
* alpha = .05

For both bond amounts and days detained, gender is a contributing factor.  H0 must be rejected in favor of Ha.

In [11]:
# Determine the populations
female = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.gender == 'F']
male = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.gender == 'M']

female.describe()

Unnamed: 0,bond_amount,days
count,672.0,672.0
mean,13778.532738,32.77381
std,30611.955119,84.638423
min,1.0,1.0
25%,1500.0,3.0
50%,5000.0,17.0
75%,14000.0,35.0
max,500000.0,1095.0


In [12]:
male.describe()

Unnamed: 0,bond_amount,days
count,3217.0,3217.0
mean,23086.86,64.536525
std,47553.97,164.680397
min,1.0,1.0
25%,2500.0,4.0
50%,10000.0,24.0
75%,25000.0,59.0
max,1025000.0,1095.0


In [13]:
# Set up bond amount dfs
mis1 = mis_pre_df[['bond_amount', 'gender']].copy()
mis1['g'] = mis_pre_df[['gender']].copy()
mis1 = pd.get_dummies(mis1, columns=['g']).groupby('gender').sum()
mis2 = mis_post_df[['bond_amount', 'gender']].copy()
mis2['g'] = mis_post_df[['gender']].copy()
mis2 = pd.get_dummies(mis2, columns=['g']).groupby('gender').sum()

stat, p, dof, expected = stats.chi2_contingency(mis2, mis1)

# interpret test-statistic
prob = 0.95
critical = stats.chi2.ppf(prob, dof)
if abs(stat) >= critical:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
alpha = 1.0 - prob
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
    
# Set up length of stay dfs    
mis1 = mis_pre_df[['days', 'gender']].copy()
mis1['g'] = mis_pre_df[['gender']].copy()
mis1 = pd.get_dummies(mis1, columns=['g']).groupby('gender').sum()
mis2 = mis_post_df[['days', 'gender']].copy()
mis2['g'] = mis_post_df[['gender']].copy()
mis2 = pd.get_dummies(mis2, columns=['g']).groupby('gender').sum()

stat, p, dof, expected = stats.chi2_contingency(mis2, mis1)

# interpret test-statistic
prob = 0.95
critical = stats.chi2.ppf(prob, dof)
if abs(stat) >= critical:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
alpha = 1.0 - prob
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

Dependent (reject H0)
Dependent (reject H0)
Dependent (reject H0)
Dependent (reject H0)


In [14]:
# Is the gender makeup of the system equivalent?

mis1 = mis_pre_df[['bond_amount', 'days', 'gender']].copy()
mis1['g'] = mis_pre_df[['gender']].copy()
mis1 = pd.get_dummies(mis1, columns=['g']).groupby('gender').sum()
mis2 = mis_post_df[['bond_amount', 'days', 'gender']].copy()
mis2['g'] = mis_post_df[['gender']].copy()
mis2 = pd.get_dummies(mis2, columns=['g']).groupby('gender').sum()

year1 = mis1.drop(['bond_amount', 'days'], axis=1)
year2 = mis2.drop(['bond_amount', 'days'], axis=1)

# Is there a difference between gender makeup between the first year and second?  No.  p = .83, fail to reject H0.

female1 = len(mis_pre_df[mis_pre_df.gender=='F'])
male1 = len(mis_pre_df[mis_pre_df.gender=='M'])
total1 = len(mis_pre_df.gender)
female2 = len(mis_post_df[mis_post_df.gender=='F'])
male2 = len(mis_post_df[mis_post_df.gender=='M'])
total2 = len(mis_post_df.gender)
p_hat = (female1+female2)/(total1+total2)
p1 = female1/total1
p2 = female2/total2
tail = 2

observed_diff = p1-p2

z = (observed_diff-0) / np.sqrt(p_hat*(1-p_hat)*((1/total1)+(1/total2)))
z_crit = stats.norm.ppf(.95)
moe = z_crit*np.sqrt((p1/total1)+(p2/total2))
ci_high = observed_diff + moe
ci_low = observed_diff - moe
z_pval = (1-stats.norm.cdf(z))*tail
    
print('The margin of error is', '{:.7f}'.format(moe), '(', '{:.2f}'.format(moe*100), 'percent )')
print('The 95% confidence interval is', '{:.7f}'.format(ci_low), 'to', '{:.7f}'.format(ci_high), '(','{:.2f}'.format(ci_low*100), 'to', '{:.2f}'.format(ci_high*100), 'percent difference )')
print('The p-value is', z_pval, '(', '{:.7f}'.format(z_pval), ')')


The margin of error is 0.0155313 ( 1.55 percent )
The 95% confidence interval is -0.0136339 to 0.0174286 ( -1.36 to 1.74 percent difference )
The p-value is 0.8252464460130775 ( 0.8252464 )


##### Second set of tests: race

Below we'll again look at the bond and days means and see who is expected to stay longer and pay more.

For both variables: 
* H0: white == minority 
* Ha: white != minority
* alpha = .05



In [15]:
# Determine the populations
race_df_pre = mis_pre_df[['bond_amount', 'days', 'race', 'gender']].copy()
race_df_pre['r'] = mis_pre_df[['race']].copy()
race_df_pre = pd.get_dummies(race_df_pre, columns=['r']).groupby('gender').sum()
race_df_post = mis_post_df[['bond_amount', 'days', 'race', 'gender']].copy()
race_df_post['r'] = mis_post_df[['race']].copy()
race_df_post = pd.get_dummies(race_df_post, columns=['r']).groupby('gender').sum()
white = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.race == 'WHITE']
minority = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.race != 'WHITE']
black = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.race == 'BLACK']
hispanic = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.race == 'HISPANIC']
asian = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.race == 'ASIAN']
amer_ind = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.race == 'AMER IND']
white_post = mis_post_df[['bond_amount', 'days']].loc[mis_post_df.race == 'WHITE']
minority_post = mis_post_df[['bond_amount', 'days']].loc[mis_post_df.race != 'WHITE']
black_post = mis_post_df[['bond_amount', 'days']].loc[mis_post_df.race == 'BLACK']
hispanic_post = mis_post_df[['bond_amount', 'days']].loc[mis_post_df.race == 'HISPANIC']
asian_post = mis_post_df[['bond_amount', 'days']].loc[mis_post_df.race == 'ASIAN']
amer_ind_post = mis_post_df[['bond_amount', 'days']].loc[mis_post_df.race == 'AMER IND']

In [16]:
race_df_pre.head()

Unnamed: 0_level_0,bond_amount,days,r_AMER IND,r_ASIAN,r_BLACK,r_HISPANIC,r_WHITE
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,9259174,22024,8.0,8.0,206.0,91.0,359.0
M,74270439,207614,9.0,27.0,1069.0,836.0,1276.0


In [17]:
race_df_post.head()

Unnamed: 0_level_0,bond_amount,days,r_AMER IND,r_ASIAN,r_BLACK,r_HISPANIC,r_WHITE
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,8877875,21070,3.0,5.0,160.0,97.0,388.0
M,78852025,211868,9.0,15.0,1059.0,788.0,1297.0


In [18]:
# Are the races equally represented during the two years?

year1 = race_df_pre.drop(['bond_amount', 'days'], axis=1)
year2 = race_df_post.drop(['bond_amount', 'days'], axis=1)

stat, p, dof, expected = stats.chi2_contingency(year2, year1)

# interpret test-statistic
prob = 0.95
critical = stats.chi2.ppf(prob, dof)
if abs(stat) >= critical:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
alpha = 1.0 - prob
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
    
stat, p, dof, expected, alpha, critical

Dependent (reject H0)
Dependent (reject H0)


(80.31453777483422,
 1.494053888028035e-16,
 4,
 array([[   2.05077205,    3.41795342,  208.32426066,  151.24443863,
          287.96257524],
        [   9.94922795,   16.58204658, 1010.67573934,  733.75556137,
         1397.03742476]]),
 0.050000000000000044,
 9.487729036781154)

In [19]:
# Is there a difference between racial makeup between the first year and second?

# According to this test, there is.  
# Fail to reject a change in the populations for American Indian, Hispanic, and White.
# Reject (there's a change) for the Asian and Black populations.

chi, p = stats.chisquare(year2, year1)
chi, p

(array([ 3.125     ,  6.45833333, 10.36539003,  3.15158526,  2.68822967]),
 array([0.07709987, 0.01104332, 0.001284  , 0.07585324, 0.10109207]))

##### Third set of tests: offense class

Below we'll again look at the bond and days means and see if the class of offense affects bond amount or days detained.

For both variables: 
* H0: offense class makeup is equivalent
* Ha: it's not

In [20]:
# Set up the populations
offense_df_pre = mis_pre_df[['bond_amount', 'days', 'offense_class', 'gender']].copy()
offense_df_pre['class'] = mis_pre_df[['offense_class']].copy()
offense_df_pre = pd.get_dummies(offense_df_pre, columns=['class']).groupby('gender').sum()
offense_df_post = mis_post_df[['bond_amount', 'days', 'offense_class', 'gender']].copy()
offense_df_post['class'] = mis_post_df[['offense_class']].copy()
offense_df_post = pd.get_dummies(offense_df_post, columns=['class']).groupby('gender').sum()
offense_am = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.offense_class == 'AM']
offense_bm = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.offense_class == 'BM']
offense_cm = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.offense_class == 'CM']
offense_dm = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.offense_class == 'DM']
offense_um = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.offense_class == 'UM']
offense_m  = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.offense_class == ' M']

In [21]:
offense_df_pre.head()

Unnamed: 0_level_0,bond_amount,days,class_ M,class_AM,class_BM,class_CM,class_UM
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,9259174,22024,29.0,539.0,47.0,57.0,0.0
M,74270439,207614,201.0,2353.0,264.0,398.0,1.0


In [22]:
offense_df_post.head()

Unnamed: 0_level_0,bond_amount,days,class_ M,class_AM,class_BM,class_CM
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,8877875,21070,26.0,531.0,45.0,51.0
M,78852025,211868,199.0,2493.0,196.0,280.0


In [23]:
# Is there a difference between offense class makeup between the first year and second?

year1 = offense_df_pre.drop(['bond_amount', 'days', 'class_UM'], axis=1)
year2 = offense_df_post.drop(['bond_amount', 'days'], axis=1)

stat, p, dof, expected = stats.chi2_contingency(year2)

# interpret test-statistic
prob = 0.95
critical = stats.chi2.ppf(prob, dof)
if abs(stat) >= critical:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
alpha = 1.0 - prob
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
    
stat, p, dof, expected, alpha, critical

Independent (fail to reject H0)
Independent (fail to reject H0)


(6.421214566098131,
 0.09282193107044827,
 3,
 array([[  38.45197592,  516.7945564 ,   41.18633865,   56.56712902],
        [ 186.54802408, 2507.2054436 ,  199.81366135,  274.43287098]]),
 0.050000000000000044,
 7.814727903251179)

In [24]:
# Is there a difference between offense class makeup between the first year and second?

# According to this test, there is.  
# Reject (there's a change) for the makeup of the offense classes except for the
# unclassed ('M') offenses.  Fail to reject (no change) for 'M'.

chi, p = stats.chisquare(year2, year1)
chi, p

(array([ 0.33024533,  8.44853016, 17.6002579 , 35.61650357]),
 array([5.65514667e-01, 3.65338586e-03, 2.72551383e-05, 2.40243544e-09]))

In [25]:
# Break out the offense classes into race

offense_df_pre = mis_pre_df[['bond_amount', 'days', 'offense_class', 'race']].copy()
offense_df_pre['class'] = mis_pre_df[['offense_class']].copy()
offense_df_pre = pd.get_dummies(offense_df_pre, columns=['class']).groupby('race').sum().drop(['AMER IND', 'ASIAN'], axis=0).drop(['class_UM'], axis=1)
offense_df_post = mis_post_df[['bond_amount', 'days', 'offense_class', 'race']].copy()
offense_df_post['class'] = mis_post_df[['offense_class']].copy()
offense_df_post = pd.get_dummies(offense_df_post, columns=['class']).groupby('race').sum().drop(['AMER IND', 'ASIAN'], axis=0)

In [26]:
offense_df_pre.head()

Unnamed: 0_level_0,bond_amount,days,class_ M,class_AM,class_BM,class_CM
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BLACK,29092615,89690,57.0,998.0,105.0,115.0
HISPANIC,23519499,53810,61.0,693.0,72.0,101.0
WHITE,30040446,83958,110.0,1160.0,129.0,235.0


In [27]:
offense_df_post.head()

Unnamed: 0_level_0,bond_amount,days,class_ M,class_AM,class_BM,class_CM
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BLACK,28242761,92777,60.0,1002.0,64.0,93.0
HISPANIC,21831313,55650,54.0,716.0,52.0,63.0
WHITE,37081581,83308,111.0,1278.0,122.0,174.0


In [28]:
mis_pre_df.describe(include='all')

Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days,counts,tens
count,3889,3889,3889,3889,3889,3889.0,3889.0,3889,3889,3889,3889,3889.0,3889.0,3889.0
unique,261,3544,515,5,2,,,74,17,7,5,,,
top,2016-08-19 00:00:00,ZZHWHEEJ,2016-07-05 00:00:00,WHITE,M,,,"FAILURE TO APPEAR, SECOND DEGREE AM",HARTFORD CC,NONE,AM,,,
freq,145,5,35,1635,3217,,,848,1168,3728,2892,,,
first,2016-07-01 00:00:00,,2000-09-22 00:00:00,,,,,,,,,,,
last,2017-06-30 00:00:00,,2017-06-29 00:00:00,,,,,,,,,,,
mean,,,,,,36.168167,21478.43,,,,,59.048084,27.860375,31.766521
std,,,,,,11.362481,45217.67,,,,,154.314135,35.266749,11.58885
min,,,,,,18.0,1.0,,,,,1.0,1.0,10.0
25%,,,,,,27.0,2500.0,,,,,4.0,3.0,20.0


In [29]:
mis_post_df.describe(include='all')

Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days,counts,tens
count,3821,3821,3821,3821,3821,3821.0,3821.0,3821,3821,3821,3821,3821.0,3821.0,3821.0
unique,276,3440,528,5,2,,,63,15,6,4,,,
top,2017-10-23 00:00:00,ZZSEJLHC,2018-01-22 00:00:00,WHITE,M,,,"FAILURE TO APPEAR, SECOND DEGREE AM",HARTFORD CC,NONE,AM,,,
freq,105,7,38,1685,3168,,,947,1173,3640,3024,,,
first,2017-07-02 00:00:00,,1999-09-20 00:00:00,,,,,,,,,,,
last,2018-06-29 00:00:00,,2018-06-28 00:00:00,,,,,,,,,,,
mean,,,,,,36.356975,22959.93,,,,,60.962575,35.889034,31.818896
std,,,,,,11.307125,48139.73,,,,,164.002986,54.637029,11.568809
min,,,,,,16.0,1.0,,,,,1.0,1.0,10.0
25%,,,,,,27.0,2500.0,,,,,3.0,3.0,20.0


In [30]:
# Is there a difference between racial makeup between the first year and second?  

white1 = len(mis_pre_df[mis_pre_df.race=='WHITE'])
min1 = len(mis_pre_df[mis_pre_df.race!='WHITE'])
total1 = len(mis_pre_df.race)
white2 = len(mis_post_df[mis_post_df.race=='WHITE'])
min2 = len(mis_post_df[mis_post_df.race!='WHITE'])
total2 = len(mis_post_df.race)
p_hat = (min1+min2)/(total1+total2)
p1 = min1/total1
p2 = min2/total2
tail = 2

observed_diff = p1-p2

z = (observed_diff-0) / np.sqrt(p_hat*(1-p_hat)*((1/total1)+(1/total2)))
z_crit = stats.norm.ppf(.95)
moe = z_crit*np.sqrt((p1/total1)+(p2/total2))
ci_high = observed_diff + moe
ci_low = observed_diff - moe
z_pval = (1-stats.norm.cdf(z))*tail
    
print('The margin of error is', '{:.7f}'.format(moe), '(', '{:.2f}'.format(moe*100), 'percent )')
print('The 95% confidence interval is', '{:.7f}'.format(ci_low), 'to', '{:.7f}'.format(ci_high), '(','{:.2f}'.format(ci_low*100), 'to', '{:.2f}'.format(ci_high*100), 'percent difference )')
print('The p-value is', z_pval, '(', '{:.7f}'.format(z_pval), ')')

The margin of error is 0.0282672 ( 2.83 percent )
The 95% confidence interval is -0.0076997 to 0.0488347 ( -0.77 to 4.88 percent difference )
The p-value is 0.06822222656169519 ( 0.0682222 )


In [31]:
mis_pre_df.bond_amount.describe()

count    3.889000e+03
mean     2.147843e+04
std      4.521767e+04
min      1.000000e+00
25%      2.500000e+03
50%      7.500000e+03
75%      2.500000e+04
max      1.025000e+06
Name: bond_amount, dtype: float64

In [32]:
mis_post_df.bond_amount.describe()

count    3.821000e+03
mean     2.295993e+04
std      4.813973e+04
min      1.000000e+00
25%      2.500000e+03
50%      1.000000e+04
75%      2.500000e+04
max      1.000150e+06
Name: bond_amount, dtype: float64

In [33]:
pre_race_gb = mis_pre_df.groupby(['race']).median().drop(['tens', 'counts', 'age'], axis=1).drop(['AMER IND', 'ASIAN'], axis=0)
pre_race_gb

Unnamed: 0_level_0,bond_amount,days
race,Unnamed: 1_level_1,Unnamed: 2_level_1
BLACK,7500,22
HISPANIC,8500,21
WHITE,7500,23


In [34]:
#mis_post_df['total'] = mis_post_df.groupby('race')['race'].transform('count')
post_race_gb = mis_post_df.groupby(['race']).median().drop(['tens', 'counts', 'age'], axis=1).drop(['AMER IND', 'ASIAN'], axis=0)
post_race_gb

Unnamed: 0_level_0,bond_amount,days
race,Unnamed: 1_level_1,Unnamed: 2_level_1
BLACK,9000.0,15.0
HISPANIC,10000.0,14.0
WHITE,7500.0,15.0


In [35]:
chi, p = stats.chisquare(post_race_gb.days, pre_race_gb.days)
chi, p

(7.343214756258234, 0.025435552519779896)

In [36]:
chi, p = stats.chisquare(post_race_gb.bond_amount, pre_race_gb.bond_amount)
chi, p

(564.7058823529412, 2.3750660836565362e-123)

In [37]:
stat, p, dof, expected = stats.chi2_contingency(post_race_gb, pre_race_gb)

# interpret test-statistic
prob = 0.95
critical = stats.chi2.ppf(prob, dof)
if abs(stat) >= critical:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
alpha = 1.0 - prob
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
    
stat, p, dof, expected, alpha, critical

Independent (fail to reject H0)
Independent (fail to reject H0)


(0.9279343061987403,
 0.6287842077402879,
 2,
 array([[9000.05650995,   14.94349005],
        [9997.4005425 ,   16.5994575 ],
        [7502.54294756,   12.45705244]]),
 0.050000000000000044,
 5.991464547107979)

In [38]:
def prop_test(col1, col2, orig_col, tail=2, percent=.975):
    '''I'm not sure if I need this in this section.'''
    n1 = len(col1)
    n2 = len(col2)
    n3 = len(orig_col)
    
    pro1 = np.mean(col1)
    pro2 = np.mean(col2)
    pro3 = (np.sum(col1)+np.sum(col2))/n3
    
    p_1 = pro1*(1-pro1)
    p_2 = pro2*(1-pro2)
    p_3 = pro3*(1-pro3)
    
    observed_diff = diff_of_means(col1, col2)
    
    z = observed_diff / np.sqrt(p_3*((1/n1)+(1/n2)))
    z_crit = stats.norm.ppf(percent)
    moe = z_crit*np.sqrt((p_1/n1)+(p_2/n2))
    ci_high = observed_diff + moe
    ci_low = observed_diff - moe
    z_pval = (1-stats.norm.cdf(z))*tail
    
    print('The margin of error is', '{:.7f}'.format(moe), '(', '{:.2f}'.format(moe*100), 'percent )')
    print('The 95% confidence interval is', '{:.7f}'.format(ci_low), 'to', '{:.7f}'.format(ci_high), '(','{:.2f}'.format(ci_low*100), 'to', '{:.2f}'.format(ci_high*100), 'percent difference )')
    print('The p-value is', z_pval, '(', '{:.7f}'.format(z_pval), ')')