In [1]:
# Inferential Statistics for CT Pretrial Detainees
# (Springboard Capstone 1)
# 2019, Misty M. Giles

# Import everything.  
%matplotlib inline
from datetime import datetime as date
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

# Select the file created in CT_csv (unit 5 data wrangling assignment)
file = 'detainees_classed_offenses.csv'

In [2]:
# Read in the file
df = pd.read_csv(file, parse_dates=['download_date', 'latest_admission_date'])

# This corrects a typo that's being fixed in CT_csv.
df['offense_class'] = [offense[-2:] if offense.endswith(('AM','BM','CM','DM','UM',' M', 
                       'AF','BF','CF','DF','UF',' F')) else np.nan for offense in df.offense]

# Days that CT says a detainee has been in the system.  This doesn't account for
# some detainees.  There are 730 days in the dataset, and the state says that entrance 
# dates over a year before could be original entrance date but are definitely not to be
# trusted.  The days column will be capped at 1,095 (730 + 365).  This is definitely going
#  to need further investigation.
df.days = [int(time[:-23]) for time in df.days]
df.days = [time if time <= 1095 else 1095 for time in df.days]

# Create age ranges of decades.  For some reason, decades can't be used as a column header.
df['tens'] = [int((age // 10) * 10) for age in df.age] 

# Check that everything worked.  In this EDA, there should be 0 null values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28808 entries, 0 to 28807
Data columns (total 14 columns):
download_date            28808 non-null datetime64[ns]
identifier               28808 non-null object
latest_admission_date    28808 non-null datetime64[ns]
race                     28808 non-null object
gender                   28808 non-null object
age                      28808 non-null int64
bond_amount              28808 non-null int64
offense                  28808 non-null object
facility                 28808 non-null object
detainer                 28808 non-null object
offense_class            28808 non-null object
days                     28808 non-null int64
counts                   28808 non-null int64
tens                     28808 non-null int64
dtypes: datetime64[ns](2), int64(5), object(7)
memory usage: 3.1+ MB


In [3]:
# Separate out the two datasets, misdemeanor (primary) and felony.
felony_df = df.loc[df['offense_class'].str.endswith('F')]
misdemeanor_df = df.loc[df['offense_class'].str.endswith('M')]

# Divide the misdemeanor data into pre-enactment and post.
date_pre = pd.date_range(start='7/1/2016', end='6/30/2017', freq='D').tolist()
date_post = pd.date_range(start='7/1/2017', end='6/30/2018', freq='D').tolist()
date_end = pd.date_range(start='7/1/2018', end='11/30/2018', freq='D').tolist()
mis_pre_df = misdemeanor_df.loc[misdemeanor_df.download_date < '2017-07-01']
mis_post_df = misdemeanor_df.loc[misdemeanor_df.download_date >= '2017-07-01']
mis_post_df = mis_post_df.loc[mis_post_df.download_date < '2018-07-01']
mis_outofbounds_df = misdemeanor_df.loc[misdemeanor_df.download_date >= '2018-07-01']

# Bin edges to more easily see age differences.
bin_edges = [10, 20, 30, 40, 50, 60, 70, 80, 90] 

# Bin edges for non-age plots.  Sqrt to avoid 'bin bias.'
bins_sqrt = int(np.sqrt(len(misdemeanor_df.bond_amount))) 

# Set up an ecdf function to quickly see distributions.
def ecdf(data):
    '''Compute continuous distribution function for one column'''
    n = len(data)
    x = np.sort(data)
    y = np.arange(1, n+1) / n
    return x, y

# Set up a function to generate bootstrap replicates
def bs_reps(data, func, size=1):
    '''Draw boostrap replicates'''
    # Initialize empty array
    bs_replicates = np.empty(size)
    # Generate replicates
    for i in range(size):
        bs_replicates[i] = func(np.random.choice(data, size=len(data)))
    return bs_replicates

# Set up a function to permute data sets
def permutation_sample(data1, data2):
    '''Generate a permuted sample from two data sets'''
    # Concatenate the data sets
    data = np.concatenate((data1, data2))
    # Permute the concatenated array
    permuted_data = np.random.permutation(data)
    # Split the permuted data back into two data sets
    perm_sample_1 = permuted_data[:len(data1)]
    perm_sample_2 = permuted_data[len(data1):]
    return perm_sample_1, perm_sample_2

# Set up a function to generate permutation replicates
def draw_perm_reps(data1, data2, func, size=1):
    '''Draw permutation replicates'''
    # Initialize empty array of proper size
    perm_replicates = np.empty(size)
    for i in range(size):
        # Generate permutation sample
        perm_sample_1, perm_sample_2 = permutation_sample(data1, data2)
        # Compute the test statistic
        perm_replicates[i] = func(perm_sample_1, perm_sample_2)
    return perm_replicates

# Set up a function to calculate differences of means
def diff_of_means(data1, data2):
    '''Calculate difference of means of two arrays'''
    diff = np.mean(data1) - np.mean(data2)
    return diff

def num_means(column):
    '''Get the statistics, mean, and CI for a column'''
    # Get statsitics for column
    m = np.mean(column) # mean of sample
    n = len(column) # sample size
    s = np.std(column) # stdev of sample
    sem = s / np.sqrt(n) # sample error
    
    # Get several bootstrap replicates for column
    bs_replicates = bs_reps(column, np.mean, 10000)
    
    # Get confidence interval
    bootstrap_ci = np.percentile(bs_replicates, [2.5, 97.5])
    
    return bs_replicates, bootstrap_ci

def ptest(column1, column2, test_col):
    '''Get p-value for bootstraps'''
    reps1, ci1 = num_means(column1)
    reps2, ci2 = num_means(column2)
    
    #pval_bs = np.sum((reps1 >= np.mean(test_col)) + (reps2 >= np.mean(test_col))) / (len(reps1) + len(reps2))
    pval_bs1 = np.sum((reps1 >= np.mean(test_col))) / len(reps1)
    pval_bs2 = np.sum((reps2 >= np.mean(test_col))) / len(reps2)
    
    # Print the test results and stats
    print('Bootstrap p-value 1 = ', '%.4f' % pval_bs1)
    print('Bootstrap p-value 2 = ', '%.4f' % pval_bs2)
    print('Mean of observed data     = ', np.mean(test_col))
    print('Bootstrap mean of column1 = ', np.mean(reps1))
    print('Bootstrap mean of column2 = ', np.mean(reps2))
    print('Bootstrap CI column1:\t', ci1[0], ',', ci1[1])
    print('Bootstrap CI column2:\t', ci2[0], ',', ci2[1])

# And for reference, the number of bins in non-age plots:
bins_sqrt

# Seed
np.random.seed(42)

In [4]:
mis_pre_df.head()

Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days,counts,tens
3,2016-10-11,ZZEBBBSZ,2016-10-07,BLACK,F,57,2500,THREATENING AM,YORK CI,NONE,AM,4,3,50
13,2016-07-13,ZZEBBEWZ,2016-06-17,WHITE,M,63,5000,"ASSAULT, THIRD DEGREE AM",CORRIGAN CI,NONE,AM,26,13,60
14,2017-04-17,ZZEBBHER,2017-01-30,WHITE,M,50,19050,"CRIMINAL TRESPASS, FIRST DEGREE AM",HARTFORD CC,NONE,AM,77,77,50
15,2016-12-06,ZZEBBHER,2016-11-09,WHITE,M,50,500,"CRIMINAL TRESPASS, FIRST DEGREE AM",HARTFORD CC,NONE,AM,27,18,50
16,2017-01-05,ZZEBBHER,2016-12-15,WHITE,M,50,10250,"CRIMINAL TRESPASS, FIRST DEGREE AM",HARTFORD CC,NONE,AM,21,21,50


In [5]:
mis_post_df.head()

Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days,counts,tens
2,2017-07-13,ZZEBBBJW,2017-07-06,WHITE,M,44,100000,CRIM VIOL OF PROTECTIVE ORDER AM,HARTFORD CC,NONE,AM,7,7,40
4,2017-09-13,ZZEBBBZJ,2017-09-12,BLACK,M,43,20000,THREATENING AM,BRIDGEPORT CC,NONE,AM,1,1,40
5,2018-05-09,ZZEBBBZJ,2018-04-24,BLACK,M,44,150000,THREATENING AM,BRIDGEPORT CC,NONE,AM,15,15,40
8,2018-01-02,ZZEBBCRZ,2017-12-29,WHITE,M,47,2500,"ASSAULT, THIRD DEGREE AM",NEW HAVEN CC,NONE,AM,4,4,40
9,2018-03-22,ZZEBBCRZ,2018-03-21,HISPANIC,M,47,2500,CRIM VIOL OF PROTECTIVE ORDER AM,NEW HAVEN CC,NONE,AM,1,1,40


In [6]:
mis_outofbounds_df.head()

Unnamed: 0,download_date,identifier,latest_admission_date,race,gender,age,bond_amount,offense,facility,detainer,offense_class,days,counts,tens
7,2018-11-11,ZZEBBCRW,2018-10-10,WHITE,M,55,20000,"CRIMINAL TRESPASS, FIRST DEGREE AM",BRIDGEPORT CC,NONE,AM,32,32,50
11,2018-07-23,ZZEBBEBS,2018-07-09,HISPANIC,M,46,5000,"FAILURE TO APPEAR, SECOND DEGREE AM",HARTFORD CC,NONE,AM,14,14,40
27,2018-09-24,ZZEBBSBS,2018-08-15,BLACK,M,56,2100,DISORDERLY CONDUCT CM,HARTFORD CC,NONE,CM,40,40,50
28,2018-11-11,ZZEBBSLH,2018-05-14,WHITE,M,56,41000,ILL OP MV WO IGNITION DEVICE CM,CORRIGAN CI,NONE,CM,181,179,50
35,2018-08-16,ZZEBCBSZ,2018-07-18,HISPANIC,M,53,20000,"CRIMINAL TRESPASS, FIRST DEGREE AM",BRIDGEPORT CC,NONE,AM,29,29,50


In [7]:
mis_pre_df.shape, mis_post_df.shape, mis_outofbounds_df.shape

((3889, 14), (3821, 14), (1954, 14))

In [8]:
misdemeanor_df.shape

(9664, 14)

In [9]:
mis_pre_df.describe()

Unnamed: 0,age,bond_amount,days,counts,tens
count,3889.0,3889.0,3889.0,3889.0,3889.0
mean,36.168167,21478.43,59.048084,27.860375,31.766521
std,11.362481,45217.67,154.314135,35.266749,11.58885
min,18.0,1.0,1.0,1.0,10.0
25%,27.0,2500.0,4.0,3.0,20.0
50%,34.0,7500.0,22.0,16.0,30.0
75%,45.0,25000.0,54.0,38.0,40.0
max,75.0,1025000.0,1095.0,287.0,70.0


Okay, now that everything is set up, I'm going to start looking for the variables of interest: gender, race, age, offense_class, days, and bond_amount.  Are any of these numeric variables normal?

In [10]:
chisq, pval = stats.normaltest(mis_pre_df.age)
if pval < .05:
    print('Normal distribution IS NOT a good fit for age.  (p-value = ', pval, ')')
else:
    print('Normal distribution IS a good fit for age.  (p-value = ', pval, ')')
    
chisq, pval = stats.normaltest(mis_pre_df.days)
if pval < .05:
    print('Normal distribution IS NOT a good fit for days.  (p-value = ', pval, ')')
else:
    print('Normal distribution IS a good fit for days.  (p-value = ', pval, ')')
    
chisq, pval = stats.normaltest(mis_pre_df.bond_amount)
if pval < .05:
    print('Normal distribution IS NOT a good fit for bond.  (p-value = ', pval, ')')
else:
    print('Normal distribution IS a good fit for bond.  (p-value = ', pval, ')')

Normal distribution IS NOT a good fit for age.  (p-value =  2.557558769554361e-57 )
Normal distribution IS NOT a good fit for days.  (p-value =  0.0 )
Normal distribution IS NOT a good fit for bond.  (p-value =  0.0 )


##### First set of tests: gender

Below we'll look at the bond and days means and see who is expected to stay longer and pay more.

For both variables:
H0: male == female
Ha: male != female 
alpha = .05

Fascinatingly, the p-values for both means are about .49, a value high enough that we fail to reject H0 for both tests.  The bond amount and days detained for both genders can be considered statistically equal.

In [11]:
# Determine the populations
female = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.gender == 'F']
male = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.gender == 'M']

In [12]:
# Test female vs. male for bond amounts
ptest(female.bond_amount, male.bond_amount, mis_pre_df.bond_amount)

Bootstrap p-value 1 =  0.0000
Bootstrap p-value 2 =  0.9759
Mean of observed data     =  21478.429673437902
Bootstrap mean of column1 =  13762.90676607143
Bootstrap mean of column2 =  23092.35926677028
Bootstrap CI column1:	 11656.23947172619 , 16276.998474702381
Bootstrap CI column2:	 21500.68378924464 , 24784.367819396954


In [13]:
# Test female vs. male for days detained
ptest(female.days, male.days, mis_pre_df.days)

Bootstrap p-value 1 =  0.0000
Bootstrap p-value 2 =  0.9733
Mean of observed data     =  59.04808434044742
Bootstrap mean of column1 =  32.78871324404762
Bootstrap mean of column2 =  64.50434311470313
Bootstrap CI column1:	 27.065438988095238 , 39.52529761904762
Bootstrap CI column2:	 58.96423686664594 , 70.2545927883121


##### Second set of tests: race

Below we'll again look at the bond and days means and see who is expected to stay longer and pay more.

For both variables: 
* H0: white == black / minority 
* Ha: white != black / minority
* alpha = .05

Interestingly, we refuse to reject the null hypothesis for all of the bond amounts.  For the days detained, however, we would reject the null for Asians and Native Americans.  This sample is too small to come to this conclusion.  (52 Asians+Native Americans out of 3889 detainees, or 1.3% of the sample)

In [14]:
# Determine the populations
race_df = mis_pre_df[['bond_amount', 'days', 'race']].copy()
race_df['r'] = mis_pre_df[['race']].copy()
race_df = pd.get_dummies(race_df, columns=['r'])
white = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.race == 'WHITE']
minority = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.race != 'WHITE']
black = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.race == 'BLACK']
hispanic = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.race == 'HISPANIC']
asian = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.race == 'ASIAN']
amer_ind = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.race == 'AMER IND']

In [15]:
# Test white vs. non-white for bond amount
ptest(white.bond_amount, minority.bond_amount, mis_pre_df.bond_amount)

Bootstrap p-value 1 =  0.0004
Bootstrap p-value 2 =  0.9857
Mean of observed data     =  21478.429673437902
Bootstrap mean of column1 =  18374.43814360856
Bootstrap mean of column2 =  23727.593361180123
Bootstrap CI column1:	 16874.362048929663 , 19995.959678899082
Bootstrap CI column2:	 21669.06462954747 , 25996.33307453416


In [16]:
# Test white vs. black for bond amount
ptest(white.bond_amount, black.bond_amount, mis_pre_df.bond_amount)

Bootstrap p-value 1 =  0.0003
Bootstrap p-value 2 =  0.8363
Mean of observed data     =  21478.429673437902
Bootstrap mean of column1 =  18377.590293272173
Bootstrap mean of column2 =  22832.748686588235
Bootstrap CI column1:	 16816.43778287462 , 20030.59110091743
Bootstrap CI column2:	 20285.76745098039 , 25627.996117647057


In [17]:
# Test white vs. hispanic for bond amount
ptest(white.bond_amount, hispanic.bond_amount, mis_pre_df.bond_amount)

Bootstrap p-value 1 =  0.0001
Bootstrap p-value 2 =  0.9894
Mean of observed data     =  21478.429673437902
Bootstrap mean of column1 =  18355.375782018345
Bootstrap mean of column2 =  25370.432859115426
Bootstrap CI column1:	 16810.526911314984 , 19978.124250764522
Bootstrap CI column2:	 21967.047491909383 , 29277.281499460627


In [18]:
# Test white vs. Asian for bond amount
ptest(white.bond_amount, asian.bond_amount, mis_pre_df.bond_amount)

Bootstrap p-value 1 =  0.0001
Bootstrap p-value 2 =  0.1651
Mean of observed data     =  21478.429673437902
Bootstrap mean of column1 =  18373.99280281346
Bootstrap mean of column2 =  15800.018457142856
Bootstrap CI column1:	 16777.859801223243 , 20002.29510703364
Bootstrap CI column2:	 6994.304285714286 , 28957.449285714283


In [None]:
# Test white vs. Native American for bond amount
ptest(white.bond_amount, amer_ind.bond_amount, mis_pre_df.bond_amount)

Bootstrap p-value 1 =  0.0000
Bootstrap p-value 2 =  0.2926
Mean of observed data     =  21478.429673437902
Bootstrap mean of column1 =  18375.66690434251
Bootstrap mean of column2 =  18882.131470588236
Bootstrap CI column1:	 16848.421269113154 , 20007.73085626911
Bootstrap CI column2:	 9852.573529411764 , 29338.308823529413


In [None]:
# Test white vs. non-white for days detained
ptest(white.days, minority.days, mis_pre_df.days)

Bootstrap p-value 1 =  0.0094
Bootstrap p-value 2 =  0.9414
Mean of observed data     =  59.04808434044742
Bootstrap mean of column1 =  51.33946042813455
Bootstrap mean of column2 =  64.62605550133095
Bootstrap CI column1:	 45.38275229357798 , 57.77081039755351
Bootstrap CI column2:	 57.66500665483585 , 71.88664596273291


In [None]:
# Test white vs. black for days detained
ptest(white.days, black.days, mis_pre_df.days)

Bootstrap p-value 1 =  0.0112
Bootstrap p-value 2 =  0.9882
Mean of observed data     =  59.04808434044742
Bootstrap mean of column1 =  51.384512048929665
Bootstrap mean of column2 =  70.30228690196078
Bootstrap CI column1:	 45.40266055045871 , 57.79093272171254
Bootstrap CI column2:	 60.5293137254902 , 80.98123529411765


In [None]:
# Test white vs. hispanic for days detained
ptest(white.days, hispanic.days, mis_pre_df.days)

Bootstrap p-value 1 =  0.0104
Bootstrap p-value 2 =  0.4176
Mean of observed data     =  59.04808434044742
Bootstrap mean of column1 =  51.349767461773695
Bootstrap mean of column2 =  58.1319405609493
Bootstrap CI column1:	 45.33923547400612 , 57.76964831804281
Bootstrap CI column2:	 48.854314994606256 , 68.51504854368932


In [None]:
# Test white vs. Asian for days detained
ptest(white.days, asian.days, mis_pre_df.days)

Bootstrap p-value 1 =  0.0089
Bootstrap p-value 2 =  0.0490
Mean of observed data     =  59.04808434044742
Bootstrap mean of column1 =  51.340873027522946
Bootstrap mean of column2 =  42.12347428571428
Bootstrap CI column1:	 45.36755351681957 , 57.719877675840976
Bootstrap CI column2:	 25.513571428571428 , 62.200714285714284


In [None]:
# Test white vs. Native American for days detained
ptest(white.days, amer_ind.days, mis_pre_df.days)

Bootstrap p-value 1 =  0.0125
Bootstrap p-value 2 =  0.0297
Mean of observed data     =  59.04808434044742
Bootstrap mean of column1 =  51.355428746177374
Bootstrap mean of column2 =  41.36328235294118
Bootstrap CI column1:	 45.42076452599388 , 57.90492354740061
Bootstrap CI column2:	 24.0 , 59.76617647058821


##### Third set of tests: offense class

Below we'll again look at the bond and days means and see if the class of offense affects bond amount or days detained.

For both variables: 


In [None]:
offense_df = mis_pre_df[['bond_amount', 'days', 'offense_class']].copy()
offense_df['class'] = mis_pre_df[['offense_class']].copy()
offense_df = pd.get_dummies(offense_df, columns=['class'])
offense_am = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.offense_class == 'AM']
offense_bm = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.offense_class == 'BM']
offense_cm = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.offense_class == 'CM']
offense_dm = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.offense_class == 'DM']
offense_um = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.offense_class == 'UM']
offense_m  = mis_pre_df[['bond_amount', 'days']].loc[mis_pre_df.offense_class == ' M']

In [None]:
# Test A-class vs. all others for bond amount
ptest(offense_am.bond_amount, mis_pre_df.bond_amount.loc[mis_pre_df.offense_class != 'AM'], mis_pre_df.bond_amount)

Bootstrap p-value 1 =  0.9954
Bootstrap p-value 2 =  0.0000
Mean of observed data     =  21478.429673437902
Bootstrap mean of column1 =  23627.843946058092
Bootstrap mean of column2 =  15250.521114944833
Bootstrap CI column1:	 21994.73470781466 , 25443.472069502077
Bootstrap CI column2:	 13079.659227683049 , 17727.80173019057


In [None]:
# Test B-class vs. all others for bond amount
ptest(offense_bm.bond_amount, mis_pre_df.bond_amount.loc[mis_pre_df.offense_class != 'BM'], mis_pre_df.bond_amount)

Bootstrap p-value 1 =  0.0033
Bootstrap p-value 2 =  0.7130
Mean of observed data     =  21478.429673437902
Bootstrap mean of column1 =  16466.496372347265
Bootstrap mean of column2 =  21925.3974883175
Bootstrap CI column1:	 13498.385852090032 , 19883.43528938907
Bootstrap CI column2:	 20461.06203884852 , 23502.706309390724


In [None]:
# Test C-class vs. all others for bond amount
ptest(offense_cm.bond_amount, mis_pre_df.bond_amount.loc[mis_pre_df.offense_class != 'CM'], mis_pre_df.bond_amount)

Bootstrap p-value 1 =  0.0000
Bootstrap p-value 2 =  0.9263
Mean of observed data     =  21478.429673437902
Bootstrap mean of column1 =  13098.804236263735
Bootstrap mean of column2 =  22587.263966191033
Bootstrap CI column1:	 11004.981043956044 , 15419.392582417582
Bootstrap CI column2:	 21087.047502912053 , 24230.267494175885


In [None]:
# Test no-class vs. all others for bond amount
ptest(offense_m.bond_amount, mis_pre_df.bond_amount.loc[mis_pre_df.offense_class != ' M'], mis_pre_df.bond_amount)

Bootstrap p-value 1 =  0.1939
Bootstrap p-value 2 =  0.6080
Mean of observed data     =  21478.429673437902
Bootstrap mean of column1 =  18033.478913043476
Bootstrap mean of column2 =  21692.19786171085
Bootstrap CI column1:	 11034.331630434783 , 26650.046956521735
Bootstrap CI column2:	 20305.23628723695 , 23197.54087865537


No point in testing UM.  There's only one.

In [None]:
# Test A-class vs. all others for days detained
ptest(offense_am.days, mis_pre_df.days.loc[mis_pre_df.offense_class != 'AM'], mis_pre_df.days)

Bootstrap p-value 1 =  0.1852
Bootstrap p-value 2 =  0.8807
Mean of observed data     =  59.04808434044742
Bootstrap mean of column1 =  56.698283056708156
Bootstrap mean of column2 =  65.78428294884654
Bootstrap CI column1:	 51.715975103734436 , 62.0309215076072
Bootstrap CI column2:	 54.83653460381143 , 77.51356569709128


In [None]:
# Test B-class vs. all others for days detained
ptest(offense_bm.days, mis_pre_df.days.loc[mis_pre_df.offense_class != 'BM'], mis_pre_df.days)

Bootstrap p-value 1 =  0.8375
Bootstrap p-value 2 =  0.3647
Mean of observed data     =  59.04808434044742
Bootstrap mean of column1 =  69.28580064308682
Bootstrap mean of column2 =  58.1989180547792
Bootstrap CI column1:	 50.05779742765274 , 91.29292604501607
Bootstrap CI column2:	 53.3621366685299 , 63.15883873672443


In [None]:
# Test C-class vs. all others for days detained
ptest(offense_cm.days, mis_pre_df.days.loc[mis_pre_df.offense_class != 'CM'], mis_pre_df.days)

Bootstrap p-value 1 =  0.9792
Bootstrap p-value 2 =  0.1632
Mean of observed data     =  59.04808434044742
Bootstrap mean of column1 =  77.65425252747254
Bootstrap mean of column2 =  56.62044726266745
Bootstrap CI column1:	 59.83010989010989 , 97.40005494505496
Bootstrap CI column2:	 51.920908561444385 , 61.58505387303436


In [None]:
# Test no-class vs. all others for days detained
ptest(offense_m.days, mis_pre_df.days.loc[mis_pre_df.offense_class != ' M'], mis_pre_df.days)

In [None]:
def prop_test(col1, col2, orig_col, tail=2, percent=.975):
    '''I have no clue what this does or why it is here, tbh.  Wrote it thinking it would do something different.'''
    n1 = len(col1)
    n2 = len(col2)
    n3 = len(orig_col)
    
    pro1 = np.mean(col1)
    pro2 = np.mean(col2)
    pro3 = np.mean(orig_col)
    
    p_1 = pro1*(1-pro1)
    p_2 = pro2*(1-pro2)
    p_3 = pro3*(1-pro3)
    
    observed_diff = diff_of_means(col1, col2)
    
    z = observed_diff / np.sqrt(p_3*((1/n1)+(1/n2)))
    z_crit = stats.norm.ppf(percent)
    moe = z_crit*np.sqrt((p_1/n1)+(p_2/n2))
    ci_high = observed_diff + moe
    ci_low = observed_diff - moe
    z_pval = (1-stats.norm.cdf(z))*tail
    
    print('The margin of error is', '{:.7f}'.format(moe), '(', '{:.2f}'.format(moe*100), 'percent )')
    print('The 95% confidence interval is', '{:.7f}'.format(ci_low), 'to', '{:.7f}'.format(ci_high), '(','{:.2f}'.format(ci_low*100), 'to', '{:.2f}'.format(ci_high*100), 'percent difference )')
    print('The p-value is', z_pval, '(', '{:.7f}'.format(z_pval), ')')

In [None]:
offense_df.describe()

In [None]:
race_df.describe()

In [None]:
race = 

In [None]:
prop_test(race_df.r_WHITE, race_df.r_HISPANIC, race_df.race)

n_b = len(b.call)
n_w = len(w.call)
cb = callbacks

cb_p = cb*(1-cb)
cb_pb = callbacks_b*(1-callbacks_b)
cb_pw = callbacks_w*(1-callbacks_w)

tail = 2

z = observed_diff / np.sqrt(cb_p*((1/n_b)+(1/n_w)))
z_crit = stats.norm.ppf(.975)
moe = z_crit*np.sqrt((cb_pb/n_b)+(cb_pw/n_w))
ci_high = observed_diff + moe
ci_low = observed_diff - moe
z_pval = (1-stats.norm.cdf(z))*tail

print('The margin of error is', '{:.7f}'.format(moe), '(', '{:.2f}'.format(moe*100), 'percent )')
print('The 95% confidence interval is', '{:.7f}'.format(ci_low), 'to', '{:.7f}'.format(ci_high), '(','{:.2f}'.format(ci_low*100), 'to', '{:.2f}'.format(ci_high*100), 'percent difference )')
print('The p-value is', z_pval, '(', '{:.7f}'.format(z_pval), ')')