In [50]:
import geopandas as gpd
import pandas as pd
import numpy as np
import json
import os

In [51]:
# Misc:
web_base = 'https://data.sandiego.gov/datasets/?department=police'

#### What we want to keep as a general outline for data

stop_id

stop_cause / reason_for_stop

result_of_stop

stopdate

stoptime

service_area

block

subject_race / race --> map to race codes

beat (new) --> map to service area

actions_taken (new)

officer_assignment_key (new)

exp_years (new)

stopduration (new)

perceived_age (new)

In [52]:
def get_table(yr):
    if yr < 2018:
        url = 'http://seshat.datasd.org/pd/vehicle_stops_{}_datasd_v1.csv'.format(yr)
        return pd.read_csv(url)
    
    else: # post- 2018
        # cols we care about
        cols_2018 = ['stop_id', 'pid', 'date_stop', 'time_stop', 'stopduration', 
                 'officer_assignment_key', 'exp_years', 'beat', 
                 'perceived_age', 'gend']

        url = 'http://seshat.datasd.org/pd/ripa_stops_datasd_v1.csv'
        df = pd.read_csv(url)
        return df[cols_2018]

### columns we still need:

stop_cause / reason_for_stop

result_of_stop

actions_taken 

In [53]:
# other tables for POST- 2018
def get_merge_data():
    reason = 'stop_reason'
    reason_cols = ['stop_id', 'pid', 'reason_for_stop', 'reason_for_stopcode']

    result = 'stop_result'
    result_cols= ['stop_id', 'pid', 'result']

    race = 'race'
    race_cols = ['stop_id', 'pid', 'race']

    action = 'actions_taken'
    action_cols = ['stop_id', 'pid', 'action']

    base = 'http://seshat.datasd.org/pd/ripa_{}_datasd.csv'

    reason_df = pd.read_csv(base.format(reason))
    result_df = pd.read_csv(base.format(result))
    race_df = pd.read_csv(base.format(race))
    action_df = pd.read_csv(base.format(action))
    
    return [[reason_cols, result_cols, race_cols, action_cols], [reason_df, result_df, race_df, action_df]]

In [54]:
# get all the new cols from other urls
def gen_cols(df1, gen_df, cols):
    new = df1.merge(gen_df, on = ['stop_id', 'pid'])
    drop = [x for x in new.columns if x not in cols]
    new = new.drop(columns = drop).drop_duplicates(subset = ['stop_id', 'pid'])
    return new

In [55]:
def merge_data(tbl):
    merge_cols = get_merge_data()[0]
    merge_dfs = get_merge_data()[1]
    merged = []

    for i in range(len(merge_dfs)):
        merged.append(gen_cols(tbl, merge_dfs[i], merge_cols[i]))
          
    first = merged[0]
    for i in range(0, len(merged)):
        if i == len(merged) - 1:
            break 
        first = pd.merge(first, merged[i + 1], on =  ['stop_id', 'pid'])

    df = pd.merge(tbl, first, on = ['stop_id', 'pid'])
    df = df.drop_duplicates(subset = ['stop_id', 'pid'])
    return df

### Clean column names

* These are the same: 'stop_id', 'date_stop', 'time_stop', 'service_area', 'subject_race'/'Race Code', 'subject_sex'/'gend',


* These are excess for post- 2018: 'beat', 'officer_assignment_key', 'exp_years', 'perceived_age'


* These are excess for pre- 2018: 
    - 'sd_resident'
    - 'arrested'
    - 'searched'
    - 'obtained_consent'
    - 'contraband_found'
    - 'property_seized'


&rightarrow;  Therefore: Need to make 'subject_race' == 'Race Code' and  'subject_sex' == 'gend'


* Additionally for pre- 2018, need to engineer: 'outcome'

    -  from ['arrested', 'searched', 'property_seized'] in pre- 2018 and ['action', 'result'] in post- 2018


* Columns we are ok with having lots of null values because they are useful for analysis:

    - 'sd_resident'
    - 'beat'
    - 'officer_assignment_key'
    - 'exp_years'
    - 'perceived_age'

In [56]:
# only for PRE- 2018
def change_bool(string):
        if (string == 'Y') | (string =='y'):
            return 1
        if (string == 'N') | (string == 'n'):
            return 0
        return np.nan

def map_bool(cols, df):
    for col in cols:
        if col not in list(df.columns):
            continue
        df[col] = df[col].apply(lambda x: change_bool(x))
    return df

# only for POST- 2018
def change_sex(x):
    if (x == 1.0) | (x == 1):
        return 'M'
    elif (x == 2.0) | (x == 2):
        return 'F'
    else:
        return np.nan

def map_sex(col):
    return col.apply(lambda x: change_sex(x))

In [57]:
# accounts for both formats
def clean_bool(df, yr):
    if yr < 2018:
        c = ['sd_resident', 'searched', 'contraband_found', 'property_seized', 'arrested']
        
        return map_bool(c, df)
    
    else:
        df['gend'] = map_sex(df['gend'])
        return df

In [232]:
def clean_age(df, age_col = 'percieved_driver_age'):
    cts = df[age_col].value_counts() 
    bad_age = cts[cts <= 10].index
    return df[~df[age_col].isin(bad_age)]

In [233]:
# need for veil of darkness
def clean_time(df):
    df['time_stop'] = pd.to_datetime(df['time_stop'], format= '%H:%M', errors='coerce')
    return df

### columns we still need to map:

subject_race / race - map to race codes

beat - map to service area

In [58]:
# only for POST- 2018
def map_race(df):
    # engineered from existing file
    # race_codes = 'http://seshat.datasd.org/pd/vehicle_stops_race_codes.csv'
    # and intution

    race_dict = {
     'Asian' : 'A',
     'OTHER ASIAN': 'A',
     'Middle Eastern or South Asian': 'M',
     'BLACK': 'B',
     'CHINESE': 'C',
     'CAMBODIAN': 'D',
     'FILIPINO': 'F',
     'GUAMANIAN': 'G',
     'HISPANIC': 'H',
     'Hispanic/Latino/a': 'H',
     'INDIAN': 'I',
     'JAPANESE': 'J',
     'KOREAN': 'K',
     'LAOTIAN': 'L',
     'OTHER': 'O',
     'PACIFIC ISLANDER': 'P',
     'Pacific Islander': 'P',
     'SAMOAN': 'S',
     'HAWAIIAN': 'U',
     'VIETNAMESE': 'V',
     'WHITE': 'W',
     'White': 'W',
     'ASIAN INDIAN': 'Z',
     'Native American': 'N'
    }
    df['Race Code'] = df['race'].map(race_dict)
    return df

# only for POST- 2018
def map_service_area(df):
    stop_beats = 'http://seshat.datasd.org/sde/pd/pd_beats_datasd.geojson'
    beats = gpd.read_file(stop_beats)
    # get unique beats
    unique_beats = beats[['beat', 'serv']].drop_duplicates('beat')
    beat_dict = dict(zip(unique_beats.beat, unique_beats.serv))
    df['service_area'] = df['beat'].map(beat_dict)
    return df

In [71]:
def rename_cols(df, yr):
    if yr < 2018:
        df = df.rename(columns={'subject_age': 'percieved_driver_age', 'subject_sex': 'driver_sex', 
                                'subject_race': 'driver_race'})
        return df
    else:
        df = df.rename(columns={'perceived_age': 'percieved_driver_age', 'gend': 'driver_sex', 
                                'Race Code': 'driver_race', 'reason_for_stop': 'stop_cause'})
        return df

In [60]:
# Outcome possible values: 

# 'None'
# 'Warning (verbal or written)' 
# 'Search of property was conducted' 
# 'Property was seized' >> property seized
# 'Custodial Arrest without warrant'

def check_outcome(row, year):
    if year < 2018:
        # check 'arrested', 'searched', 'property_seized'
        if row.arrested == 1.0:
            return 'Arrest'
        elif row.property_seized == 1.0:
            return 'Property was seized'
        elif row.searched == 1.0:
            return 'Search of property was conducted'
        return 'Not Applicable'
        
    else:
        a = row.action
        r = row.result
        
        # check worst outcome first
        if r == 'Custodial Arrest without warrant':
            return r
        elif (a == 'Search of property was conducted') | (a == 'Property was seized'):
            return a
        elif r == 'Warning (verbal or written)':
            return r
        
        return 'Not Applicable'
    
def outcome_map(df, yr):
    df_copy = df.copy()
    df_copy['outcome'] = df_copy.apply(lambda x: check_outcome(x, yr), axis=1)
    
    if yr < 2018:
        return df_copy.drop(columns=['arrested', 'property_seized', 'searched', 'obtained_consent', 'contraband_found'])
        
    return df_copy.drop(columns=['action', 'result'])

In [225]:
def pre_2018_format(df, yr):
    df = clean_bool(df, yr)
    df = rename_cols(df, yr)
    df = clean_age(df)
    df = clean_time(df)
    df = outcome_map(df, yr)
    return df

def post_2018_format(df, yr):
    df = clean_bool(df, yr)
    df = map_race(df)
    df = map_service_area(df)
    df = rename_cols(df, yr)
    df = clean_age(df)
    df = clean_time(df)
    df = outcome_map(df, yr)
    return df

In [87]:
def format_df(df, yr):
    if yr < 2018:
        cols = ['stop_id', 'stop_cause', 'date_stop', 'time_stop', 'outcome', 'service_area', 'driver_race', 
                'driver_sex', 'percieved_driver_age', 'sd_resident']
        df = pre_2018_format(df, yr)
        return df[cols]
    
    else:
        cols = ['stop_id', 'pid', 'stop_cause', 'reason_for_stopcode', 'date_stop', 'time_stop', 'stopduration', 
                'outcome', 'beat', 'service_area', 'driver_race', 'driver_sex', 'percieved_driver_age',
                'officer_assignment_key', 'exp_years', 'year']
        df = post_2018_format(df, yr)
        
        # check for last duplicate
        df = df.drop_duplicates(subset = ['stop_id', 'date_stop', 'time_stop', 'beat', 'officer_assignment_key'])
        
        # add year column for future concating / analysis by year
        df['year'] = [yr] * df.shape[0]
    
        return df[cols]

In [67]:
def get_data(yr):
    tbl = get_table(yr)
    if yr < 2018:
        return format_df(tbl, yr)

    else:
        df = merge_data(tbl)
        return format_df(df, yr)

# Part 1

#### Cleaning
Perform an initial EDA to statistically assess the quality of the data and its appropriateness for addressing the problem at hand, justifying data cleaning logic. This will likely address issues with accuracy, precision, and missingness of specific attributes, tying these issues to their possible impact over eventual results.

In [18]:
raw_2015 = get_table(2015)
raw_2015.head()

Unnamed: 0,stop_id,stop_cause,service_area,subject_race,subject_sex,subject_age,date_time,date_stop,time_stop,sd_resident,arrested,searched,obtained_consent,contraband_found,property_seized
0,1191287,Moving Violation,430,W,F,,,2015-01-01,,Y,N,N,,,
1,1191295,Moving Violation,520,B,M,,,2015-01-01,,Y,N,N,,,
2,1191275,Moving Violation,430,B,M,,,2015-01-01,,Y,N,N,,,
3,1191308,Moving Violation,520,W,M,,,2015-01-01,,N,N,N,,,
4,1191285,Moving Violation,430,W,F,,,2015-01-01,,Y,N,N,,,


In [152]:
raw_2018 = get_table(2018)
raw_2018.head()

  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,stop_id,pid,date_stop,time_stop,stopduration,officer_assignment_key,exp_years,beat,perceived_age,gend
0,2443,1,2018-07-01,00:01:37,30,1,10,122,25,1
1,2444,1,2018-07-01,00:03:34,10,1,18,121,25,1
2,2447,1,2018-07-01,00:05:43,15,10,1,822,30,1
3,2447,2,2018-07-01,00:05:43,15,10,1,822,30,2
4,2448,1,2018-07-01,00:19:06,5,1,3,614,23,1


In [20]:
# need to clean data types
raw_2015.dtypes

stop_id              int64
stop_cause          object
service_area        object
subject_race        object
subject_sex         object
subject_age         object
date_time           object
date_stop           object
time_stop           object
sd_resident         object
arrested            object
searched            object
obtained_consent    object
contraband_found    object
property_seized     object
dtype: object

In [146]:
def clean_age(df, age_col):
    cts = df[age_col].value_counts() 
    bad_age = cts[cts <= 10].index
    print(bad_age)
    return df[~df[age_col].isin(bad_age)]

In [147]:
clean_age(raw_2015, 'subject_age').head()

Index(['3', '92', '1', '99', '5', '93', '6', '4', '7', '11', '3_', '2', '14',
       '96', '97', '223', '173', '13', '2005', 'F48', '98', '94', '4_', '180',
       '213', '224', 'f28', 'y', '399', '55Q', '3.7', '8', '`', '1020', '2_',
       '100', '233', 'j', 'f26', '222', '456', 'Y', '701', 'NN', '10', '125',
       '345', 'h', 'f', 'x', '387', '32`', '9', '243'],
      dtype='object')


Unnamed: 0,stop_id,stop_cause,service_area,subject_race,subject_sex,subject_age,date_time,date_stop,time_stop,sd_resident,arrested,searched,obtained_consent,contraband_found,property_seized
0,1191287,Moving Violation,430,W,F,,,2015-01-01,,Y,N,N,,,
1,1191295,Moving Violation,520,B,M,,,2015-01-01,,Y,N,N,,,
2,1191275,Moving Violation,430,B,M,,,2015-01-01,,Y,N,N,,,
3,1191308,Moving Violation,520,W,M,,,2015-01-01,,N,N,N,,,
4,1191285,Moving Violation,430,W,F,,,2015-01-01,,Y,N,N,,,


In [139]:
(t.shape[0] / raw_2015[~raw_2015['subject_age'].isnull()].shape[0]) * 100

0.11907319868213723

In [96]:
# % null
n = raw_2015.isnull().sum()
pd.Series(n.values / raw_2015.shape[0], index = n.index)

stop_id             0.000000
stop_cause          0.000399
service_area        0.000000
subject_race        0.002183
subject_sex         0.001776
subject_age         0.032282
date_time           0.003613
date_stop           0.000000
time_stop           0.002201
sd_resident         0.094895
arrested            0.076545
searched            0.081770
obtained_consent    0.949273
contraband_found    0.946899
property_seized     0.947263
dtype: float64

In [107]:
n2 = raw_2015[(raw_2015['searched'] == 'Y') | (raw_2015['searched'] == 'y')]
n3 = n2.isnull().sum()
n3

stop_id                0
stop_cause             0
service_area           0
subject_race           2
subject_sex            2
subject_age           24
date_time              4
date_stop              0
time_stop              1
sd_resident           66
arrested              25
searched               0
obtained_consent    1790
contraband_found    1623
property_seized     1669
dtype: int64

In [108]:
# % null
pd.Series(n3.values / n2.shape[0], index = n3.index)

stop_id             0.000000
stop_cause          0.000000
service_area        0.000000
subject_race        0.000457
subject_sex         0.000457
subject_age         0.005483
date_time           0.000914
date_stop           0.000000
time_stop           0.000228
sd_resident         0.015079
arrested            0.005712
searched            0.000000
obtained_consent    0.408956
contraband_found    0.370802
property_seized     0.381311
dtype: float64

In [21]:
raw_2015.stop_id.value_counts().sort_values(ascending=False)

1223866    2
1262708    2
1255327    2
1234416    2
1224351    2
          ..
1238307    1
1236260    1
1234213    1
1232166    1
1262205    1
Name: stop_id, Length: 113641, dtype: int64

In [22]:
# cant drop duplicate stop_ids
raw_2015[(raw_2015['stop_id'] == 1281833) | (raw_2015['stop_id'] == 1226448) | (raw_2015['stop_id'] == 1227761)]

Unnamed: 0,stop_id,stop_cause,service_area,subject_race,subject_sex,subject_age,date_time,date_stop,time_stop,sd_resident,arrested,searched,obtained_consent,contraband_found,property_seized
36629,1226448,Moving Violation,810,H,M,35,2015-04-21 09:55:00,2015-04-21,9:55,Y,N,N,,,
37200,1226448,Equipment Violation,110,A,M,28,2015-04-21 19:35:00,2015-04-21,19:35,Y,N,N,,,
38140,1227761,Moving Violation,110,W,F,84,,2015-04-25,0:95,Y,N,N,,,
38220,1227761,Equipment Violation,120,W,F,22,2015-04-25 01:45:00,2015-04-25,1:45,Y,N,N,,,
87918,1281833,Moving Violation,120,W,F,30,2015-09-30 07:05:00,2015-09-30,7:05,Y,N,N,,,
89012,1281833,Moving Violation,440,H,M,20,2015-10-04 01:30:00,2015-10-04,1:30,Y,N,N,,,


In [23]:
# percentage of duplicates
raw_2015.stop_id.value_counts()[raw_2015.stop_id.value_counts() > 1].shape[0] / raw_2015.shape[0]

0.015430333905148065

#### Descriptive Stats
Statistically summarize the relevant, cleaned attributes and derived features (e.g. in univariate and bivariate analyses) for San Diego.

In [234]:
pre = get_data(2015)

In [235]:
pre.columns

Index(['stop_id', 'stop_cause', 'date_stop', 'time_stop', 'outcome',
       'service_area', 'driver_race', 'driver_sex', 'percieved_driver_age',
       'sd_resident'],
      dtype='object')

In [161]:
def get_dist(df, col):
    cur = df[col].value_counts()
    ser = pd.Series(cur.values / df.shape[0], index = cur.index)
    return ser

In [163]:
# univariate analysis: stop_cause, outcome, driver_race, percieved_driver_age, outcome

In [193]:
print(get_dist(pre, 'stop_cause'))

Moving Violation                      0.748531
Equipment Violation                   0.237910
Radio Call/Citizen Contact            0.004306
Muni, County, H&S Code                0.003015
Personal Knowledge/Informant          0.002175
No Cause Specified on a Card          0.001689
Suspect Info (I.S., Bulletin, Log)    0.001378
MUNI, County, H&S Code                0.000130
Personal Observ/Knowledge             0.000104
Other                                 0.000061
none listed                           0.000052
not marked                            0.000035
&Moving Violation                     0.000026
UNI, &County, H&&S Code               0.000026
NOT MARKED                            0.000017
Suspect Info                          0.000017
NOT SPECIFIED                         0.000017
NOTHING MARKED                        0.000017
not noted                             0.000017
Pedestrian                            0.000009
wm23                                  0.000009
not secified 

In [194]:
print(get_dist(pre, 'outcome'))

Not Applicable                      0.959098
Search of property was conducted    0.026407
Arrest                              0.011662
Property was seized                 0.002833
dtype: float64


In [195]:
print(get_dist(pre, 'driver_race'))

W    0.432162
H    0.298427
B    0.107406
O    0.073807
A    0.045139
F    0.013905
V    0.008213
C    0.005606
I    0.005112
K    0.001534
X    0.001274
J    0.001152
P    0.001048
Z    0.000953
S    0.000572
L    0.000546
D    0.000485
G    0.000321
U    0.000156
dtype: float64


In [196]:
print(get_dist(pre, 'percieved_driver_age').sort_values(ascending=False))

25     0.042028
30     0.036865
24     0.034681
23     0.033339
21     0.031788
         ...   
399    0.000009
55Q    0.000009
3.7    0.000009
8      0.000009
243    0.000009
Length: 133, dtype: float64


In [197]:
print(get_dist(pre, 'outcome').sort_values(ascending=False))

Not Applicable                      0.959098
Search of property was conducted    0.026407
Arrest                              0.011662
Property was seized                 0.002833
dtype: float64


In [167]:
# Bivariate Analysis

In [173]:
def get_cond_dist(df, col, cond):
    cur = df.groupby(cond)[col].value_counts()
    ser = pd.Series(cur.values / df.shape[0], index = cur.index)
    return ser

In [184]:
print(pd.DataFrame(get_cond_dist(pre, 'stop_cause', 'driver_race')).loc['W'].rename(columns={0:'White'}))
print(pd.DataFrame(get_cond_dist(pre, 'stop_cause', 'driver_race')).loc['B'].rename(columns={0:'Black'}))

                                       White
stop_cause                                  
Moving Violation                    0.345445
Equipment Violation                 0.080946
Radio Call/Citizen Contact          0.001750
Muni, County, H&S Code              0.001508
No Cause Specified on a Card        0.000884
Personal Knowledge/Informant        0.000832
Suspect Info (I.S., Bulletin, Log)  0.000416
MUNI, County, H&S Code              0.000061
Personal Observ/Knowledge           0.000035
&Moving Violation                   0.000026
none listed                         0.000026
Other                               0.000017
not marked                          0.000017
NOT CHECKED                         0.000009
NOT SPECIFIED                       0.000009
NOTHING MARKED                      0.000009
UNI, &County, H&&S Code             0.000009
not listed                          0.000009
not marked  not marked              0.000009
not noted                           0.000009
          

In [183]:
print(pd.DataFrame(get_cond_dist(pre, 'outcome', 'driver_race')).loc['W'].rename(columns={0:'White'}))
print(pd.DataFrame(get_cond_dist(pre, 'outcome', 'driver_race')).loc['B'].rename(columns={0:'Black'}))

                                     White
outcome                                   
Not Applicable                    0.419669
Search of property was conducted  0.006931
Arrest                            0.004808
Property was seized               0.000754
                                     Black
outcome                                   
Not Applicable                    0.098508
Search of property was conducted  0.006628
Arrest                            0.001871
Property was seized               0.000399


#### Traffic Stop Analysis
Calculate and document the differences in stop rates and post-stop outcomes. The analysis should address possible reasons for such differences (including addressing possible confounders). Additionally: - The significance of these differences should be tested using statistical inference. - These differences should also be calculated across other variables of interest (e.g. service area).

In [185]:
def search_rate(col):
    c = col.where(col == 'Search of property was conducted', 0)
    return sum(c) / col.shape[0]

In [186]:
def search_rate_by_attribute(df, attributes = None):     
    if attributes == None:
        return df.groupby('subject_race')['outcome'].apply(search_rate)
        
    else:
        return df.groupby(['subject_race', attributes])['outcome'].apply(search_rate)

In [188]:
def stop_rate(df, race):
    c = df[df['driver_race'] == race]
    return c.shape[0] / df.shape[0]

In [221]:
pd.DataFrame({'White': stop_rate(pre, 'W'), 'Black': stop_rate(pre, 'B')}, index = outcome_dist(pre, 'W').index).T

Unnamed: 0,Not Applicable,Search of property was conducted,Arrest,Property was seized
White,0.432162,0.432162,0.432162,0.432162
Black,0.107406,0.107406,0.107406,0.107406


In [207]:
def outcome_dist(df, race):
    a1 = df[df['driver_race'] == race]['outcome'].value_counts()
    a2 = df['outcome'].value_counts()
    return pd.Series(np.divide(a1.values, a2.values), index=a1.index)

In [219]:
pd.DataFrame({'White': outcome_dist(pre, 'W').values, 'Black': outcome_dist(pre, 'B').values}, index = outcome_dist(pre, 'W').index).T

Unnamed: 0,Not Applicable,Search of property was conducted,Arrest,Property was seized
White,0.437566,0.262467,0.412333,0.266055
Black,0.102709,0.250984,0.160475,0.140673


In [222]:
%matplotlib inline

import random
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [None]:
# use stat inference to test significance in difference

In [None]:
# calculate difference across othe variables of interest: service_area, age 

In [None]:
# use stat inference to test significance in difference

#### Veil of Darkness
Perform the Veil of Darkness analysis for San Diego. Include an introduction to the technique and interpret the results. (In Assignment #3 you will carefully address the shortcoming of this result.)

In [243]:
def filter_stop_reason(df):
    return df[(df['stop_cause'] == 'Moving Violation') | (df['stop_cause'] == 'Equipment Violation')]    

In [244]:
def get_veil(df):
    inter = df[(df['time_stop'] >=  pd.to_datetime('17:39', format= '%H:%M')) & (df['time_stop'] <= pd.to_datetime('20:59', format= '%H:%M'))]
    inter['time_stop'] = inter['time_stop'].apply(lambda x: x.time())
    
    inter = filter_stop_reason(inter)
    return inter

In [245]:
veil_pre = get_veil(pre)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [246]:
veil_pre.head()

Unnamed: 0,stop_id,stop_cause,date_stop,time_stop,outcome,service_area,driver_race,driver_sex,percieved_driver_age,sd_resident
199,1189897,Equipment Violation,2015-01-01,17:40:00,Property was seized,110,A,M,63,1.0
200,1189883,Equipment Violation,2015-01-01,17:40:00,Not Applicable,720,H,F,41,0.0
201,1189886,Moving Violation,2015-01-01,17:45:00,Not Applicable,720,H,M,76,0.0
202,1189882,Moving Violation,2015-01-01,17:45:00,Not Applicable,110,W,F,45,1.0
203,1189887,Moving Violation,2015-01-01,18:00:00,Not Applicable,720,W,M,25,0.0


In [None]:
# looking for average treatment affect

In [255]:
def veil_race(df, races=['W', 'B']):
    d = df[df['driver_race'].isin(races)]
    d['minority'] = d['driver_race'].apply(lambda x: 0 if x == 'W' else 1)
    return d

In [256]:
v = veil_race(veil_pre)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [266]:
# need total population for minority and non- minority
w_pop = 10
b_pop = 2

In [267]:
# Stop rates by minority vs not minority:
stop_rate(v, 'W') / w_pop

0.0771742356782428

In [268]:
stop_rate(v, 'B') / b_pop

0.11412882160878599

# Part 2

Develop code to clean data (as defined and justified in Part 1), create the features for the replication, and compute the statistics for the report. Such code should conform to the methodology portion of the course (e.g. using the project template).

In particular, your project should have a run.py with the following targets:

* data creates the data needed for analysis.
* process cleans and prepares the data for analysis (e.g. cleaning and feature creation).
* data-test ingests a small amount of test data (that process can then process).