In [1]:
## imports
import pandas as pd
import numpy as np
# import plotnine
# from plotnine import *
import random

## print multiple things from same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from datetime import datetime, timedelta

## Load data

In [2]:
## load data on 2020 crimes in DC
df = dc_crim_2020 = pd.read_csv("https://opendata.arcgis.com/datasets/f516e0dd7b614b088ad781b0c4002331_2.csv")

## create report_dt column
df['report_dt'] = pd.to_datetime(df.REPORT_DAT)

## Warm-up Demo

In [3]:
%%time
for i in range(df.shape[0]):
    r = df.iloc[i]
    r.X + r.Y

CPU times: user 1.71 s, sys: 7.93 ms, total: 1.72 s
Wall time: 1.72 s


In [4]:
%%time
for i,r in df.iterrows():
    r.X + r.Y

CPU times: user 1.05 s, sys: 9.86 ms, total: 1.06 s
Wall time: 1.06 s


In [5]:
%%time
df.apply(lambda r: r.X + r.Y, axis = 1)

CPU times: user 284 ms, sys: 8.16 ms, total: 292 ms
Wall time: 293 ms


0       -38.070933
1       -38.109483
2       -38.135099
3       -38.132757
4       -38.111191
           ...    
27928   -38.047079
27929   -38.129184
27930   -38.138364
27931   -38.070160
27932   -38.091576
Length: 27933, dtype: float64

In [6]:
%%time
## Super fast, but only works with built-in numpy functions.
df.X + df.Y

CPU times: user 4.05 ms, sys: 2.55 ms, total: 6.6 ms
Wall time: 9.88 ms


0       -38.070933
1       -38.109483
2       -38.135099
3       -38.132757
4       -38.111191
           ...    
27928   -38.047079
27929   -38.129184
27930   -38.138364
27931   -38.070160
27932   -38.091576
Length: 27933, dtype: float64

# Practice time

In [7]:
## define crimes to look for and crimes to look within
## CCN is Central Complaint Number: https://go.mpdconline.com/GO/GO_401_01.pdf
CCN_examples = ['20165648', '20123250']
C_Tar = crimes_lookfor = df[df.CCN.astype(str).isin(CCN_examples)][['CCN', 'WARD', 'OFFENSE', 'report_dt']]
C_Oth = other_crimes = df[~df.CCN.astype(str).isin(CCN_examples)]

## print crimes_lookfor
C_Tar.head()
# other_crimes.head()

Unnamed: 0,CCN,WARD,OFFENSE,report_dt
25399,20123250,2,MOTOR VEHICLE THEFT,2020-08-29 05:00:25+00:00
25672,20165648,6,MOTOR VEHICLE THEFT,2020-11-20 02:25:50+00:00


**Task**: we have two crimes we want to look for. We want to look in the remaining crime reports for crime reports that are:

- Located in the same ward as the two focal crimes
- Reported at the same time as the focal crime or up to 1000 minutes later (changed from slides which stated 20 mins since crime ids changed since last time so this long bandwidth helps us find matches!)

Solutions compare two ways to solve:

- Using a for loop
- Using a function

## 1. Loop approach

In [8]:
## create empty container to store results 
store_matches = {}

## loop through two example crimes
for i in range(C_Tar.shape[0]): # same as 
    
    ## extract row
    r = one_row = C_Tar.iloc[i]

    ## first, subset to crimes in same ward
    same_wards = C_Oth[C_Oth.WARD == r.WARD]
    
    ## second, with those same-ward crimes, construct indicator for reported within 20 minutes
    ## (interpreting as after but could do either)
    ### substep: get time cutoff
    CUTOFF = r.report_dt +  timedelta(minutes=1200)
    
    ### substep: use that to subset
    same_wards_sametime = same_wards[(same_wards.report_dt >= r.report_dt) & 
                                    (same_wards.report_dt <= CUTOFF)].copy()
    
    ## third, store the results
    store_matches[str(one_row.CCN)] = same_wards_sametime
    
## finally, concatenate results into one df
all_matches = pd.concat(store_matches)
all_matches.head()

Unnamed: 0,Unnamed: 1,X,Y,CCN,REPORT_DAT,SHIFT,METHOD,OFFENSE,BLOCK,XBLOCK,YBLOCK,...,CENSUS_TRACT,VOTING_PRECINCT,LATITUDE,LONGITUDE,BID,START_DATE,END_DATE,OBJECTID,OCTO_RECORD_ID,report_dt
20123250,13714,-77.027565,38.897353,20123609,2020/08/30 00:05:52+00,EVENING,OTHERS,MOTOR VEHICLE THEFT,1100 - 1199 BLOCK OF F STREET NW,397609.0,136611.0,...,5802.0,Precinct 129,38.897346,-77.027563,DOWNTOWN,2020/08/29 23:08:57+00,,405870634,,2020-08-30 00:05:52+00:00
20123250,20603,-77.040091,38.909646,20123389,2020/08/29 16:05:18+00,DAY,OTHERS,THEFT F/AUTO,1700 - 1799 BLOCK OF P STREET NW,396523.0,137976.0,...,5303.0,Precinct 15,38.909638,-77.040089,,2020/08/28 22:00:23+00,2020/08/29 08:00:27+00,405999410,,2020-08-29 16:05:18+00:00
20123250,21350,-77.039824,38.905656,20123507,2020/08/29 22:04:46+00,EVENING,OTHERS,MOTOR VEHICLE THEFT,1700 - 1779 BLOCK OF M STREET NW,396546.0,137533.0,...,10700.0,Precinct 17,38.905648,-77.039822,GOLDEN TRIANGLE,2020/08/27 19:01:24+00,2020/08/29 19:00:05+00,406001970,,2020-08-29 22:04:46+00:00
20123250,24343,-77.021929,38.899129,20123419,2020/08/29 17:15:19+00,DAY,OTHERS,THEFT/OTHER,700 - 799 BLOCK OF 7TH STREET NW,398098.0,136808.0,...,5801.0,Precinct 129,38.899121,-77.021926,DOWNTOWN,2020/08/29 16:05:40+00,2020/08/29 16:08:33+00,406055554,,2020-08-29 17:15:19+00:00
20123250,26745,-77.050528,38.913354,20123422,2020/08/29 16:45:57+00,DAY,OTHERS,THEFT F/AUTO,2200 - 2399 BLOCK OF DECATUR PLACE NW,395618.0,138388.0,...,4100.0,Precinct 13,38.913346,-77.050526,,2020/08/26 22:00:29+00,2020/08/27 12:00:51+00,406070569,,2020-08-29 16:45:57+00:00


# 1.5 Iterrow Approach

In [9]:
## create empty container to store results 
store_matches = {}

## loop through two example crimes
for i, r in C_Tar.iterrows(): # same as 

    ## first, subset to crimes in same ward
    same_wards = C_Oth[C_Oth.WARD == r.WARD]
    
    ## second, with those same-ward crimes, construct indicator for reported within 20 minutes
    ## (interpreting as after but could do either)
    ### substep: get time cutoff
    CUTOFF = r.report_dt +  timedelta(minutes=1200)
    
    ### substep: use that to subset
    same_wards_sametime = same_wards[(same_wards.report_dt >= r.report_dt) & 
                                    (same_wards.report_dt <= CUTOFF)].copy()
    
    ## third, store the results
    store_matches[str(one_row.CCN)] = same_wards_sametime
    
## finally, concatenate results into one df
all_matches = pd.concat(store_matches)
all_matches.head()

Unnamed: 0,Unnamed: 1,X,Y,CCN,REPORT_DAT,SHIFT,METHOD,OFFENSE,BLOCK,XBLOCK,YBLOCK,...,CENSUS_TRACT,VOTING_PRECINCT,LATITUDE,LONGITUDE,BID,START_DATE,END_DATE,OBJECTID,OCTO_RECORD_ID,report_dt
20165648,41,-76.997328,38.885141,20165798,2020/11/20 12:46:32+00,DAY,OTHERS,THEFT/OTHER,600 - 669 BLOCK OF PENNSYLVANIA AVENUE SE,400232.0,135255.0,...,6500.0,Precinct 89,38.885133,-76.997326,CAPITOL HILL,2020/11/19 23:43:15+00,,405554646,,2020-11-20 12:46:32+00:00
20165648,7028,-77.001316,38.898915,20165932,2020/11/20 18:56:18+00,DAY,OTHERS,THEFT F/AUTO,300 - 399 BLOCK OF G STREET NE,399886.0,136784.0,...,8301.0,Precinct 83,38.898907,-77.001314,,2020/11/20 15:30:02+00,2020/11/20 18:25:35+00,405825557,,2020-11-20 18:56:18+00:00
20165648,7365,-76.997316,38.904969,20165803,2020/11/20 14:45:06+00,DAY,OTHERS,THEFT F/AUTO,600 - 699 BLOCK OF ORLEANS PLACE NE,400233.0,137456.0,...,10602.0,Precinct 83,38.904961,-76.997314,,2020/11/19 23:45:48+00,2020/11/20 03:00:00+00,405826117,,2020-11-20 14:45:06+00:00
20165648,11720,-76.994365,38.900203,20165859,2020/11/20 15:37:59+00,DAY,OTHERS,THEFT/OTHER,800 - 899 BLOCK OF H STREET NE,400489.0,136927.0,...,8402.0,Precinct 82,38.900195,-76.994363,,2020/11/13 22:00:23+00,2020/11/14 00:00:13+00,405866472,,2020-11-20 15:37:59+00:00
20165648,20168,-77.005894,38.905167,20165986,2020/11/20 22:17:27+00,EVENING,OTHERS,MOTOR VEHICLE THEFT,1151 - 1199 BLOCK OF 1ST STREET NE,399489.0,137478.0,...,10603.0,Precinct 144,38.905159,-77.005891,NOMA,2020/11/20 20:15:26+00,2020/11/20 21:46:24+00,405913546,,2020-11-20 22:17:27+00:00


## 2. Function approach

Practice rewriting the above loop as a function

### 2.1 define the function

In [10]:
store_matches_2 = {}

def find_related_crimes(r): # imagine the function taking in one row as its sole variable
    ## first, subset to crimes in same ward
    same_wards = C_Oth[C_Oth.WARD == r.WARD]
    
    ## second, with those same-ward crimes, construct indicator for reported within 20 minutes
    ## (interpreting as after but could do either)
    ### substep: get time cutoff
    CUTOFF = r.report_dt +  timedelta(minutes=1200)
    
    ### substep: use that to subset
    same_wards_sametime = same_wards[(same_wards.report_dt >= r.report_dt) & 
                                    (same_wards.report_dt <= CUTOFF)].copy()
    
    ## third, store the results
    store_matches_2[str(r.CCN)] = same_wards_sametime

### 2.2 apply it to one of the focal crimes

In [11]:
find_related_crimes(C_Tar.iloc[0])
all_matches = pd.concat(store_matches_2)
all_matches

Unnamed: 0,Unnamed: 1,X,Y,CCN,REPORT_DAT,SHIFT,METHOD,OFFENSE,BLOCK,XBLOCK,YBLOCK,...,CENSUS_TRACT,VOTING_PRECINCT,LATITUDE,LONGITUDE,BID,START_DATE,END_DATE,OBJECTID,OCTO_RECORD_ID,report_dt
20123250,13714,-77.027565,38.897353,20123609,2020/08/30 00:05:52+00,EVENING,OTHERS,MOTOR VEHICLE THEFT,1100 - 1199 BLOCK OF F STREET NW,397609.0,136611.0,...,5802.0,Precinct 129,38.897346,-77.027563,DOWNTOWN,2020/08/29 23:08:57+00,,405870634,,2020-08-30 00:05:52+00:00
20123250,20603,-77.040091,38.909646,20123389,2020/08/29 16:05:18+00,DAY,OTHERS,THEFT F/AUTO,1700 - 1799 BLOCK OF P STREET NW,396523.0,137976.0,...,5303.0,Precinct 15,38.909638,-77.040089,,2020/08/28 22:00:23+00,2020/08/29 08:00:27+00,405999410,,2020-08-29 16:05:18+00:00
20123250,21350,-77.039824,38.905656,20123507,2020/08/29 22:04:46+00,EVENING,OTHERS,MOTOR VEHICLE THEFT,1700 - 1779 BLOCK OF M STREET NW,396546.0,137533.0,...,10700.0,Precinct 17,38.905648,-77.039822,GOLDEN TRIANGLE,2020/08/27 19:01:24+00,2020/08/29 19:00:05+00,406001970,,2020-08-29 22:04:46+00:00
20123250,24343,-77.021929,38.899129,20123419,2020/08/29 17:15:19+00,DAY,OTHERS,THEFT/OTHER,700 - 799 BLOCK OF 7TH STREET NW,398098.0,136808.0,...,5801.0,Precinct 129,38.899121,-77.021926,DOWNTOWN,2020/08/29 16:05:40+00,2020/08/29 16:08:33+00,406055554,,2020-08-29 17:15:19+00:00
20123250,26745,-77.050528,38.913354,20123422,2020/08/29 16:45:57+00,DAY,OTHERS,THEFT F/AUTO,2200 - 2399 BLOCK OF DECATUR PLACE NW,395618.0,138388.0,...,4100.0,Precinct 13,38.913346,-77.050526,,2020/08/26 22:00:29+00,2020/08/27 12:00:51+00,406070569,,2020-08-29 16:45:57+00:00
20123250,27629,-77.038491,38.913727,20401318,2020/08/29 14:29:59+00,DAY,OTHERS,THEFT/OTHER,1724 - 1799 BLOCK OF 17TH STREET NW,396662.0,138429.0,...,5302.0,Precinct 15,38.91372,-77.038489,,2020/08/28 20:55:00+00,2020/08/28 21:05:00+00,406076278,,2020-08-29 14:29:59+00:00


### 2.3 Use apply to cover all the other focal crimes

In [12]:
C_Tar.apply(find_related_crimes, axis=1)
all_matches = pd.concat(store_matches_2)
all_matches

25399    None
25672    None
dtype: object

Unnamed: 0,Unnamed: 1,X,Y,CCN,REPORT_DAT,SHIFT,METHOD,OFFENSE,BLOCK,XBLOCK,YBLOCK,...,CENSUS_TRACT,VOTING_PRECINCT,LATITUDE,LONGITUDE,BID,START_DATE,END_DATE,OBJECTID,OCTO_RECORD_ID,report_dt
20123250,13714,-77.027565,38.897353,20123609,2020/08/30 00:05:52+00,EVENING,OTHERS,MOTOR VEHICLE THEFT,1100 - 1199 BLOCK OF F STREET NW,397609.0,136611.0,...,5802.0,Precinct 129,38.897346,-77.027563,DOWNTOWN,2020/08/29 23:08:57+00,,405870634,,2020-08-30 00:05:52+00:00
20123250,20603,-77.040091,38.909646,20123389,2020/08/29 16:05:18+00,DAY,OTHERS,THEFT F/AUTO,1700 - 1799 BLOCK OF P STREET NW,396523.0,137976.0,...,5303.0,Precinct 15,38.909638,-77.040089,,2020/08/28 22:00:23+00,2020/08/29 08:00:27+00,405999410,,2020-08-29 16:05:18+00:00
20123250,21350,-77.039824,38.905656,20123507,2020/08/29 22:04:46+00,EVENING,OTHERS,MOTOR VEHICLE THEFT,1700 - 1779 BLOCK OF M STREET NW,396546.0,137533.0,...,10700.0,Precinct 17,38.905648,-77.039822,GOLDEN TRIANGLE,2020/08/27 19:01:24+00,2020/08/29 19:00:05+00,406001970,,2020-08-29 22:04:46+00:00
20123250,24343,-77.021929,38.899129,20123419,2020/08/29 17:15:19+00,DAY,OTHERS,THEFT/OTHER,700 - 799 BLOCK OF 7TH STREET NW,398098.0,136808.0,...,5801.0,Precinct 129,38.899121,-77.021926,DOWNTOWN,2020/08/29 16:05:40+00,2020/08/29 16:08:33+00,406055554,,2020-08-29 17:15:19+00:00
20123250,26745,-77.050528,38.913354,20123422,2020/08/29 16:45:57+00,DAY,OTHERS,THEFT F/AUTO,2200 - 2399 BLOCK OF DECATUR PLACE NW,395618.0,138388.0,...,4100.0,Precinct 13,38.913346,-77.050526,,2020/08/26 22:00:29+00,2020/08/27 12:00:51+00,406070569,,2020-08-29 16:45:57+00:00
20123250,27629,-77.038491,38.913727,20401318,2020/08/29 14:29:59+00,DAY,OTHERS,THEFT/OTHER,1724 - 1799 BLOCK OF 17TH STREET NW,396662.0,138429.0,...,5302.0,Precinct 15,38.91372,-77.038489,,2020/08/28 20:55:00+00,2020/08/28 21:05:00+00,406076278,,2020-08-29 14:29:59+00:00
20165648,41,-76.997328,38.885141,20165798,2020/11/20 12:46:32+00,DAY,OTHERS,THEFT/OTHER,600 - 669 BLOCK OF PENNSYLVANIA AVENUE SE,400232.0,135255.0,...,6500.0,Precinct 89,38.885133,-76.997326,CAPITOL HILL,2020/11/19 23:43:15+00,,405554646,,2020-11-20 12:46:32+00:00
20165648,7028,-77.001316,38.898915,20165932,2020/11/20 18:56:18+00,DAY,OTHERS,THEFT F/AUTO,300 - 399 BLOCK OF G STREET NE,399886.0,136784.0,...,8301.0,Precinct 83,38.898907,-77.001314,,2020/11/20 15:30:02+00,2020/11/20 18:25:35+00,405825557,,2020-11-20 18:56:18+00:00
20165648,7365,-76.997316,38.904969,20165803,2020/11/20 14:45:06+00,DAY,OTHERS,THEFT F/AUTO,600 - 699 BLOCK OF ORLEANS PLACE NE,400233.0,137456.0,...,10602.0,Precinct 83,38.904961,-76.997314,,2020/11/19 23:45:48+00,2020/11/20 03:00:00+00,405826117,,2020-11-20 14:45:06+00:00
20165648,11720,-76.994365,38.900203,20165859,2020/11/20 15:37:59+00,DAY,OTHERS,THEFT/OTHER,800 - 899 BLOCK OF H STREET NE,400489.0,136927.0,...,8402.0,Precinct 82,38.900195,-76.994363,,2020/11/13 22:00:23+00,2020/11/14 00:00:13+00,405866472,,2020-11-20 15:37:59+00:00
