In [182]:
import datetime
import itertools
import json
import numpy as np
import os
import pandas as pd
from pandas_helpers import *
import pickle
import pprint
import random
import re
import requests
import seaborn
import sys
import time

pd.set_option('display.precision', 10)

DATA_DIR = 'data/citydata/crime'

random.seed(90210)

In [183]:
from pyproj import Proj

In [184]:
def combine_dates(row):
    # assumes a date column existings, adds a start_date column with unix timestamp
    row['start_date'] = int((row['date'] - datetime.datetime(1970, 1, 1)).total_seconds() * 1000)
    row['date'] = '{d.year}-{d.month}-{d.day}'.format(d=row['date'])
    
    return row

# Boston Crime Data

## Boston Crime (Entity Analysis)

In [251]:
df = pd.read_csv(os.path.join(DATA_DIR, 'boston', 'boston_crime_2011_to_2014.csv'),
                 parse_dates=['FROMDATE'],
                 usecols=['COMPNOS', 'INCIDENT_TYPE_DESCRIPTION', 'FROMDATE', 'WEAPONTYPE', 'Location'])

In [252]:
df = df[df['FROMDATE'].dt.year >= 2012]
df.head()

Unnamed: 0,COMPNOS,INCIDENT_TYPE_DESCRIPTION,FROMDATE,WEAPONTYPE,Location
10877,142004841,FRAUD,2012-01-01,Unarmed,"(42.3594, -71.0587)"
10878,140244631,FRAUD,2012-01-01,Unarmed,"(42.36022134, -71.06596456)"
10879,140111460,FRAUD,2012-01-01,Unarmed,"(42.25185683, -71.1310008)"
10880,140056694,FRAUD,2012-01-01,Unarmed,"(42.37925634, -71.06021456)"
10881,130756502,FRAUD,2012-01-01,Unarmed,"(42.35057634, -71.07274456)"


In [253]:
df = df.set_index('COMPNOS')

df['FROMDATE'] = df['FROMDATE'].map(lambda dt: int((dt - datetime.datetime(1970, 1, 1)).total_seconds() * 1000))

In [254]:
df.head()

Unnamed: 0_level_0,INCIDENT_TYPE_DESCRIPTION,FROMDATE,WEAPONTYPE,Location
COMPNOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
142004841,FRAUD,1325376000000,Unarmed,"(42.3594, -71.0587)"
140244631,FRAUD,1325376000000,Unarmed,"(42.36022134, -71.06596456)"
140111460,FRAUD,1325376000000,Unarmed,"(42.25185683, -71.1310008)"
140056694,FRAUD,1325376000000,Unarmed,"(42.37925634, -71.06021456)"
130756502,FRAUD,1325376000000,Unarmed,"(42.35057634, -71.07274456)"


In [255]:
def parse_loc(row):
    lat, lon = map(str.split, row['Location'].strip('()').split(','))
    
    row['lat'] = lat[0]
    row['lon'] = lon[0]
    
    return row

df = df.apply(parse_loc, axis=1)

In [256]:
df.head()

Unnamed: 0_level_0,INCIDENT_TYPE_DESCRIPTION,FROMDATE,WEAPONTYPE,Location,lat,lon
COMPNOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
142004841,FRAUD,1325376000000,Unarmed,"(42.3594, -71.0587)",42.3594,-71.0587
140244631,FRAUD,1325376000000,Unarmed,"(42.36022134, -71.06596456)",42.36022134,-71.06596456
140111460,FRAUD,1325376000000,Unarmed,"(42.25185683, -71.1310008)",42.25185683,-71.1310008
140056694,FRAUD,1325376000000,Unarmed,"(42.37925634, -71.06021456)",42.37925634,-71.06021456
130756502,FRAUD,1325376000000,Unarmed,"(42.35057634, -71.07274456)",42.35057634,-71.07274456


In [257]:
len(df)

307446

In [258]:
df['WEAPONTYPE'].replace({'Other': 'Armed: Other',
                          'Unarmed': ''}, inplace=True)

In [259]:
df.head()

Unnamed: 0_level_0,INCIDENT_TYPE_DESCRIPTION,FROMDATE,WEAPONTYPE,Location,lat,lon
COMPNOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
142004841,FRAUD,1325376000000,,"(42.3594, -71.0587)",42.3594,-71.0587
140244631,FRAUD,1325376000000,,"(42.36022134, -71.06596456)",42.36022134,-71.06596456
140111460,FRAUD,1325376000000,,"(42.25185683, -71.1310008)",42.25185683,-71.1310008
140056694,FRAUD,1325376000000,,"(42.37925634, -71.06021456)",42.37925634,-71.06021456
130756502,FRAUD,1325376000000,,"(42.35057634, -71.07274456)",42.35057634,-71.07274456


In [260]:
df['description'] = ''

def smush(row):
    row['description'] = '{} '.format(row['INCIDENT_TYPE_DESCRIPTION'])
    
    if row['WEAPONTYPE']:
        row['description'] += '[{}]'.format(row['WEAPONTYPE'])
        
    row['description'] = row['description'].strip()
    
    return row

df = df.apply(smush, axis=1)

In [261]:
try:
    df.drop(['INCIDENT_TYPE_DESCRIPTION', 'WEAPONTYPE', 'Location'],
           axis=1,
           inplace=True)
except ValueError:
    pass

In [262]:
df.head()

Unnamed: 0_level_0,FROMDATE,lat,lon,description
COMPNOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
142004841,1325376000000,42.3594,-71.0587,FRAUD
140244631,1325376000000,42.36022134,-71.06596456,FRAUD
140111460,1325376000000,42.25185683,-71.1310008,FRAUD
140056694,1325376000000,42.37925634,-71.06021456,FRAUD
130756502,1325376000000,42.35057634,-71.07274456,FRAUD


In [263]:
df = df_rearrange_columns(df, 
                          ['start_date', 'latitude', 'longitude', 'description'],
                          ['start_date', 'latitude', 'longitude', 'description'])

In [264]:
df.head()

Unnamed: 0_level_0,start_date,latitude,longitude,description
COMPNOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
142004841,1325376000000,42.3594,-71.0587,FRAUD
140244631,1325376000000,42.36022134,-71.06596456,FRAUD
140111460,1325376000000,42.25185683,-71.1310008,FRAUD
140056694,1325376000000,42.37925634,-71.06021456,FRAUD
130756502,1325376000000,42.35057634,-71.07274456,FRAUD


In [265]:
df_to_json_split_wo_index(df,
                          'boston_crime_entity_data.json',
                          ['Start Date (ms from epoch)',
                           'Latitude of Crime', 
                           'Longitude of Crime',
                           'Description of Crime (with Weapon Type)'])

In [266]:
len(df)

307446

## Boston Crime (Time Series)

In [332]:
df = pd.read_csv(os.path.join(DATA_DIR, 'boston', 'boston_crime_2011_to_2014.csv'), 
                 parse_dates=['FROMDATE'],
                 usecols=['COMPNOS', 'NatureCode', 'INCIDENT_TYPE_DESCRIPTION', 'MAIN_CRIMECODE', 'FROMDATE',
                          'Shooting', 'DOMESTIC', 'Location'],
                 index_col='COMPNOS')

In [333]:
df.head()

Unnamed: 0_level_0,NatureCode,INCIDENT_TYPE_DESCRIPTION,MAIN_CRIMECODE,FROMDATE,Shooting,DOMESTIC,Location
COMPNOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
110650363,IVMV,Plates,Plates,2011-11-21 05:45:00,No,No,"(42.3594, -71.0587)"
110637957,VANRPT,VANDALISM,14xx,2011-11-21 05:45:00,No,No,"(42.34448635, -71.07217456)"
110638477,IVMV,LARCENY FROM MOTOR VEHICLE,06MV,2011-11-21 06:00:00,No,No,"(42.35246135, -71.12767954)"
110637659,ILLPRK,TOWED,TOWED,2011-11-21 06:00:00,No,No,"(42.33637135, -71.05237956)"
110637584,ARREST,VAL,VAL,2011-11-21 06:10:00,No,No,"(42.31535135, -71.07842956)"


In [334]:
# Shooting and domestic are the only columns that might show an interesting grouping...
# the others have 60+ unique values

In [335]:
df.drop(['INCIDENT_TYPE_DESCRIPTION', 'NatureCode', 'MAIN_CRIMECODE'], axis=1, inplace=True)

In [336]:
df = pd.read_csv(os.path.join(DATA_DIR, 'boston', 'boston_crime_2011_to_2014.csv'), 
                 parse_dates=['FROMDATE'],
                 usecols=['COMPNOS', 'FROMDATE', 'Shooting', 'DOMESTIC', 'Location'],
                 index_col='COMPNOS')

In [337]:
# only consider crimes taking place from 2012 onwards
df = df[df['FROMDATE'].dt.date >= datetime.date(2012, 01, 01)]

In [338]:
df.head(10)

Unnamed: 0_level_0,FROMDATE,Shooting,DOMESTIC,Location
COMPNOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
142004841,2012-01-01,No,No,"(42.3594, -71.0587)"
140244631,2012-01-01,No,No,"(42.36022134, -71.06596456)"
140111460,2012-01-01,No,No,"(42.25185683, -71.1310008)"
140056694,2012-01-01,No,No,"(42.37925634, -71.06021456)"
130756502,2012-01-01,No,No,"(42.35057634, -71.07274456)"
130687431,2012-01-01,No,No,"(42.33342135, -71.08622455)"
130105439,2012-01-01,No,No,"(42.31586601, -71.08511121)"
130060448,2012-01-01,No,No,"(42.3594, -71.0587)"
120813267,2012-01-01,No,No,"(42.26408636, -71.10334456)"
120807305,2012-01-01,No,No,"(42.3594, -71.0587)"


In [339]:
# Overall statistics of dataset for later sanity checks

print '{:d} crimes reported.'.format(len(df))
print '{:d} are domestic, {:d} are shootings, {:d} are both.'.format(len(df[df['DOMESTIC'] == 'Yes']),
                                                                     len(df[df['Shooting'] == 'Yes']),
                                                                     len(df[(df['DOMESTIC'] == 'Yes') & 
                                                                            (df['Shooting'] == 'Yes')]))

307446 crimes reported.
46006 are domestic, 673 are shootings, 2 are both.


In [378]:
df.groupby([df['FROMDATE'].dt.year]).count()['FROMDATE']

2012    103837
2013    102578
2014    101031
Name: FROMDATE, dtype: int64

In [340]:
with pd.option_context('display.max_rows', 999):
    print df

                     FROMDATE Shooting DOMESTIC                     Location
COMPNOS                                                                     
142004841 2012-01-01 00:00:00       No       No          (42.3594, -71.0587)
140244631 2012-01-01 00:00:00       No       No  (42.36022134, -71.06596456)
140111460 2012-01-01 00:00:00       No       No   (42.25185683, -71.1310008)
140056694 2012-01-01 00:00:00       No       No  (42.37925634, -71.06021456)
130756502 2012-01-01 00:00:00       No       No  (42.35057634, -71.07274456)
130687431 2012-01-01 00:00:00       No       No  (42.33342135, -71.08622455)
130105439 2012-01-01 00:00:00       No       No  (42.31586601, -71.08511121)
130060448 2012-01-01 00:00:00       No       No          (42.3594, -71.0587)
120813267 2012-01-01 00:00:00       No       No  (42.26408636, -71.10334456)
120807305 2012-01-01 00:00:00       No       No          (42.3594, -71.0587)
120645422 2012-01-01 00:00:00       No       No    (42.27746141, -71.119355)

In [363]:
data = df.groupby([df['FROMDATE'].dt.year,
                   df['FROMDATE'].dt.month,
                   df['FROMDATE'].dt.day,
                   df['FROMDATE'].dt.hour]).count()['FROMDATE']

In [364]:
data = data.reset_index()
data.columns = ['Year', 'Month', 'Day', 'Hour', 'Events']

In [365]:
assert data['Events'].sum() == 307446 # Make sure events by hour is the same amount of events in the original dataset

In [366]:
data.head()

Unnamed: 0,Year,Month,Day,Hour,Events
0,2012,1,1,0,42
1,2012,1,1,1,32
2,2012,1,1,2,28
3,2012,1,1,3,21
4,2012,1,1,4,16


In [367]:
data['date'] = pd.to_datetime(data['Year'].astype(str) + ' ' + data['Month'].astype(str) + ' ' + data['Day'].astype(str) +
                             ' ' + data['Hour'].astype(str) + ':00:00')

In [368]:
data.head()

Unnamed: 0,Year,Month,Day,Hour,Events,date
0,2012,1,1,0,42,2012-01-01 00:00:00
1,2012,1,1,1,32,2012-01-01 01:00:00
2,2012,1,1,2,28,2012-01-01 02:00:00
3,2012,1,1,3,21,2012-01-01 03:00:00
4,2012,1,1,4,16,2012-01-01 04:00:00


In [369]:
try:
    data.drop(['Year', 'Month', 'Day', 'Hour'], axis=1, inplace=True)
except ValueError:
    pass

In [370]:
data.head()

Unnamed: 0,Events,date
0,42,2012-01-01 00:00:00
1,32,2012-01-01 01:00:00
2,28,2012-01-01 02:00:00
3,21,2012-01-01 03:00:00
4,16,2012-01-01 04:00:00


In [371]:
data = data.apply(combine_dates, axis=1)

In [372]:
data.head()

Unnamed: 0,Events,date,start_date
0,42,2012-1-1,1325376000000
1,32,2012-1-1,1325379600000
2,28,2012-1-1,1325383200000
3,21,2012-1-1,1325386800000
4,16,2012-1-1,1325390400000


In [373]:
data.columns = ['events', 'date', 'start_date']
data = data.ix[:, ['date', 'start_date', 'events']]
data.head()

Unnamed: 0,date,start_date,events
0,2012-1-1,1325376000000,42
1,2012-1-1,1325379600000,32
2,2012-1-1,1325383200000,28
3,2012-1-1,1325386800000,21
4,2012-1-1,1325390400000,16


In [376]:
assert data['events'].sum() == 307446

In [386]:
df_to_json_split_wo_index(data,
                          'boston_crime_data.json', 
                          ['Date of Event', 'Hour of Event (ms from epoch)', 'Number of Events'])

In [620]:
# Sanity Checks
with open('boston_crime_data.json', 'rb') as infile:
    data = json.load(infile)
    print sum([x[2] for x in data['data']])

307446


# DC Crime Data

## DC Crime (Entity Analysis)

In [833]:
df = df_orig = df_csv_from_dir(os.path.join(DATA_DIR, 'dc'), 
                               parse_dates=['START_DATE', 'END_DATE', 'REPORT_DAT'])

In [834]:
df.head()

Unnamed: 0,REPORT_DAT,SHIFT,OFFENSE,METHOD,BLOCK,DISTRICT,PSA,WARD,ANC,NEIGHBORHOOD_CLUSTER,BLOCK_GROUP,CENSUS_TRACT,VOTING_PRECINCT,CCN,XBLOCK,YBLOCK,START_DATE,END_DATE
0,2014-11-29 14:20:00,DAY,ROBBERY,GUN,2000 - 2069 BLOCK OF BRYANT STREET NE,5,505,5,5C,Cluster 22,011100 3,11100,Precinct 72,14184429,402097.35,139269.9,2014-11-28 18:30:00,2014-11-29 14:00:00
1,2015-01-19 14:46:00,DAY,ROBBERY,OTHERS,1700 - 1728 BLOCK OF MONTELLO AVENUE NE,5,506,5,5D,Cluster 23,008804 2,8804,Precinct 78,15009223,401248.0,137937.0,2015-01-19 14:34:00,2015-01-19 14:46:00
2,2014-08-19 20:09:00,EVENING,BURGLARY,OTHERS,1706 - 1799 BLOCK OF 2ND STREET NW,5,501,5,5E,Cluster 21,003400 2,3400,Precinct 19,14125361,398774.0,138427.0,2014-08-19 09:00:00,2014-08-19 20:09:00
3,2014-08-30 05:40:00,MIDNIGHT,THEFT/OTHER,OTHERS,3000 - 3133 BLOCK OF RHODE ISLAND AVENUE NE,5,503,5,5C,Cluster 24,011100 2,11100,Precinct 69,14131597,403053.0,140739.0,2014-08-30 04:51:00,2014-08-30 04:58:00
4,2014-07-02 02:56:00,MIDNIGHT,THEFT/OTHER,OTHERS,2500 - 2598 BLOCK OF BENNING ROAD NE,5,507,7,7D,Cluster 25,007903 1,7903,Precinct 80,14096750,402530.0,136674.0,2014-07-02 02:54:00,2014-07-02 02:54:00


In [836]:
pd.unique(df['OFFENSE'])

array(['ROBBERY', 'BURGLARY', 'THEFT/OTHER', 'THEFT F/AUTO',
       'MOTOR VEHICLE THEFT', 'ASSAULT W/DANGEROUS WEAPON', 'HOMICIDE',
       'SEX ABUSE', 'ARSON'], dtype=object)

## DC Crime (Time Series) 

In [713]:
df = df_orig = df_csv_from_dir(os.path.join(DATA_DIR, 'dc'), 
                               parse_dates=['START_DATE', 'END_DATE', 'REPORT_DAT'],
                               usecols=['START_DATE', 'END_DATE', 'REPORT_DAT', 'OFFENSE'])

In [714]:
df.head()

Unnamed: 0,REPORT_DAT,OFFENSE,START_DATE,END_DATE
0,2014-11-29 14:20:00,ROBBERY,2014-11-28 18:30:00,2014-11-29 14:00:00
1,2015-01-19 14:46:00,ROBBERY,2015-01-19 14:34:00,2015-01-19 14:46:00
2,2014-08-19 20:09:00,BURGLARY,2014-08-19 09:00:00,2014-08-19 20:09:00
3,2014-08-30 05:40:00,THEFT/OTHER,2014-08-30 04:51:00,2014-08-30 04:58:00
4,2014-07-02 02:56:00,THEFT/OTHER,2014-07-02 02:54:00,2014-07-02 02:54:00


- Drop all rows with a single empty value
- Drop all rows that start before 2012-01-01

In [715]:
def cleanup(df, datecols, datemin, dropna=True):
    rows = len(df)
    
    if dropna:
        df = df.dropna(how='any')
        
    for datecol in datecols:
        df = df[df[datecol].dt.date >= datemin]
    
    print 'Dropped {:d} rows'.format(rows - len(df))
    
    return df

In [716]:
df = cleanup(df, ['REPORT_DAT', 'START_DATE', 'END_DATE'], datetime.date(2012, 01, 01))

Dropped 27 rows


We're going off of the start date, so drop the other 2 date columns

In [717]:
try:
    df.drop(['REPORT_DAT', 'END_DATE'], axis=1, inplace=True)
except ValueError:
    pass

In [718]:
# Rename columns
df.columns = ['type', 'date']

In [719]:
df.head()

Unnamed: 0,type,date
0,ROBBERY,2014-11-28 18:30:00
1,ROBBERY,2015-01-19 14:34:00
2,BURGLARY,2014-08-19 09:00:00
3,THEFT/OTHER,2014-08-30 04:51:00
4,THEFT/OTHER,2014-07-02 02:54:00


In [720]:
# Overall statistics of dataset for later sanity checks
num_crimes = len(df)
print '{:d} crimes reported.'.format(num_crimes)
print '{:d} unique types of crimes.'.format(len(pd.unique(df['type'])))

print ''

print 'Breakdown of crimes by type:'
print df.groupby('type').count()['date']

54482 crimes reported.
9 unique types of crimes.

Breakdown of crimes by type:
type
ARSON                            34
ASSAULT W/DANGEROUS WEAPON     3415
BURGLARY                       4253
HOMICIDE                        166
MOTOR VEHICLE THEFT            4422
ROBBERY                        4777
SEX ABUSE                       423
THEFT F/AUTO                  16319
THEFT/OTHER                   20673
Name: date, dtype: int64


Strip off the minute and seconds, only care about the YMD/hour

In [721]:
df['date'] = pd.to_datetime(df['date'].map(lambda x: x.strftime('%Y-%m-%d %H:00:00')))

In [722]:
df.head()

Unnamed: 0,type,date
0,ROBBERY,2014-11-28 18:00:00
1,ROBBERY,2015-01-19 14:00:00
2,BURGLARY,2014-08-19 09:00:00
3,THEFT/OTHER,2014-08-30 04:00:00
4,THEFT/OTHER,2014-07-02 02:00:00


In [723]:
assert len(df) == num_crimes

In [724]:
df = pd.pivot_table(df, index='date', columns='type', aggfunc=len).fillna(0).reset_index()

In [725]:
df.head()

type,date,ARSON,ASSAULT W/DANGEROUS WEAPON,BURGLARY,HOMICIDE,MOTOR VEHICLE THEFT,ROBBERY,SEX ABUSE,THEFT F/AUTO,THEFT/OTHER
0,2012-01-17 21:00:00,0,0,0,0,0,0,0,1,0
1,2012-02-11 21:00:00,0,0,0,0,0,0,0,0,1
2,2012-03-13 09:00:00,0,0,0,0,0,0,0,0,1
3,2012-03-18 11:00:00,0,0,0,0,0,0,1,0,0
4,2012-03-31 15:00:00,0,0,0,0,0,0,0,1,0


In [726]:
df['total'] = df.sum(axis=1)
df.columns = map(str.lower, df.columns)

In [727]:
df.sort('total').tail()

Unnamed: 0,date,arson,assault w/dangerous weapon,burglary,homicide,motor vehicle theft,robbery,sex abuse,theft f/auto,theft/other,total
6276,2014-09-28 15:00:00,0,1,2,0,3,1,0,1,8,16
3642,2014-06-05 17:00:00,0,1,0,0,0,4,0,5,6,16
5434,2014-08-22 17:00:00,0,1,1,0,1,0,0,3,10,16
3220,2014-05-17 23:00:00,0,0,1,1,3,1,0,5,6,17
7032,2014-10-31 22:00:00,0,6,0,0,0,6,0,5,3,20


In [728]:
assert df['total'].sum() == num_crimes

In [729]:
df = df.apply(combine_dates, axis=1)

In [730]:
df.head()

Unnamed: 0,date,arson,assault w/dangerous weapon,burglary,homicide,motor vehicle theft,robbery,sex abuse,theft f/auto,theft/other,total,start_date
0,2012-1-17,0,0,0,0,0,0,0,1,0,1,1326834000000
1,2012-2-11,0,0,0,0,0,0,0,0,1,1,1328994000000
2,2012-3-13,0,0,0,0,0,0,0,0,1,1,1331629200000
3,2012-3-18,0,0,0,0,0,0,1,0,0,1,1332068400000
4,2012-3-31,0,0,0,0,0,0,0,1,0,1,1333206000000


In [731]:
df = df_rearrange_columns(df, 
                          ['date', 'arson', 'assault w/dangerous weapon', 'burglary',
                           'homicide', 'motor vehicle theft', 'robbery', 'sex abuse',
                           'theft f/auto', 'theft/other', 'total', 'start_date'],
                          ['date', 'start_date', 'arson', 'assault w/dangerous weapon', 'burglary',
                           'homicide', 'motor vehicle theft', 'robbery', 'sex abuse',
                           'theft f/auto', 'theft/other', 'total'])
df.head()

Unnamed: 0,date,start_date,arson,assault w/dangerous weapon,burglary,homicide,motor vehicle theft,robbery,sex abuse,theft f/auto,theft/other,total
0,2012-1-17,1326834000000,0,0,0,0,0,0,0,1,0,1
1,2012-2-11,1328994000000,0,0,0,0,0,0,0,0,1,1
2,2012-3-13,1331629200000,0,0,0,0,0,0,0,0,1,1
3,2012-3-18,1332068400000,0,0,0,0,0,0,1,0,0,1
4,2012-3-31,1333206000000,0,0,0,0,0,0,0,1,0,1


In [744]:
df.iloc[:, 2:] = df.iloc[:, 2:].astype(int)

In [753]:
df_to_json_split_wo_index(df, 
                          'dc_crime_data.json',
                          ['Date of Event', 
                           'Hour of Event (ms from epoch)', 
                           'Number of Arson Crimes',
                           'Number of Assault with a dangerous weapon Crimes',
                           'Number of Burglary Crimes',
                           'Number of Homicide Criems',
                           'Number of Motor Vehicle Theft Crimes',
                           'Number of Robbery Crimes',
                           'Number of Sex Abuse Crimes',
                           'Number of Theft of Items within a Vehicle Crimes',
                           'Number of Theft Crimes',
                           'Total Number of Crimes'])

In [754]:
# Sanity Checks
with open('dc_crime_data.json', 'rb') as infile:
    data = json.load(infile)
    print sum([x[11] for x in data['data']])

54482


In [755]:
len(df), len(pd.unique(df.start_date))

(12134, 12134)

# NYC Crime Data

## NYC Crime (Entity Analysis)

In [267]:
df = pd.read_csv(os.path.join(DATA_DIR, 'ny', 'stop_and_frisks.csv'),
                 parse_dates=['datestop'],
                 usecols=['datestop', 'timestop', 'crimsusp',
                          'arstmade', 'arstoffn', 'frisked', 'searched', 'contrabn',
                          'pistol', 'riflshot', 'asltweap', 'knifcuti', 'machgun', 'othrweap',
                          'sex', 'race', 'age', 'ht_feet', 'ht_inch', 'weight', 'haircolr', 'eyecolor', 'build', 
                          'stinter', 'crossst', 'xcoord', 'ycoord'])

In [268]:
with pd.option_context('display.max_columns', 999):
    print df.head()

  datestop  timestop       crimsusp arstmade arstoffn frisked searched  \
0  1012013       452  GRAND LARCENY        N                Y        N   
1  1012013       315  GRAND LARCENY        N                N        N   
2  1012013       300        ROBBERY        N                Y        Y   
3  1012013      1212        ROBBERY        N                N        N   
4  1012013      2300       BURGLARY        N                N        N   

  contrabn pistol riflshot asltweap knifcuti machgun othrweap sex race age  \
0        N      N        N        N        N       N        N   M    B  41   
1        N      N        N        N        N       N        N   M    A  32   
2        N      N        N        N        N       N        N   M    B  20   
3        N      N        N        N        N       N        N   M    B  57   
4        N      N        N        N        N       N        N   M    B  31   

   ht_feet  ht_inch  weight haircolr eyecolor build        stinter  \
0        5      

In [269]:
# Cleanup dates
def cleanup_time(t):
    t = str(t)
    l = len(t)
    
    assert l in (1, 2, 3, 4)
    
    if l == 1:
        return '00:0' + t
    elif l == 2:
        return '00:' + t
    elif l == 3:
        return '0%s:%s' % (t[0], t[1:])
    elif l == 4:
        return '%s:%s' % (t[0:2], t[2:])
    
def cleanup_date(d):
    d = str(d)
    l = len(d)
    
    assert l in (7, 8)
    
    if l == 7:
        return '0%s-%s-%s' % (d[0], d[1:3], d[3:])
    elif l == 8:
        return '%s-%s-%s' % (d[0:2], d[2:4], d[4:])
    
df['datestop'] = df['datestop'].map(cleanup_date)
df['timestop'] = df['timestop'].map(cleanup_time)

df['date'] = pd.to_datetime(df['datestop'].astype(str) + ' ' + df['timestop'].astype(str) + ':00')

try:
    df.drop(['datestop', 'timestop'], axis=1, inplace=True)
except ValueError:
    pass

In [270]:
df = df[~(df.isnull().any(axis=1))]

In [271]:
noloc_mask = (df['xcoord'].str.strip() == '') | (df['ycoord'].str.strip() == '')

len(df), len(df[noloc_mask])

df = df[~(noloc_mask)]
# only consider crimes with coordinates

In [272]:
len(df)

186557

In [273]:
epsg2263 = Proj(init='epsg:2263', preserve_units=True)

def project(row):
    lon, lat = epsg2263(int(row['xcoord']), 
                        int(row['ycoord']),
                        inverse=True)
    
    row['latitude'] = lat
    row['longitude'] = lon
    
    return row

df = df.apply(project, axis=1)

In [274]:
df.head()

Unnamed: 0,crimsusp,arstmade,arstoffn,frisked,searched,contrabn,pistol,riflshot,asltweap,knifcuti,...,haircolr,eyecolor,build,stinter,crossst,xcoord,ycoord,date,latitude,longitude
0,GRAND LARCENY,N,,Y,N,N,N,N,N,N,...,BK,BR,M,FULTON STREET,CHURCH STREET,981349,198508,2013-01-01 04:52:00,40.711534838,-74.010464128
1,GRAND LARCENY,N,,N,N,N,N,N,N,N,...,BK,BR,T,FULTON STREET,CHURCH STREET,981349,198508,2013-01-01 03:15:00,40.711534838,-74.010464128
2,ROBBERY,N,,Y,Y,N,N,N,N,N,...,BK,BR,Z,FULTON STREET,CHURCH STREET,981349,198508,2013-01-01 03:00:00,40.711534838,-74.010464128
3,ROBBERY,N,,N,N,N,N,N,N,N,...,BK,BR,T,READE STREET,HUDSON STREET,981698,200203,2013-01-01 12:12:00,40.716187316,-74.009205901
4,BURGLARY,N,,N,N,N,N,N,N,N,...,BR,BR,M,MERCER STREET,PRINCE STREET,984642,203280,2013-01-01 23:00:00,40.724633299,-73.998585748


In [275]:
df[['stinter', 'crossst', 'xcoord', 'ycoord', 'latitude', 'longitude']].to_csv('nyc_crime_geocoded_data.csv')

In [214]:
try:
    df.drop(['stinter', 'crossst'], axis=1, inplace=True)
    df.drop(['xcoord', 'ycoord'], axis=1, inplace=True)
except ValueError:
    pass

In [215]:
with pd.option_context('display.max_columns', 100):
    print df.head()

        crimsusp arstmade arstoffn frisked searched contrabn pistol riflshot  \
0  GRAND LARCENY        N                Y        N        N      N        N   
1  GRAND LARCENY        N                N        N        N      N        N   
2        ROBBERY        N                Y        Y        N      N        N   
3        ROBBERY        N                N        N        N      N        N   
4       BURGLARY        N                N        N        N      N        N   

  asltweap knifcuti machgun othrweap sex race age  ht_feet  ht_inch  weight  \
0        N        N       N        N   M    B  41        5        9     200   
1        N        N       N        N   M    A  32        6        0     150   
2        N        N       N        N   M    B  20        5        7     170   
3        N        N       N        N   M    B  57        5       11     150   
4        N        N       N        N   M    B  31        6        0     200   

  haircolr eyecolor build                dat

In [216]:
df.head()

Unnamed: 0,crimsusp,arstmade,arstoffn,frisked,searched,contrabn,pistol,riflshot,asltweap,knifcuti,...,age,ht_feet,ht_inch,weight,haircolr,eyecolor,build,date,latitude,longitude
0,GRAND LARCENY,N,,Y,N,N,N,N,N,N,...,41,5,9,200,BK,BR,M,2013-01-01 04:52:00,40.711534838,-74.010464128
1,GRAND LARCENY,N,,N,N,N,N,N,N,N,...,32,6,0,150,BK,BR,T,2013-01-01 03:15:00,40.711534838,-74.010464128
2,ROBBERY,N,,Y,Y,N,N,N,N,N,...,20,5,7,170,BK,BR,Z,2013-01-01 03:00:00,40.711534838,-74.010464128
3,ROBBERY,N,,N,N,N,N,N,N,N,...,57,5,11,150,BK,BR,T,2013-01-01 12:12:00,40.716187316,-74.009205901
4,BURGLARY,N,,N,N,N,N,N,N,N,...,31,6,0,200,BR,BR,M,2013-01-01 23:00:00,40.724633299,-73.998585748


In [217]:
mapping = {'frisked': 'frisked',
           'searched': 'searched',
           'contrabn': 'contraband',
           'pistol': 'pistol',
           'riflshot': 'rifle',
           'asltweap': 'assault weapon',
           'knifcuti': 'knife',
           'machgun': 'machine gun',
           'othrweap': 'weapon'}

def flags_description(row):
    flags = []
    
    for k, v in mapping.iteritems():
        if row[k] == 'Y':
            flags.append(v)
            
    row['flags'] = ', '.join(flags)
    
    return row   

In [218]:
df = df.apply(flags_description, axis=1)

for col in mapping.keys():
    try:
        df.drop(col, axis=1, inplace=True)
    except ValueError:
        pass

df.head()

Unnamed: 0,crimsusp,arstmade,arstoffn,sex,race,age,ht_feet,ht_inch,weight,haircolr,eyecolor,build,date,latitude,longitude,flags
0,GRAND LARCENY,N,,M,B,41,5,9,200,BK,BR,M,2013-01-01 04:52:00,40.711534838,-74.010464128,frisked
1,GRAND LARCENY,N,,M,A,32,6,0,150,BK,BR,T,2013-01-01 03:15:00,40.711534838,-74.010464128,
2,ROBBERY,N,,M,B,20,5,7,170,BK,BR,Z,2013-01-01 03:00:00,40.711534838,-74.010464128,"frisked, searched"
3,ROBBERY,N,,M,B,57,5,11,150,BK,BR,T,2013-01-01 12:12:00,40.716187316,-74.009205901,
4,BURGLARY,N,,M,B,31,6,0,200,BR,BR,M,2013-01-01 23:00:00,40.724633299,-73.998585748,


In [219]:
race_map = {'B': 'Black',
            'A': 'Asian',
            'Q': 'White-Hispanic',
            'W': 'White',
            'P': 'Black-Hispanic',
            'I': 'American Indian'}

build_map = {'H': 'Heavy',
             'M': 'Medium',
             'T': 'Thin',
             'U': 'Muscular'}

hair_map = {'BR': 'Brown',
                    'BK': 'Black',
                    'BL': 'Blonde',
                    'SP': 'Salt and Pepper',
                    'BA': 'Bald',
                    'GY': 'Gray',
                    'DY': 'Dyed',
                    'SN': 'Sandy'}

eye_map  = {'BR': 'Brown',
                    'BK': 'Black',
                    'BL': 'Blue',
                    'GR': 'Green',
                    'HA': 'Hazel',
                    'SP': 'Salt and Pepper',
                    'GY': 'Gray'}

def person_description(row):
    desc = ''
    
    if row['age']:
        desc += '{} year old '.format(row['age'])
        
    if row['build'] in build_map:
        desc += '({} build) '.format(build_map[row['build']])
        
    if row['race'] and row['race'] in race_map:
        desc += '{} '.format(race_map[row['race']])
    
    if row['sex'] in ('M', 'F'):
        desc += '{},'.format('male' if row['sex'] == 'M' else 'female')
        
    if row['ht_feet'] and row['ht_inch']:
        desc += ' {}\' {}", '.format(row['ht_feet'],
                                     row['ht_inch'])
        
    if row['weight']:
        desc += '{} lb, '.format(row['weight'])
        
    if row['haircolr'] and row['haircolr'] in hair_map:
        desc += '{} hair, '.format(hair_map[row['haircolr']])
            
    if row['eyecolor'] and row['eyecolor'] in eye_map:
        desc += '{} eyes, '.format(eye_map[row['eyecolor']])
            
    row['description'] = desc.strip(' ').rstrip(', ')
    
    return row

In [220]:
df = df.apply(person_description, axis=1)

In [221]:
for col in ['age', 'build', 'race', 'sex', 'ht_feet', 'ht_inch', 'weight', 'haircolr', 'eyecolor']:
    try:
        df.drop(col, axis=1, inplace=True)
    except ValueError:
        pass

df.head()

Unnamed: 0,crimsusp,arstmade,arstoffn,date,latitude,longitude,flags,description
0,GRAND LARCENY,N,,2013-01-01 04:52:00,40.711534838,-74.010464128,frisked,"41 year old (Medium build) Black male, 5' 9"", ..."
1,GRAND LARCENY,N,,2013-01-01 03:15:00,40.711534838,-74.010464128,,"32 year old (Thin build) Asian male,150 lb, Bl..."
2,ROBBERY,N,,2013-01-01 03:00:00,40.711534838,-74.010464128,"frisked, searched","20 year old Black male, 5' 7"", 170 lb, Black h..."
3,ROBBERY,N,,2013-01-01 12:12:00,40.716187316,-74.009205901,,"57 year old (Thin build) Black male, 5' 11"", 1..."
4,BURGLARY,N,,2013-01-01 23:00:00,40.724633299,-73.998585748,,"31 year old (Medium build) Black male,200 lb, ..."


In [222]:
def final_description(row):
    desc = row['description']
    s = ''
    
    if row['arstmade'] == 'Y':
        s += 'Suspected {} , arrested for {}. '.format(row['crimsusp'],
                                                       row['arstoffn'])
    else:
        s += 'Suspected {}. '.format(row['crimsusp'])
        
    if row['description']:
        s += '{}. '.format(row['description'])
        
    if row['flags']:
        s += '[{}]'.format(row['flags'])
        
    row['description'] = s.strip()
        
    return row

In [223]:
df = df.apply(final_description, axis=1)
df.head()

Unnamed: 0,crimsusp,arstmade,arstoffn,date,latitude,longitude,flags,description
0,GRAND LARCENY,N,,2013-01-01 04:52:00,40.711534838,-74.010464128,frisked,Suspected GRAND LARCENY. 41 year old (Medium b...
1,GRAND LARCENY,N,,2013-01-01 03:15:00,40.711534838,-74.010464128,,Suspected GRAND LARCENY. 32 year old (Thin bui...
2,ROBBERY,N,,2013-01-01 03:00:00,40.711534838,-74.010464128,"frisked, searched","Suspected ROBBERY. 20 year old Black male, 5' ..."
3,ROBBERY,N,,2013-01-01 12:12:00,40.716187316,-74.009205901,,Suspected ROBBERY. 57 year old (Thin build) Bl...
4,BURGLARY,N,,2013-01-01 23:00:00,40.724633299,-73.998585748,,Suspected BURGLARY. 31 year old (Medium build)...


In [228]:
for col in ['crimsusp', 'arstmade', 'arstoffn', 'ht_inch', 'flags']:
    try:
        df.drop(col, axis=1, inplace=True)
    except ValueError:
        pass

df

Unnamed: 0,date,latitude,longitude,description
0,2013-01-01 04:52:00,40.711534838,-74.010464128,Suspected GRAND LARCENY. 41 year old (Medium b...
1,2013-01-01 03:15:00,40.711534838,-74.010464128,Suspected GRAND LARCENY. 32 year old (Thin bui...
2,2013-01-01 03:00:00,40.711534838,-74.010464128,"Suspected ROBBERY. 20 year old Black male, 5' ..."
3,2013-01-01 12:12:00,40.716187316,-74.009205901,Suspected ROBBERY. 57 year old (Thin build) Bl...
4,2013-01-01 23:00:00,40.724633299,-73.998585748,Suspected BURGLARY. 31 year old (Medium build)...
7,2013-01-01 17:45:00,40.714452833,-73.993878524,"Suspected BURGLARY , arrested for 140.15. 40 y..."
8,2013-01-01 02:00:00,40.728887678,-73.999271181,Suspected FELONY. 40 year old (Thin build) Bla...
9,2013-01-01 04:05:00,40.741521565,-74.007036964,Suspected FELONY. 25 year old (Medium build) m...
10,2013-01-01 00:30:00,40.710683421,-73.984702586,Suspected MISD. 40 year old (Medium build) Whi...
11,2013-01-01 01:40:00,40.712372026,-73.989936131,Suspected MISD. 23 year old (Thin build) Black...


In [229]:
df['date'] = df['date'].map(lambda dt: int((dt - datetime.datetime(1970, 1, 1)).total_seconds() * 1000))

In [230]:
df.head()

Unnamed: 0,date,latitude,longitude,description
0,1357015920000,40.711534838,-74.010464128,Suspected GRAND LARCENY. 41 year old (Medium b...
1,1357010100000,40.711534838,-74.010464128,Suspected GRAND LARCENY. 32 year old (Thin bui...
2,1357009200000,40.711534838,-74.010464128,"Suspected ROBBERY. 20 year old Black male, 5' ..."
3,1357042320000,40.716187316,-74.009205901,Suspected ROBBERY. 57 year old (Thin build) Bl...
4,1357081200000,40.724633299,-73.998585748,Suspected BURGLARY. 31 year old (Medium build)...


In [231]:
df = df_rearrange_columns(df,
                          ['date', 'latitude', 'longitude', 'description'],
                          ['date', 'latitude', 'longitude', 'description'])
df.head()

Unnamed: 0,date,latitude,longitude,description
0,1357015920000,40.711534838,-74.010464128,Suspected GRAND LARCENY. 41 year old (Medium b...
1,1357010100000,40.711534838,-74.010464128,Suspected GRAND LARCENY. 32 year old (Thin bui...
2,1357009200000,40.711534838,-74.010464128,"Suspected ROBBERY. 20 year old Black male, 5' ..."
3,1357042320000,40.716187316,-74.009205901,Suspected ROBBERY. 57 year old (Thin build) Bl...
4,1357081200000,40.724633299,-73.998585748,Suspected BURGLARY. 31 year old (Medium build)...


In [232]:
df_to_json_split_wo_index(df,
                          'nyc_crime_entity_data.json',
                          ['Date of Event (ms from epoch)',
                           'Latitude of Event',
                           'Longitude of Event',
                           'Description'])

## NYC Crime (Time Series Analysis)

In [602]:
df = pd.read_csv(os.path.join(DATA_DIR, 'ny', 'stop_and_frisks.csv'),
                 parse_dates=['datestop'],
                 usecols=['year', 'datestop', 'timestop'],
                 encoding='utf-8')

In [603]:
with pd.option_context('display.max_columns', 1000):
    print df.head()

   year datestop  timestop
0  2013  1012013       452
1  2013  1012013       315
2  2013  1012013       300
3  2013  1012013      1212
4  2013  1012013      2300


In [604]:
# Overall statistics of dataset for later sanity checks
num_crimes = len(df)
print '{:d} crimes reported.'.format(num_crimes)

191851 crimes reported.


In [605]:
def cleanup_time(t):
    t = str(t)
    l = len(t)
    
    assert l in (1, 2, 3, 4)
    
    if l == 1:
        return '00:0' + t
    elif l == 2:
        return '00:' + t
    elif l == 3:
        return '0%s:%s' % (t[0], t[1:])
    elif l == 4:
        return '%s:%s' % (t[0:2], t[2:])
    
def cleanup_date(d):
    d = str(d)
    l = len(d)
    
    assert l in (7, 8)
    
    if l == 7:
        return '0%s-%s-%s' % (d[0], d[1:3], d[3:])
    elif l == 8:
        return '%s-%s-%s' % (d[0:2], d[2:4], d[4:])

In [606]:
df['datestop'] = df['datestop'].map(cleanup_date)
df['timestop'] = df['timestop'].map(cleanup_time)

In [607]:
try:
    df.drop('year', axis=1, inplace=True)
except ValueError:
    pass

In [608]:
df.head()

Unnamed: 0,datestop,timestop
0,01-01-2013,04:52
1,01-01-2013,03:15
2,01-01-2013,03:00
3,01-01-2013,12:12
4,01-01-2013,23:00


In [609]:
df['date'] = pd.to_datetime(df['datestop'].astype(str) + ' ' + df['timestop'].astype(str) + ':00')

In [610]:
try:
    df.drop(['datestop', 'timestop'], axis=1, inplace=True)
except ValueError:
    pass

In [611]:
df.head()

Unnamed: 0,date
0,2013-01-01 04:52:00
1,2013-01-01 03:15:00
2,2013-01-01 03:00:00
3,2013-01-01 12:12:00
4,2013-01-01 23:00:00


In [612]:
df['date'] = df['date'].map(lambda x: x.strftime('%Y-%m-%d %H:00:00'))
df['date'] = pd.to_datetime(df['date'])

In [613]:
df.head()

Unnamed: 0,date
0,2013-01-01 04:00:00
1,2013-01-01 03:00:00
2,2013-01-01 03:00:00
3,2013-01-01 12:00:00
4,2013-01-01 23:00:00


In [614]:
df['start_date'] = 0 # not sure why this is required right now
df = df.apply(combine_dates, axis=1)
df.head()

Unnamed: 0,date,start_date
0,2013-1-1,1357012800000
1,2013-1-1,1357009200000
2,2013-1-1,1357009200000
3,2013-1-1,1357041600000
4,2013-1-1,1357081200000


In [615]:
data = df.groupby(['start_date', 'date']).size().reset_index()
data.head()

Unnamed: 0,start_date,date,0
0,1356998400000,2013-1-1,69
1,1357002000000,2013-1-1,103
2,1357005600000,2013-1-1,62
3,1357009200000,2013-1-1,62
4,1357012800000,2013-1-1,42


In [616]:
data = df_rearrange_columns(data,
                            ['start_date', 'date', 'events'],
                            ['date', 'start_date', 'events'])

In [617]:
data.head()

Unnamed: 0,date,start_date,events
0,2013-1-1,1356998400000,69
1,2013-1-1,1357002000000,103
2,2013-1-1,1357005600000,62
3,2013-1-1,1357009200000,62
4,2013-1-1,1357012800000,42


In [618]:
assert data['events'].sum() == num_crimes

In [619]:
df_to_json_split_wo_index(data, 
                          'nyc_crime_data.json',
                          ['Date of Event', 'Hour of Event (ms from epoch)', 'Number of Events'])