In [86]:
import datetime
import itertools
import json
import numpy as np
import os
import pandas as pd
from pandas_helpers import *
import pickle
import pprint
import random
import re
import requests
import seaborn
import sys
import time

pd.set_option('display.precision', 10)

DATA_DIR = 'data/citydata/crime'

random.seed(90210)

In [87]:
from pyproj import Proj

In [88]:
data_files = [os.path.join(DATA_DIR, 'dc', x) \
              for x in os.listdir(os.path.join(DATA_DIR, 'dc')) \
              if x.endswith('.csv')]

df = df_orig = df_from_files(data_files,
                             parse_dates=['START_DATE', 'END_DATE', 'REPORT_DAT'],
                             usecols=['START_DATE', 'END_DATE', 'REPORT_DAT', 'OFFENSE', 'METHOD', 'BLOCK', 'XBLOCK', 'YBLOCK'])

In [89]:
df.head()

Unnamed: 0,REPORT_DAT,OFFENSE,METHOD,BLOCK,XBLOCK,YBLOCK,START_DATE,END_DATE
0,2014-11-29 14:20:00,ROBBERY,GUN,2000 - 2069 BLOCK OF BRYANT STREET NE,402097.35,139269.9,2014-11-28 18:30:00,2014-11-29 14:00:00
1,2015-01-19 14:46:00,ROBBERY,OTHERS,1700 - 1728 BLOCK OF MONTELLO AVENUE NE,401248.0,137937.0,2015-01-19 14:34:00,2015-01-19 14:46:00
2,2014-08-19 20:09:00,BURGLARY,OTHERS,1706 - 1799 BLOCK OF 2ND STREET NW,398774.0,138427.0,2014-08-19 09:00:00,2014-08-19 20:09:00
3,2014-08-30 05:40:00,THEFT/OTHER,OTHERS,3000 - 3133 BLOCK OF RHODE ISLAND AVENUE NE,403053.0,140739.0,2014-08-30 04:51:00,2014-08-30 04:58:00
4,2014-07-02 02:56:00,THEFT/OTHER,OTHERS,2500 - 2598 BLOCK OF BENNING ROAD NE,402530.0,136674.0,2014-07-02 02:54:00,2014-07-02 02:54:00


In [90]:
def cleanup(df, datecols, datemin, dropna=True):
    rows = len(df)
    
    if dropna:
        df = df.dropna(how='any')
        
    for datecol in datecols:
        df = df[df[datecol].dt.date >= datemin]
    
    print 'Dropped {:d} rows'.format(rows - len(df))
    
    return df

In [91]:
df = cleanup(df, ['REPORT_DAT', 'START_DATE', 'END_DATE'], datetime.date(2012, 01, 01))

Dropped 27 rows


In [92]:
df.head(), len(df)

(           REPORT_DAT      OFFENSE  METHOD  \
 0 2014-11-29 14:20:00      ROBBERY     GUN   
 1 2015-01-19 14:46:00      ROBBERY  OTHERS   
 2 2014-08-19 20:09:00     BURGLARY  OTHERS   
 3 2014-08-30 05:40:00  THEFT/OTHER  OTHERS   
 4 2014-07-02 02:56:00  THEFT/OTHER  OTHERS   
 
                                          BLOCK     XBLOCK    YBLOCK  \
 0        2000 - 2069 BLOCK OF BRYANT STREET NE  402097.35  139269.9   
 1      1700 - 1728 BLOCK OF MONTELLO AVENUE NE  401248.00  137937.0   
 2           1706 - 1799 BLOCK OF 2ND STREET NW  398774.00  138427.0   
 3  3000 - 3133 BLOCK OF RHODE ISLAND AVENUE NE  403053.00  140739.0   
 4         2500 - 2598 BLOCK OF BENNING ROAD NE  402530.00  136674.0   
 
            START_DATE            END_DATE  
 0 2014-11-28 18:30:00 2014-11-29 14:00:00  
 1 2015-01-19 14:34:00 2015-01-19 14:46:00  
 2 2014-08-19 09:00:00 2014-08-19 20:09:00  
 3 2014-08-30 04:51:00 2014-08-30 04:58:00  
 4 2014-07-02 02:54:00 2014-07-02 02:54:00  , 54482)

In [93]:
pd.unique(df['OFFENSE'])

array(['ROBBERY', 'BURGLARY', 'THEFT/OTHER', 'THEFT F/AUTO',
       'MOTOR VEHICLE THEFT', 'ASSAULT W/DANGEROUS WEAPON', 'HOMICIDE',
       'SEX ABUSE', 'ARSON'], dtype=object)

In [94]:
for col in ['REPORT_DAT', 'END_DATE']:
    try:
        df.drop(col, axis=1, inplace=True)
    except ValueError:
        pass

df.head()

Unnamed: 0,OFFENSE,METHOD,BLOCK,XBLOCK,YBLOCK,START_DATE
0,ROBBERY,GUN,2000 - 2069 BLOCK OF BRYANT STREET NE,402097.35,139269.9,2014-11-28 18:30:00
1,ROBBERY,OTHERS,1700 - 1728 BLOCK OF MONTELLO AVENUE NE,401248.0,137937.0,2015-01-19 14:34:00
2,BURGLARY,OTHERS,1706 - 1799 BLOCK OF 2ND STREET NW,398774.0,138427.0,2014-08-19 09:00:00
3,THEFT/OTHER,OTHERS,3000 - 3133 BLOCK OF RHODE ISLAND AVENUE NE,403053.0,140739.0,2014-08-30 04:51:00
4,THEFT/OTHER,OTHERS,2500 - 2598 BLOCK OF BENNING ROAD NE,402530.0,136674.0,2014-07-02 02:54:00


In [95]:
def description(row):
    s = ''
    
    if row['OFFENSE']:
        s += 'Arrest for {} '.format(row['OFFENSE'].lower())
    
    if row['METHOD'] and row['METHOD'] != 'OTHERS':
        s += '[{}]'.format(row['METHOD'].lower())
        
    row['description'] = s.strip()
    
    return row

In [96]:
df = df.apply(description, axis=1)
df.head()

Unnamed: 0,OFFENSE,METHOD,BLOCK,XBLOCK,YBLOCK,START_DATE,description
0,ROBBERY,GUN,2000 - 2069 BLOCK OF BRYANT STREET NE,402097.35,139269.9,2014-11-28 18:30:00,Arrest for robbery [gun]
1,ROBBERY,OTHERS,1700 - 1728 BLOCK OF MONTELLO AVENUE NE,401248.0,137937.0,2015-01-19 14:34:00,Arrest for robbery
2,BURGLARY,OTHERS,1706 - 1799 BLOCK OF 2ND STREET NW,398774.0,138427.0,2014-08-19 09:00:00,Arrest for burglary
3,THEFT/OTHER,OTHERS,3000 - 3133 BLOCK OF RHODE ISLAND AVENUE NE,403053.0,140739.0,2014-08-30 04:51:00,Arrest for theft/other
4,THEFT/OTHER,OTHERS,2500 - 2598 BLOCK OF BENNING ROAD NE,402530.0,136674.0,2014-07-02 02:54:00,Arrest for theft/other


In [97]:
esri_102685 = Proj('+proj=lcc +lat_1=38.3 +lat_2=39.45 +lat_0=37.66666666666666 +lon_0=-77 +x_0=399999.9999999999 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs')

def project(row):
    lon, lat = esri_102685(int(row['XBLOCK']), 
                           int(row['YBLOCK']),
                           inverse=True)
    
    row['latitude'] = lat
    row['longitude'] = lon
    
    return row

In [98]:
df = df.apply(project, axis=1)

In [99]:
for col in ['OFFENSE', 'METHOD', 'BLOCK', 'XBLOCK', 'YBLOCK']:
    try:
        df.drop(col, axis=1, inplace=True)
    except ValueError:
        pass

df.head()

Unnamed: 0,START_DATE,description,latitude,longitude
0,2014-11-28 18:30:00,Arrest for robbery [gun],38.921290501,-76.975818162
1,2015-01-19 14:34:00,Arrest for robbery,38.909293017,-76.985610941
2,2014-08-19 09:00:00,Arrest for burglary,38.913707135,-77.014136281
3,2014-08-30 04:51:00,Arrest for theft/other,38.934529918,-76.964787381
4,2014-07-02 02:54:00,Arrest for theft/other,38.897912713,-76.970834525


In [100]:
df['date'] = df['START_DATE'].map(lambda dt: int((dt - datetime.datetime(1970, 1, 1)).total_seconds() * 1000))

In [101]:
df.drop('START_DATE', axis=1, inplace=True)

In [102]:
df.head()

Unnamed: 0,description,latitude,longitude,date
0,Arrest for robbery [gun],38.921290501,-76.975818162,1417199400000
1,Arrest for robbery,38.909293017,-76.985610941,1421678040000
2,Arrest for burglary,38.913707135,-77.014136281,1408438800000
3,Arrest for theft/other,38.934529918,-76.964787381,1409374260000
4,Arrest for theft/other,38.897912713,-76.970834525,1404269640000


In [103]:
df = df_rearrange_columns(df,
                          ['description', 'latitude', 'longitude', 'date'],
                          ['date', 'latitude', 'longitude', 'description'])

In [104]:
df.head()

Unnamed: 0,date,latitude,longitude,description
0,1417199400000,38.921290501,-76.975818162,Arrest for robbery [gun]
1,1421678040000,38.909293017,-76.985610941,Arrest for robbery
2,1408438800000,38.913707135,-77.014136281,Arrest for burglary
3,1409374260000,38.934529918,-76.964787381,Arrest for theft/other
4,1404269640000,38.897912713,-76.970834525,Arrest for theft/other


In [105]:
len(df)

54482

In [106]:
df_to_json_split_wo_index(df,
                          'dc_crime_entity_data.json',
                          ['Date of Event (ms from epoch)',
                           'Latitude of Event',
                           'Longitude of Event',
                           'Description of Event'])