In [1]:
import codecs, json
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# pyarrow parquet write test:
# build a small mixed-type pandas frame, convert it to an Arrow table
# and write that table out as a parquet file
example_columns = {
    'one': [-1, np.nan, 2.5],
    'two': ['foo', 'bar', 'baz'],
    'three': [True, False, True],
}
df = pd.DataFrame(example_columns)
table = pa.Table.from_pandas(df)
pq.write_table(table, '../raw_data/example.parquet')

In [3]:
# pyarrow parquet read test:
# read the example file back into an Arrow table and show it as a pandas frame
example_parquet_path = '../raw_data/example.parquet'
table2 = pq.read_table(example_parquet_path)
df2 = table2.to_pandas()
df2

Unnamed: 0,one,three,two
0,-1.0,True,foo
1,,False,bar
2,2.5,True,baz


In [4]:
%%time
# set data file path
parquet_data_folder = '../data/crimes-2017.snappy.parq'
print('Loading crime data from: {}'.format(parquet_data_folder))

# load crimes parquet data into dask df
crimes = dd.read_parquet(parquet_data_folder, index='Date')

# load all data into memory
crimes = crimes.persist()
print('Crime data loaded into memory.')

# log records count and data frame stats
print('Crime data stats:')
print('---------------------------------------')
print('{:,} total records in {} partitions'.format(len(crimes), crimes.npartitions))
print('DataFrame size: {:,}'.format(crimes.size.compute()))

Loading crime data from: ../data/crimes-2017.snappy.parq
Crime data loaded into memory.
Crime data stats:
---------------------------------------
172,030 total records in 1 partitions
DataFrame size: 2,408,420
Wall time: 4.73 s


In [5]:
# columns needed for mapping; rows with a missing value in any of them are dropped
geo_columns = [
    'PrimaryType',
    'Block',
    'Description',
    'LocationDescription',
    'CommunityArea',
    'Arrest',
    'Domestic',
    'Latitude',
    'Longitude',
]
crime_geo = crimes[geo_columns].dropna()

# keep only the homicide records and compute them into a concrete result
is_homicide = crime_geo['PrimaryType'] == 'HOMICIDE'
homicides = crime_geo[is_homicide].compute()

print('Chicago homicides data preview:')
print('--------------------------------------------------------------------------')
print(homicides.head())
print('...')
print('Total Homicides:', len(homicides))

Chicago homicides data preview:
--------------------------------------------------------------------------
                    PrimaryType                 Block          Description  \
Date                                                                         
2017-01-01 05:19:00    HOMICIDE      046XX N BROADWAY  FIRST DEGREE MURDER   
2017-01-01 06:18:00    HOMICIDE     046XX W MONROE ST  FIRST DEGREE MURDER   
2017-01-02 09:14:00    HOMICIDE    025XX N LOWELL AVE  FIRST DEGREE MURDER   
2017-01-03 12:20:00    HOMICIDE   034XX W FULTON BLVD  FIRST DEGREE MURDER   
2017-01-03 23:52:00    HOMICIDE  032XX W LEXINGTON ST  FIRST DEGREE MURDER   

                    LocationDescription CommunityArea  Arrest  Domestic  \
Date                                                                      
2017-01-01 05:19:00              TAVERN           3.0    True     False   
2017-01-01 06:18:00              STREET          25.0   False     False   
2017-01-02 09:14:00              STREET       

In [6]:
# heatmap data: a nested list of [Latitude, Longitude] pairs, one per homicide
coordinate_columns = ['Latitude', 'Longitude']
homicides_geo = homicides[coordinate_columns].values.tolist()
print(homicides_geo[:5])

[[41.966081546999995, -87.657908498], [41.879290642, -87.74159851299999], [41.926840967, -87.735415625], [41.886340706999995, -87.711999596], [41.871868444, -87.706610311]]


In [7]:
def to_json_file(file_path, data):
    """Serialize ``data`` as JSON into a UTF-8 encoded file.

    Parameters
    ----------
    file_path : str
        Path of the file to create or overwrite.
    data : object
        Any JSON-serializable object (e.g. a nested list of coordinates).

    Raises
    ------
    TypeError
        Raised by ``json.dump`` when ``data`` is not JSON-serializable.

    Notes
    -----
    Output uses compact ``(',', ':')`` separators with ``indent=0``, so every
    element is written on its own line without leading indentation.
    """
    # the context manager guarantees the handle is flushed and closed,
    # even if serialization fails part-way through
    with codecs.open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data,
                  json_file,
                  separators=(',', ':'), sort_keys=False, indent=0)

In [8]:
# write the homicide coordinates out as the json data file used by the map
to_json_file(file_path='../data/chicago-homicides-2017.json',
             data=homicides_geo)

In [9]:
# number of crime records that have complete geo data
all_crimes_count = len(crime_geo)
print('All Crimes:', all_crimes_count)

All Crimes: 165567


In [10]:
%%time
# output all crimes coordinates to see how large it gets in raw json
to_json_file('../data/chicago-crimes-2017.json', 
             crime_geo[['Latitude', 'Longitude']].compute().values.tolist())

Wall time: 4.38 s


In [12]:
# crime coordinates as a (lazy) dask dataframe for the animated crimes map replay;
# the index of crime_geo is carried over by the column selection
# NOTE: the original line ended with a dangling '.', which is a SyntaxError
crime_coordinates = crime_geo[['Latitude', 'Longitude']]
crime_coordinates

Unnamed: 0_level_0,Latitude,Longitude
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01 00:00:00,float64,float64
2017-08-25 23:59:00,...,...


In [15]:
# export the crime coordinates to csv; the path contains a '*' wildcard
csv_path_pattern = '../data/chicago-crimes-2017-*.csv'
crime_coordinates.to_csv(csv_path_pattern)