In [1]:
import codecs, json
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Smoke test (write half): build a tiny frame with float, string and boolean
# columns (including one NaN), convert it to an Arrow table and write it out
# as a parquet file.
df = pd.DataFrame(
    {
        'one': [-1, np.nan, 2.5],
        'two': ['foo', 'bar', 'baz'],
        'three': [True, False, True],
    }
)
table = pa.Table.from_pandas(df)
pq.write_table(table, '../raw_data/example.parquet')

In [3]:
# pyarrow parquet read test: load the example parquet file into an Arrow
# table, convert it to a pandas DataFrame, and leave the frame as the last
# expression so it is rendered as the cell output
table2 = pq.read_table('../raw_data/example.parquet')
df2 = table2.to_pandas()
df2

Unnamed: 0,one,three,two
0,-1.0,True,foo
1,,False,bar
2,2.5,True,baz


In [4]:
%%time
# Location of the snappy-compressed parquet crime dataset
# (single-year alternative: crimes-2017.snappy.parq)
parquet_data_folder = '../data/crimes-2001-to-present.snappy.parq'
print('Loading crime data from: {}'.format(parquet_data_folder))

# Read the parquet data as a dask DataFrame indexed by 'Date', then persist
# it so the data is held in memory for the cells that follow.
crimes = dd.read_parquet(parquet_data_folder, index='Date').persist()
print('Crime data loaded into memory.')

# Report the record count, partition count and total element count
print('\n'.join([
    'Crime data stats:',
    '---------------------------------------',
]))
print('{:,} total records in {} partitions'.format(len(crimes), crimes.npartitions))
print('DataFrame size: {:,}'.format(crimes.size.compute()))

Loading crime data from: ../data/crimes-2001-to-present.snappy.parq
Crime data loaded into memory.
Crime data stats:
---------------------------------------
6,401,616 total records in 1 partitions
DataFrame size: 89,622,624
Wall time: 12.9 s


In [6]:
# Select the columns used for mapping homicides. dropna() is called without
# arguments, so a row missing a value in ANY of these columns is discarded;
# the homicide total printed below therefore counts complete records only.
crime_geo = crimes[[
    'PrimaryType', 'Block', 'Description', 'LocationDescription',
    'CommunityArea', 'Arrest', 'Domestic', 'Latitude', 'Longitude',
]].dropna()

# Keep only homicide rows and compute the (lazy) dask result
is_homicide = crime_geo['PrimaryType'] == 'HOMICIDE'
homicides = crime_geo[is_homicide].compute()

# Preview the first rows and report how many homicides were found
print('\n'.join([
    'Chicago homicides data preview:',
    '--------------------------------------------------------------------------',
]))
print(homicides.head())
print('...')
print('Total Homicides:', len(homicides))

Chicago homicides data preview:
--------------------------------------------------------------------------
                    PrimaryType                  Block          Description  \
Date                                                                          
2001-01-01 10:40:00    HOMICIDE      024XX W MONROE ST  FIRST DEGREE MURDER   
2001-01-01 15:10:00    HOMICIDE  023XX N MILWAUKEE AVE  FIRST DEGREE MURDER   
2001-01-06 08:54:00    HOMICIDE    017XX S ASHLAND AVE  FIRST DEGREE MURDER   
2001-01-06 13:30:00    HOMICIDE      117XX S MORGAN ST  FIRST DEGREE MURDER   
2001-01-06 23:38:00    HOMICIDE  058XX W FULLERTON AVE  FIRST DEGREE MURDER   

                    LocationDescription CommunityArea  Arrest  Domestic  \
Date                                                                      
2001-01-01 10:40:00       CHA STAIRWELL          28.0   False     False   
2001-01-01 15:10:00                AUTO          22.0   False     False   
2001-01-06 08:54:00                AUTO

In [8]:
# Heatmap input: one [Latitude, Longitude] pair per homicide, as a plain
# nested list (.to_records() was the alternative noted here).
homicides_geo = homicides[['Latitude', 'Longitude']].values.tolist()

# Show the first five coordinate pairs as a sanity check
print(homicides_geo[:5])

[[41.880224549, -87.68824895200001], [41.924488265, -87.699933231], [41.858473695, -87.666167161], [41.680743671, -87.646876459], [41.923901330999996, -87.771156586]]


In [11]:
#json.dumps(homicides_geo)

In [12]:
# create homicides json data file for the map
file_path = '../data/chicago-homicides.json'

# Write through a context manager so the output file is flushed and closed as
# soon as the dump finishes, instead of leaving an open handle behind.
# separators=(',', ':') removes the spaces after commas/colons, and indent=0
# only inserts newlines (no leading spaces) between elements.
with codecs.open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(homicides_geo,
              json_file,
              separators=(',', ':'), sort_keys=False, indent=0)