In [None]:
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [None]:
# Load the raw crimes CSV export into a dask DataFrame named `df`.
file_name = '../raw_data/Crimes_-_2017.csv' # alternate path: data/Chicago-crimes-2017.csv
df = dd.read_csv(file_name, 
                 # skip malformed rows instead of failing the whole load
                 error_bad_lines=False,
                 # read inferred-integer columns as floats so gaps can be NaN;
                 # explicit dtypes (dtype={'Ward': int}, dtype='str') were tried instead
                 assume_missing=True,
                 # ask the CSV reader to parse the 'Date' column as datetimes
                 parse_dates=['Date'], infer_datetime_format=True)
# NOTE(review): `error_bad_lines` and `infer_datetime_format` are deprecated in
# newer pandas releases -- confirm against the pandas version this notebook pins.

In [None]:
%%time
# Report the total row count and the number of dask partitions.
# len(df) forces a pass over the data, which is why the cell is timed.
print("{:,} total records in {} partitions".format(len(df), df.npartitions))

In [None]:
# Drop rows that repeat the same ('ID', 'Case Number') pair.
# dask DataFrames are never modified in place (inplace=True is not supported),
# so the de-duplicated frame has to be rebound to `df`.
df = df.drop_duplicates(subset=['ID', 'Case Number'])

In [None]:
%%time
# Persist the frame so later cells reuse the loaded partitions, then
# display df.size (rows x columns) to force the work to run inside the timer.
df = df.persist()
df.size.compute()

In [None]:
%%time
# Same size figure, formatted with thousands separators; timed again for
# comparison with the cell above, now that the data is persisted.
print("DataFrame size: {:,}".format(df.size.compute()))

In [None]:
# Preview the first 2 records.
df.head(2)

In [None]:
# Preview the last 2 records.
df.tail(2)

In [None]:
# Remove space characters from column names (other whitespace is left alone)
# so columns can be used with attribute access, e.g. df.Arrest.
df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
df.head(2)

In [None]:
# List the (renamed) column names.
df.columns

In [None]:
# Show the dtype of each column.
df.dtypes

In [None]:
def unique_column_values(df):
    """Print one ``name | distinct-count | dtype`` line per column of ``df``.

    The distinct count is the length of ``Series.unique()`` for that column.
    Nothing is returned; the report goes to stdout.
    """
    row_template = "{} | {} | {}"
    for label in df.columns:
        series = df[label]
        distinct_count = len(series.unique())
        print(row_template.format(series.name, distinct_count, series.dtype))

In [None]:
%%time
# Print a header row, then one "name | unique count | dtype" line per column.
# Timed because every column's unique count is computed separately.
print("Name | Unique # | Type")
unique_column_values(df)

In [None]:
# Columns kept for the reduced data set; everything else is dropped below.
select_columns = ['Date', 'Block', 'PrimaryType','Description', 'LocationDescription', 
                  'Arrest', 'Domestic', 'Latitude', 'Longitude']

In [None]:
# Narrow the frame to the selected columns and report the row count.
df = df[select_columns]
print("{:,} total records".format(len(df)))
df.head(2)

In [None]:
# Drop rows that are identical across all remaining columns and report the
# new row count. Dropping rows with missing values (.dropna()) is left disabled.
df = df.drop_duplicates() #.dropna()
print("{:,} total records".format(len(df)))
df.head(2)

In [None]:
# Keep only the rows whose Arrest flag is True, print how many there are,
# and preview the first few.
arrests_df = df[df.Arrest==True]
print("{:,} arrests".format(len(arrests_df)))
arrests_df.head()

In [None]:
# Domestic violence: keep only the rows whose Domestic flag is True, print
# how many there are, and preview the first few.
domestic_df = df[df.Domestic==True]
print("{:,} domestic violence reports".format(len(domestic_df)))
domestic_df.head()

In [None]:
%%time
# convert Date to pandas datetime format
# Output description handed to map_partitions: a Series named 'Date' holding
# datetimes. The unit is spelled out because a unit-less 'datetime64' is not
# accepted as a pandas dtype by current releases.
meta = ('Date', 'datetime64[ns]')
def parse_dates(df):
    """Return the 'Date' column of ``df`` parsed as pandas datetimes.

    The timestamps are month-first with a 12-hour clock
    (e.g. ``01/02/2017 03:04:05 PM`` is 2 January 2017, 15:04:05), so the
    format is given explicitly. Guessing the layout with ``dayfirst=True``
    would swap day and month on ambiguous dates.

    Parameters
    ----------
    df : DataFrame
        A frame (one partition when used with ``map_partitions``) that has a
        'Date' column.

    Returns
    -------
    Series
        The parsed 'Date' column.
    """
    return pd.to_datetime(df['Date'], format='%m/%d/%Y %I:%M:%S %p')
# An earlier attempt assigned the column directly:
#   df.Date = pd.to_datetime(df.Date, format='%m/%d/%Y %I:%M:%S %p')
# Instead, parse_dates is applied to every partition and the result is computed.
# NOTE(review): the parsed values end up in `res` only; df['Date'] is not
# reassigned in this cell -- confirm that is intended.
res = df.map_partitions(parse_dates, meta=meta).compute()

In [None]:
# Summarise the frame before the dtype conversion below.
df.info()

In [None]:
# Convert Date to a proper datetime dtype. The unit is written out
# ('datetime64[ns]') because casting to the unit-less 'datetime64' is rejected
# by current pandas releases; the explicit form works on older ones as well.
df = df.astype({'Date': 'datetime64[ns]'})


In [None]:
# Summarise the frame again to check the Date dtype after the conversion.
df.info()

In [None]:
# Write the reduced data set out in parquet format.
df.to_parquet('../data/Chicago-crimes-2017.parq')

In [None]:
%%time
# Read the parquet output back in and print its row count as a round-trip check.
df2 = dd.read_parquet('../data/Chicago-crimes-2017.parq')
print(len(df2))

In [None]:
# Summarise the reloaded frame.
df2.info()

In [None]:
# Per-column unique counts and dtypes for the reloaded frame.
unique_column_values(df2)