In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [2]:
# Lazily read the 2017 crimes CSV into a dask dataframe.
file_name = '../raw_data/Crimes_-_2017.csv'
read_options = dict(
    # skip malformed rows instead of raising
    # NOTE(review): pandas deprecated this keyword in 1.3 in favour of
    # on_bad_lines='skip' -- confirm against the pinned pandas version.
    error_bad_lines=False,
    # integer columns without an explicit dtype are read as floats so that
    # missing values can be represented
    assume_missing=True,
)
df = dd.read_csv(file_name, **read_options)

In [3]:
%%time
# log records count and load data partitions
print("{:,} total records in {} partitions".format(len(df), df.npartitions))

157,802 total records in 1 partitions
Wall time: 1.07 s


In [4]:
%%time
# drop duplicates
df.drop_duplicates(subset=['ID', 'Case Number'], inplace=True)

Wall time: 9.77 ms


Unnamed: 0_level_0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
,float64,object,object,object,object,object,object,object,bool,bool,float64,float64,float64,float64,object,float64,float64,float64,object,float64,float64,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [5]:
%%time
# Keep the computed dataframe in memory so that later cells reuse it instead
# of re-reading the CSV on every computation.
df = df.persist()
# Trigger a full computation of the element count (rows x columns); the next
# cell prints the same value, this call mainly serves to time the first pass.
df.size.compute()

Wall time: 1.07 s


In [6]:
%%time
print("DataFrame size: {:,}".format(df.size.compute()))

DataFrame size: 3,471,644
Wall time: 17.6 ms


In [7]:
# preview the first two rows
df.head(n=2)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,11046781.0,JA381948,08/07/2017 12:00:00 PM,091XX S ESSEX AVE,560,ASSAULT,SIMPLE,APARTMENT,False,True,...,7.0,48.0,08A,1194423.0,1844888.0,2017.0,08/14/2017 04:00:25 PM,41.729299,-87.563364,"(41.729299431, -87.563363564)"
1,11046780.0,JA381931,08/07/2017 09:30:00 PM,078XX S KINGSTON AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,...,7.0,43.0,08B,1194550.0,1853469.0,2017.0,08/14/2017 04:00:25 PM,41.752843,-87.562617,"(41.752843296, -87.562616815)"


In [8]:
# preview the final two rows
df.tail(n=2)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
157800,10802618.0,JA100607,01/01/2017 01:00:00 PM,003XX S HOYNE AVE,820,THEFT,$500 AND UNDER,STREET,False,False,...,2.0,28.0,6,,,2017.0,01/08/2017 03:48:20 PM,,,
157801,10802303.0,JA101463,01/01/2017 07:00:00 PM,011XX E BOWEN AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,4.0,36.0,7,,,2017.0,01/08/2017 03:48:20 PM,,,


In [9]:
# Remove space characters from every column name
# (e.g. 'Case Number' -> 'CaseNumber') so columns can be used as attributes.
space_free_names = {name: name.replace(' ', '') for name in df.columns}
df = df.rename(columns=space_free_names)
df.head(n=2)

Unnamed: 0,ID,CaseNumber,Date,Block,IUCR,PrimaryType,Description,LocationDescription,Arrest,Domestic,...,Ward,CommunityArea,FBICode,XCoordinate,YCoordinate,Year,UpdatedOn,Latitude,Longitude,Location
0,11046781.0,JA381948,08/07/2017 12:00:00 PM,091XX S ESSEX AVE,560,ASSAULT,SIMPLE,APARTMENT,False,True,...,7.0,48.0,08A,1194423.0,1844888.0,2017.0,08/14/2017 04:00:25 PM,41.729299,-87.563364,"(41.729299431, -87.563363564)"
1,11046780.0,JA381931,08/07/2017 09:30:00 PM,078XX S KINGSTON AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,...,7.0,43.0,08B,1194550.0,1853469.0,2017.0,08/14/2017 04:00:25 PM,41.752843,-87.562617,"(41.752843296, -87.562616815)"


In [10]:
# list the column names (spaces were removed by the rename above)
df.columns

Index(['ID', 'CaseNumber', 'Date', 'Block', 'IUCR', 'PrimaryType',
       'Description', 'LocationDescription', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'CommunityArea', 'FBICode', 'XCoordinate',
       'YCoordinate', 'Year', 'UpdatedOn', 'Latitude', 'Longitude',
       'Location'],
      dtype='object')

In [11]:
# show the dtype of each column as read from the CSV
# (inspection only -- nothing is inferred or converted here)
df.dtypes

ID                     float64
CaseNumber              object
Date                    object
Block                   object
IUCR                    object
PrimaryType             object
Description             object
LocationDescription     object
Arrest                    bool
Domestic                  bool
Beat                   float64
District               float64
Ward                   float64
CommunityArea          float64
FBICode                 object
XCoordinate            float64
YCoordinate            float64
Year                   float64
UpdatedOn               object
Latitude               float64
Longitude              float64
Location                object
dtype: object

In [12]:
def unique_column_values(df):
    """Print one ``name | distinct-value count | dtype`` line per column.

    Args:
        df: dataframe whose columns are summarised. Each column must expose
            ``unique()``, ``name`` and ``dtype``.

    Returns:
        None. The summary is written to standard output.
    """
    row_template = "{} | {} | {}"
    for column_name in df.columns:
        series = df[column_name]
        # length of the distinct-values collection; this counts a missing
        # value as a distinct entry whenever unique() reports it
        distinct_count = len(series.unique())
        print(row_template.format(series.name, distinct_count, series.dtype))

In [13]:
%%time
# print unique column values counts
print("Name | Unique # | Type")
unique_column_values(df)

Name | Unique # | Type
ID | 157802 | float64
CaseNumber | 157772 | object
Date | 69367 | object
Block | 24878 | object
IUCR | 314 | object
PrimaryType | 32 | object
Description | 293 | object
LocationDescription | 122 | object
Arrest | 2 | bool
Domestic | 2 | bool
Beat | 274 | float64
District | 23 | float64
Ward | 51 | float64
CommunityArea | 78 | float64
FBICode | 26 | object
XCoordinate | 43151 | float64
YCoordinate | 56811 | float64
Year | 1 | float64
UpdatedOn | 405 | object
Latitude | 86922 | float64
Longitude | 86915 | float64
Location | 86933 | object
Wall time: 1.48 s


In [14]:
# Columns kept for the rest of the analysis; every other column is dropped
# when the dataframe is projected onto this list.
select_columns = [
    'Date',
    'Block',
    'PrimaryType',
    'Description',
    'LocationDescription',
    'Arrest',
    'Domestic',
    'Latitude',
    'Longitude',
]

In [15]:
# Keep only the selected columns; the row count is unaffected by the projection.
df = df[select_columns]
row_total = len(df)
print("{:,} total records".format(row_total))
df.head(n=2)

157,802 total records


Unnamed: 0,Date,Block,PrimaryType,Description,LocationDescription,Arrest,Domestic,Latitude,Longitude
0,08/07/2017 12:00:00 PM,091XX S ESSEX AVE,ASSAULT,SIMPLE,APARTMENT,False,True,41.729299,-87.563364
1,08/07/2017 09:30:00 PM,078XX S KINGSTON AVE,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,41.752843,-87.562617


In [16]:
# Remove rows that are identical across all of the retained columns.
# NOTE(review): the recorded run goes from 157,802 to 157,543 rows here even
# though every ID was unique before the projection, so the 259 removed rows
# were separate reports that merely share these attributes -- confirm that
# collapsing them is intended.
df = df.drop_duplicates()
row_total = len(df)
print("{:,} total records".format(row_total))
df.head(n=2)

157,543 total records


Unnamed: 0,Date,Block,PrimaryType,Description,LocationDescription,Arrest,Domestic,Latitude,Longitude
0,08/07/2017 12:00:00 PM,091XX S ESSEX AVE,ASSAULT,SIMPLE,APARTMENT,False,True,41.729299,-87.563364
1,08/07/2017 09:30:00 PM,078XX S KINGSTON AVE,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,41.752843,-87.562617


In [17]:
# count arrests
# Arrest is a boolean column, so it can be used directly as the row mask.
arrest_mask = df.Arrest
arrests_df = df[arrest_mask]
arrest_total = len(arrests_df)
print("{:,} arrests".format(arrest_total))
arrests_df.head()

30,314 arrests


Unnamed: 0,Date,Block,PrimaryType,Description,LocationDescription,Arrest,Domestic,Latitude,Longitude
2,08/07/2017 09:15:00 PM,012XX W 79TH ST,ROBBERY,STRONGARM - NO WEAPON,PARK PROPERTY,True,False,41.750471,-87.65556
8,08/07/2017 11:09:00 PM,103XX S WENTWORTH AVE,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,STREET,True,False,41.706647,-87.628261
9,08/07/2017 10:53:00 PM,050XX S LAFLIN ST,BATTERY,SIMPLE,OTHER,True,False,41.802233,-87.662372
12,08/07/2017 10:42:00 PM,003XX W GRAND AVE,NARCOTICS,POSS: HEROIN(WHITE),SIDEWALK,True,False,41.891553,-87.636537
13,08/07/2017 11:29:00 PM,056XX S BISHOP ST,NARCOTICS,MANU/DELIVER:CRACK,SIDEWALK,True,False,41.791601,-87.66088


In [18]:
# Domestic crime reports: Domestic is a boolean column, used directly as the row mask.
domestic_mask = df.Domestic
domestic_df = df[domestic_mask]
domestic_total = len(domestic_df)
print("{:,} domestic crime reports".format(domestic_total))
domestic_df.head()

25,873 domestic crime reports


Unnamed: 0,Date,Block,PrimaryType,Description,LocationDescription,Arrest,Domestic,Latitude,Longitude
0,08/07/2017 12:00:00 PM,091XX S ESSEX AVE,ASSAULT,SIMPLE,APARTMENT,False,True,41.729299,-87.563364
1,08/07/2017 09:30:00 PM,078XX S KINGSTON AVE,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,41.752843,-87.562617
4,08/07/2017 11:25:00 AM,069XX S STATE ST,OTHER OFFENSE,HARASSMENT BY TELEPHONE,STREET,False,True,41.768728,-87.624946
6,08/07/2017 12:00:00 PM,026XX S CALIFORNIA AVE,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,False,True,41.844163,-87.695199
7,08/07/2017 09:00:00 AM,035XX N KOSTNER AVE,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,False,True,41.945256,-87.737203


In [19]:
# summarise the reduced dataframe: number of columns and a count per dtype
df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 9 entries, Date to Longitude
dtypes: object(5), bool(2), float64(2)