In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [2]:
# Load the crimes CSV lazily into a dask DataFrame.
file_name = '../raw_data/Crimes_-_2017.csv'
df = dd.read_csv(
    file_name,
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0,
    # where it raises a TypeError. `on_bad_lines='warn'` is the replacement
    # with the same effect: malformed rows are skipped and a warning is emitted.
    on_bad_lines='warn',
    # Treat integer-looking columns as float64 so that missing values do not
    # break dask's dtype inference.
    assume_missing=True,
)

In [3]:
%%time
# report how many rows were read and how many partitions dask split them into
record_count = len(df)
partition_count = df.npartitions
print("{:,} total records in {} partitions".format(record_count, partition_count))

160,182 total records in 1 partitions
Wall time: 1.46 s


In [4]:
%%time
# Drop rows that repeat the same (ID, Case Number) pair.
# dask does not support `inplace=True`: drop_duplicates returns a new lazy
# DataFrame, so the result has to be bound back to `df`. The previous call
# discarded its return value and left `df` untouched.
df = df.drop_duplicates(subset=['ID', 'Case Number'])

Wall time: 9.77 ms


Unnamed: 0_level_0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
,float64,object,object,object,object,object,object,object,bool,bool,float64,float64,float64,float64,object,float64,float64,float64,object,float64,float64,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [5]:
%%time
# persist in memory: rebind df to the persisted collection; the intent is that
# later cells work from already-loaded partitions
df = df.persist()
# evaluate the total element count of the persisted frame
df.size.compute()

Wall time: 1.08 s


In [6]:
%%time
# total number of elements in the frame, formatted with thousands separators
element_count = df.size.compute()
print("DataFrame size: {:,}".format(element_count))

DataFrame size: 3,524,004
Wall time: 16.6 ms


In [7]:
# preview the first two rows
df.head(n=2)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,23507.0,JA385764,08/10/2017 04:04:00 PM,041XX S WALLACE ST,110,HOMICIDE,FIRST DEGREE MURDER,HOUSE,True,False,...,11.0,61.0,01A,1172991.0,1877172.0,2017.0,08/17/2017 03:54:34 PM,41.81839,-87.640922,"(41.818389748, -87.640921789)"
1,23506.0,JA384949,08/10/2017 02:22:00 AM,012XX N GREENVIEW AVE,110,HOMICIDE,FIRST DEGREE MURDER,AUTO,False,False,...,1.0,24.0,01A,1166164.0,1908438.0,2017.0,08/17/2017 03:54:34 PM,41.904335,-87.665073,"(41.904334636, -87.665072774)"


In [8]:
# preview the last two rows
df.tail(n=2)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
160180,10802618.0,JA100607,01/01/2017 01:00:00 PM,003XX S HOYNE AVE,820,THEFT,$500 AND UNDER,STREET,False,False,...,2.0,28.0,6,,,2017.0,01/08/2017 03:48:20 PM,,,
160181,10802303.0,JA101463,01/01/2017 07:00:00 PM,011XX E BOWEN AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,4.0,36.0,7,,,2017.0,01/08/2017 03:48:20 PM,,,


In [9]:
# remove embedded spaces from the column names, e.g. 'Case Number' -> 'CaseNumber'
compact_names = {}
for original_name in df.columns:
    compact_names[original_name] = original_name.replace(' ', '')
df = df.rename(columns=compact_names)
df.head(n=2)

Unnamed: 0,ID,CaseNumber,Date,Block,IUCR,PrimaryType,Description,LocationDescription,Arrest,Domestic,...,Ward,CommunityArea,FBICode,XCoordinate,YCoordinate,Year,UpdatedOn,Latitude,Longitude,Location
0,23507.0,JA385764,08/10/2017 04:04:00 PM,041XX S WALLACE ST,110,HOMICIDE,FIRST DEGREE MURDER,HOUSE,True,False,...,11.0,61.0,01A,1172991.0,1877172.0,2017.0,08/17/2017 03:54:34 PM,41.81839,-87.640922,"(41.818389748, -87.640921789)"
1,23506.0,JA384949,08/10/2017 02:22:00 AM,012XX N GREENVIEW AVE,110,HOMICIDE,FIRST DEGREE MURDER,AUTO,False,False,...,1.0,24.0,01A,1166164.0,1908438.0,2017.0,08/17/2017 03:54:34 PM,41.904335,-87.665073,"(41.904334636, -87.665072774)"


In [10]:
# list the column names (spaces were removed by the rename above)
df.columns

Index(['ID', 'CaseNumber', 'Date', 'Block', 'IUCR', 'PrimaryType',
       'Description', 'LocationDescription', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'CommunityArea', 'FBICode', 'XCoordinate',
       'YCoordinate', 'Year', 'UpdatedOn', 'Latitude', 'Longitude',
       'Location'],
      dtype='object')

In [11]:
# show the dtype of each column (this only displays them; nothing is re-inferred here)
df.dtypes

ID                     float64
CaseNumber              object
Date                    object
Block                   object
IUCR                    object
PrimaryType             object
Description             object
LocationDescription     object
Arrest                    bool
Domestic                  bool
Beat                   float64
District               float64
Ward                   float64
CommunityArea          float64
FBICode                 object
XCoordinate            float64
YCoordinate            float64
Year                   float64
UpdatedOn               object
Latitude               float64
Longitude              float64
Location                object
dtype: object

In [12]:
def unique_column_values(df):
    """Print one ``name | distinct-count | dtype`` line per column of ``df``.

    The distinct count is the length of ``Series.unique()``, so a missing
    value counts as one distinct entry. Nothing is returned; the report is
    written to stdout.
    """
    for label in df.columns:
        series = df[label]
        distinct_values = series.unique()
        line = "{} | {} | {}".format(series.name, len(distinct_values), series.dtype)
        print(line)

In [13]:
%%time
# header row first, then one "name | distinct count | dtype" line per column
header = "Name | Unique # | Type"
print(header)
unique_column_values(df)

Name | Unique # | Type
ID | 160182 | float64
CaseNumber | 160152 | object
Date | 70364 | object
Block | 24966 | object
IUCR | 314 | object
PrimaryType | 32 | object
Description | 293 | object
LocationDescription | 122 | object
Arrest | 2 | bool
Domestic | 2 | bool
Beat | 274 | float64
District | 23 | float64
Ward | 51 | float64
CommunityArea | 78 | float64
FBICode | 26 | object
XCoordinate | 43368 | float64
YCoordinate | 57212 | float64
Year | 1 | float64
UpdatedOn | 411 | object
Latitude | 87855 | float64
Longitude | 87848 | float64
Location | 87866 | object
Wall time: 1.5 s


In [14]:
# columns kept for the reduced data set
select_columns = [
    'Date',
    'Block',
    'PrimaryType',
    'Description',
    'LocationDescription',
    'Arrest',
    'Domestic',
    'Latitude',
    'Longitude',
]

In [15]:
# narrow the frame to the selected columns, then report the row count
df = df[select_columns]
row_total = len(df)
print("{:,} total records".format(row_total))
df.head(n=2)

160,182 total records


Unnamed: 0,Date,Block,PrimaryType,Description,LocationDescription,Arrest,Domestic,Latitude,Longitude
0,08/10/2017 04:04:00 PM,041XX S WALLACE ST,HOMICIDE,FIRST DEGREE MURDER,HOUSE,True,False,41.81839,-87.640922
1,08/10/2017 02:22:00 AM,012XX N GREENVIEW AVE,HOMICIDE,FIRST DEGREE MURDER,AUTO,False,False,41.904335,-87.665073


In [16]:
# collapse rows that are identical across every remaining column
df = df.drop_duplicates()
remaining_rows = len(df)
print("{:,} total records".format(remaining_rows))
df.head(n=2)

159,919 total records


Unnamed: 0,Date,Block,PrimaryType,Description,LocationDescription,Arrest,Domestic,Latitude,Longitude
0,08/10/2017 04:04:00 PM,041XX S WALLACE ST,HOMICIDE,FIRST DEGREE MURDER,HOUSE,True,False,41.81839,-87.640922
1,08/10/2017 02:22:00 AM,012XX N GREENVIEW AVE,HOMICIDE,FIRST DEGREE MURDER,AUTO,False,False,41.904335,-87.665073


In [17]:
# rows where an arrest was made ('Arrest' is a bool column, so it serves as the mask)
arrest_mask = df['Arrest']
arrests_df = df[arrest_mask]
print("{:,} arrests".format(len(arrests_df)))
arrests_df.head()

30,819 arrests


Unnamed: 0,Date,Block,PrimaryType,Description,LocationDescription,Arrest,Domestic,Latitude,Longitude
0,08/10/2017 04:04:00 PM,041XX S WALLACE ST,HOMICIDE,FIRST DEGREE MURDER,HOUSE,True,False,41.81839,-87.640922
34,07/31/2017 08:30:00 PM,0000X S CICERO AVE,OTHER OFFENSE,OTHER VEHICLE OFFENSE,STREET,True,False,,
38,07/14/2017 07:51:00 PM,044XX W JACKSON BLVD,OTHER OFFENSE,OTHER VEHICLE OFFENSE,STREET,True,False,,
70,08/10/2017 09:09:00 PM,081XX W HIGGINS RD,PROSTITUTION,SOLICIT FOR BUSINESS,HOTEL/MOTEL,True,False,41.985421,-87.829757
146,08/10/2017 09:00:00 PM,055XX N KENMORE AVE,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,True,41.982517,-87.656584


In [18]:
# domestic violence: rows flagged as domestic ('Domestic' is a bool column, so it serves as the mask)
domestic_mask = df['Domestic']
domestic_df = df[domestic_mask]
print("{:,} domestic crime reports".format(len(domestic_df)))
domestic_df.head()

26,261 domestic crime reports


Unnamed: 0,Date,Block,PrimaryType,Description,LocationDescription,Arrest,Domestic,Latitude,Longitude
2,08/04/2017 09:00:00 PM,017XX N MASON AVE,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,True,,
10,06/06/2017 08:32:00 AM,071XX S VINCENNES AVE,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,True,,
11,07/25/2017 03:00:00 PM,064XX S DR MARTIN LUTHER KING JR DR,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,True,,
18,08/08/2017 11:00:00 PM,059XX S TROY ST,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,,
36,08/02/2017 09:30:00 PM,001XX S HALSTED ST,BATTERY,DOMESTIC BATTERY SIMPLE,OTHER,False,True,,


In [19]:
# summarise the reduced frame: column count and dtype breakdown
df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 9 entries, Date to Longitude
dtypes: object(5), bool(2), float64(2)