In [1]:
import dask.dataframe as dd

In [9]:
# load csv data into dask df
file_name = '../data/Chicago-crimes-2017.csv'
df = dd.read_csv(file_name, error_bad_lines=True, assume_missing=True) # dtype={'Ward': int}) #dtype='str')

In [10]:
# log records count and load data partitions
print("{:,} total records in {} partitions".format(len(df), df.npartitions))

130,133 total records in 1 partitions


In [11]:
# drop duplicates
df.drop_duplicates(subset=['ID', 'Case Number'], inplace=True)

Unnamed: 0_level_0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
,float64,object,object,object,object,object,object,object,bool,bool,float64,float64,float64,float64,object,float64,float64,float64,object,float64,float64,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [12]:
# data size
df_size = df.size
df_size, type(df_size)

(dd.Scalar<size-ag..., dtype=int32>, dask.dataframe.core.Scalar)

In [13]:
%%time
df_size.compute()

Wall time: 828 ms


2862926

In [14]:
%%time
# persist in memory
df = df.persist()
df.size.compute()

Wall time: 876 ms


In [15]:
%%time
print("DataFrame size: {:,}".format(df.size.compute()))

DataFrame size: 2,862,926
Wall time: 15.6 ms


In [16]:
# get top 2 records
df.head(2)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,11007009.0,JA335947,7/4/2017 23:50,005XX W PEARSON ST,460,BATTERY,SIMPLE,RESIDENTIAL YARD (FRONT/BACK),False,False,...,27.0,8.0,08B,1172479.0,1905946.0,2017.0,7/11/2017 15:48,41.897359,-87.64195,"(41.897359112, -87.641950014)"
1,11005864.0,JA334658,7/4/2017 23:46,053XX W HIRSCH ST,460,BATTERY,SIMPLE,SIDEWALK,False,False,...,37.0,25.0,08B,1140517.0,1908799.0,2017.0,7/11/2017 15:48,41.905835,-87.759273,"(41.905834871, -87.759273141)"


In [17]:
# get last 2 records
df.tail(2)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
130131,10802448.0,JA101652,1/1/2017 0:00,006XX W 62ND ST,1310,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE-GARAGE,False,False,...,16.0,68.0,14,1173000.0,1863775.0,2017.0,2/14/2017 15:49,41.781627,-87.641284,"(41.781626794, -87.641284443)"
130132,10802691.0,JA101951,1/1/2017 0:00,035XX N CLARK ST,890,THEFT,FROM BUILDING,BAR OR TAVERN,False,False,...,44.0,6.0,6,,,2017.0,1/8/2017 15:48,,,


In [18]:
# strip out white space from column names
df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
df.head(2)

Unnamed: 0,ID,CaseNumber,Date,Block,IUCR,PrimaryType,Description,LocationDescription,Arrest,Domestic,...,Ward,CommunityArea,FBICode,XCoordinate,YCoordinate,Year,UpdatedOn,Latitude,Longitude,Location
0,11007009.0,JA335947,7/4/2017 23:50,005XX W PEARSON ST,460,BATTERY,SIMPLE,RESIDENTIAL YARD (FRONT/BACK),False,False,...,27.0,8.0,08B,1172479.0,1905946.0,2017.0,7/11/2017 15:48,41.897359,-87.64195,"(41.897359112, -87.641950014)"
1,11005864.0,JA334658,7/4/2017 23:46,053XX W HIRSCH ST,460,BATTERY,SIMPLE,SIDEWALK,False,False,...,37.0,25.0,08B,1140517.0,1908799.0,2017.0,7/11/2017 15:48,41.905835,-87.759273,"(41.905834871, -87.759273141)"


In [19]:
# list columns
df.columns

Index(['ID', 'CaseNumber', 'Date', 'Block', 'IUCR', 'PrimaryType',
       'Description', 'LocationDescription', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'CommunityArea', 'FBICode', 'XCoordinate',
       'YCoordinate', 'Year', 'UpdatedOn', 'Latitude', 'Longitude',
       'Location'],
      dtype='object')

In [20]:
# infer data types
df.dtypes

ID                     float64
CaseNumber              object
Date                    object
Block                   object
IUCR                    object
PrimaryType             object
Description             object
LocationDescription     object
Arrest                    bool
Domestic                  bool
Beat                   float64
District               float64
Ward                   float64
CommunityArea          float64
FBICode                 object
XCoordinate            float64
YCoordinate            float64
Year                   float64
UpdatedOn               object
Latitude               float64
Longitude              float64
Location                object
dtype: object

In [21]:
def unique_column_values(df):
    for column in df.columns:
        print("{} | {} | {}".format(
            df[column].name,
            len(df[column].unique()),
            df[column].dtype))

In [22]:
%%time
# print unique column values counts
print("Name | Unique # | Type")
unique_column_values(df)

Name | Unique # | Type
ID | 130133 | float64
CaseNumber | 130105 | object
Date | 57788 | object
Block | 23684 | object
IUCR | 306 | object
PrimaryType | 32 | object
Description | 285 | object
LocationDescription | 120 | object
Arrest | 2 | bool
Domestic | 2 | bool
Beat | 274 | float64
District | 23 | float64
Ward | 51 | float64
CommunityArea | 78 | float64
FBICode | 26 | object
XCoordinate | 40181 | float64
YCoordinate | 51396 | float64
Year | 1 | float64
UpdatedOn | 337 | object
Latitude | 74941 | float64
Longitude | 74907 | float64
Location | 75026 | object
Wall time: 1.32 s


In [25]:
# reduce data set
select_columns = ['Date', 'Block', 'PrimaryType','Description', 'LocationDescription', 
                  'Arrest', 'Domestic', 'Year', 'Latitude', 'Longitude' ]

In [26]:
df = df[select_columns]
print("{:,} total records".format(len(df)))
df.head(2)

130,133 total records


Unnamed: 0,Date,Block,PrimaryType,Description,LocationDescription,Arrest,Domestic,Year,Latitude,Longitude
0,7/4/2017 23:50,005XX W PEARSON ST,BATTERY,SIMPLE,RESIDENTIAL YARD (FRONT/BACK),False,False,2017.0,41.897359,-87.64195
1,7/4/2017 23:46,053XX W HIRSCH ST,BATTERY,SIMPLE,SIDEWALK,False,False,2017.0,41.905835,-87.759273


In [27]:
# drop duplicates
df = df.drop_duplicates().dropna()
print("{:,} total records".format(len(df)))
df.head(2)

125,770 total records


Unnamed: 0,Date,Block,PrimaryType,Description,LocationDescription,Arrest,Domestic,Year,Latitude,Longitude
0,7/4/2017 23:50,005XX W PEARSON ST,BATTERY,SIMPLE,RESIDENTIAL YARD (FRONT/BACK),False,False,2017.0,41.897359,-87.64195
1,7/4/2017 23:46,053XX W HIRSCH ST,BATTERY,SIMPLE,SIDEWALK,False,False,2017.0,41.905835,-87.759273


In [29]:
# count arrests
arrests_df = df[df.Arrest==True]
arrests_df.Arrest.count()

dd.Scalar<series-..., dtype=int32>

In [30]:
#arrests_df.compute()
print("{:,} arrests".format(len(arrests_df)))
arrests_df.head(2)

24,832 arrests


Unnamed: 0,Date,Block,PrimaryType,Description,LocationDescription,Arrest,Domestic,Year,Latitude,Longitude
7,7/4/2017 23:43,005XX S PULASKI RD,CRIMINAL TRESPASS,TO STATE SUP LAND,CTA STATION,True,False,2017.0,41.873907,-87.72543
8,7/4/2017 23:42,049XX W RACE AVE,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,SIDEWALK,True,False,2017.0,41.890448,-87.748911


In [31]:
# domestic violance
domestic_df = df[df.Domestic==True]
print("{:,} domestic violance reports".format(len(domestic_df)))
domestic_df.head()

20,949 domestic violance reports


Unnamed: 0,Date,Block,PrimaryType,Description,LocationDescription,Arrest,Domestic,Year,Latitude,Longitude
4,7/4/2017 23:45,027XX N HAMPDEN CT,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,2017.0,41.931373,-87.641884
9,7/4/2017 23:40,032XX N OPAL AVE,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,False,True,2017.0,41.938426,-87.822651
10,7/4/2017 23:39,079XX S KINGSTON AVE,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,True,2017.0,41.751065,-87.562583
17,7/4/2017 23:31,070XX S PEORIA ST,THEFT,OVER $500,SIDEWALK,False,True,2017.0,41.766287,-87.646874
18,7/4/2017 23:30,081XX S MARSHFIELD AVE,BATTERY,AGGRAVATED DOMESTIC BATTERY: KNIFE/CUTTING INST,APARTMENT,False,True,2017.0,41.745891,-87.664538
