In [1]:
import dask.dataframe as dd

In [2]:
# load csv data into dask df
filename = '../data/Chicago-crimes-2017.csv'
df = dd.read_csv(filename, dtype='str')

In [37]:
# log records count and load data partitions
print("{:,} total records in {} partitions".format(len(df), df.npartitions))

125,770 total records in 1 partitions


In [6]:
# data size
df_size = df.size
df_size, type(df_size)

(dd.Scalar<size-ag..., dtype=int32>, dask.dataframe.core.Scalar)

In [17]:
%%time
df_size.compute()

Wall time: 1.19 s


2862926

In [39]:
%%time
# persist in memory
df = df.persist()
df.size.compute()

Wall time: 2.78 s


In [42]:
%%time
print("DataFrame size: {:,}".format(df.size.compute()))

DataFrame size: 1,257,700
Wall time: 15.6 ms


In [19]:
# get top 2 records
df.head(2)

Unnamed: 0,ID,CaseNumber,Date,Block,IUCR,PrimaryType,Description,LocationDescription,Arrest,Domestic,...,Ward,CommunityArea,FBICode,XCoordinate,YCoordinate,Year,UpdatedOn,Latitude,Longitude,Location
0,11007009,JA335947,7/4/2017 23:50,005XX W PEARSON ST,460,BATTERY,SIMPLE,RESIDENTIAL YARD (FRONT/BACK),False,False,...,27,8,08B,1172479,1905946,2017,7/11/2017 15:48,41.89735911,-87.64195001,"(41.897359112, -87.641950014)"
1,11005864,JA334658,7/4/2017 23:46,053XX W HIRSCH ST,460,BATTERY,SIMPLE,SIDEWALK,False,False,...,37,25,08B,1140517,1908799,2017,7/11/2017 15:48,41.90583487,-87.75927314,"(41.905834871, -87.759273141)"


In [4]:
# get last 2 records
df.tail(2)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
130131,10802448,JA101652,1/1/2017 0:00,006XX W 62ND ST,1310,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE-GARAGE,False,False,...,16,68,14,1173000.0,1863775.0,2017,2/14/2017 15:49,41.78162679,-87.64128444,"(41.781626794, -87.641284443)"
130132,10802691,JA101951,1/1/2017 0:00,035XX N CLARK ST,890,THEFT,FROM BUILDING,BAR OR TAVERN,False,False,...,44,6,6,,,2017,1/8/2017 15:48,,,


In [10]:
# strip out white space from column names
df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
df.head(2)

Unnamed: 0,ID,CaseNumber,Date,Block,IUCR,PrimaryType,Description,LocationDescription,Arrest,Domestic,...,Ward,CommunityArea,FBICode,XCoordinate,YCoordinate,Year,UpdatedOn,Latitude,Longitude,Location
0,11007009,JA335947,7/4/2017 23:50,005XX W PEARSON ST,460,BATTERY,SIMPLE,RESIDENTIAL YARD (FRONT/BACK),False,False,...,27,8,08B,1172479,1905946,2017,7/11/2017 15:48,41.89735911,-87.64195001,"(41.897359112, -87.641950014)"
1,11005864,JA334658,7/4/2017 23:46,053XX W HIRSCH ST,460,BATTERY,SIMPLE,SIDEWALK,False,False,...,37,25,08B,1140517,1908799,2017,7/11/2017 15:48,41.90583487,-87.75927314,"(41.905834871, -87.759273141)"


In [6]:
# list columns
df.columns

Index(['ID', 'CaseNumber', 'Date', 'Block', 'IUCR', 'PrimaryType',
       'Description', 'LocationDescription', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'CommunityArea', 'FBICode', 'XCoordinate',
       'YCoordinate', 'Year', 'UpdatedOn', 'Latitude', 'Longitude',
       'Location'],
      dtype='object')

In [8]:
# infer data types
df.dtypes

ID                     object
CaseNumber             object
Date                   object
Block                  object
IUCR                   object
PrimaryType            object
Description            object
LocationDescription    object
Arrest                 object
Domestic               object
Beat                   object
District               object
Ward                   object
CommunityArea          object
FBICode                object
XCoordinate            object
YCoordinate            object
Year                   object
UpdatedOn              object
Latitude               object
Longitude              object
Location               object
dtype: object

In [11]:
def unique_column_values(df):
    for column in df.columns:
        print("{} | {} | {}".format(
            df[column].name,
            len(df[column].unique()),
            df[column].dtype))

In [15]:
%%time
# print unique column values counts
print("Name | Unique # | Type")
unique_column_values(df)

Name | Unique # | Type
ID | 130133 | object
Case Number | 130105 | object
Date | 57788 | object
Block | 23684 | object
IUCR | 306 | object
Primary Type | 32 | object
Description | 285 | object
Location Description | 120 | object
Arrest | 2 | object
Domestic | 2 | object
Beat | 274 | object
District | 23 | object
Ward | 51 | object
Community Area | 78 | object
FBI Code | 26 | object
X Coordinate | 40181 | object
Y Coordinate | 51396 | object
Year | 1 | object
Updated On | 337 | object
Latitude | 74941 | object
Longitude | 74907 | object
Location | 75026 | object
Wall time: 23.4 s


In [20]:
# reduce data set
select_columns = ['Date', 'Block', 'Primary Type','Description', 'Location Description', 
                  'Arrest', 'Domestic', 'Year', 'Latitude', 'Longitude' ]

In [28]:
df = df[select_columns]
print("{:,} total records".format(len(df)))
df.head(2)

130,133 total records


Unnamed: 0,Date,Block,Primary Type,Description,Location Description,Arrest,Domestic,Year,Latitude,Longitude
0,7/4/2017 23:50,005XX W PEARSON ST,BATTERY,SIMPLE,RESIDENTIAL YARD (FRONT/BACK),False,False,2017,41.89735911,-87.64195001
1,7/4/2017 23:46,053XX W HIRSCH ST,BATTERY,SIMPLE,SIDEWALK,False,False,2017,41.90583487,-87.75927314


In [31]:
# drop duplicates
df = df.drop_duplicates().dropna()
print("{:,} total records".format(len(df)))
df.head(2)

125,770 total records


Unnamed: 0,Date,Block,Primary Type,Description,Location Description,Arrest,Domestic,Year,Latitude,Longitude
0,7/4/2017 23:50,005XX W PEARSON ST,BATTERY,SIMPLE,RESIDENTIAL YARD (FRONT/BACK),False,False,2017,41.89735911,-87.64195001
1,7/4/2017 23:46,053XX W HIRSCH ST,BATTERY,SIMPLE,SIDEWALK,False,False,2017,41.90583487,-87.75927314


In [32]:
# count arrests
arrests_df = df[df.Arrest=='TRUE']
arrests_df.Arrest.count()

dd.Scalar<series-..., dtype=int32>

In [35]:
#arrests_df.compute()
print("{:,} arrests".format(len(arrests_df)))
arrests_df.head(2)

24,832 arrests


Unnamed: 0,Date,Block,Primary Type,Description,Location Description,Arrest,Domestic,Year,Latitude,Longitude
7,7/4/2017 23:43,005XX S PULASKI RD,CRIMINAL TRESPASS,TO STATE SUP LAND,CTA STATION,True,False,2017,41.87390705,-87.72543009
8,7/4/2017 23:42,049XX W RACE AVE,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,SIDEWALK,True,False,2017,41.89044778,-87.74891071


In [43]:
# domestic violance
domestic_df = df[df.Domestic=='TRUE']
print("{:,} domestic violance reports".format(len(domestic_df)))
domestic_df.head()

20,949 domestic violance reports


Unnamed: 0,Date,Block,Primary Type,Description,Location Description,Arrest,Domestic,Year,Latitude,Longitude
4,7/4/2017 23:45,027XX N HAMPDEN CT,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,2017,41.93137342,-87.64188416
9,7/4/2017 23:40,032XX N OPAL AVE,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,False,True,2017,41.93842561,-87.82265066
10,7/4/2017 23:39,079XX S KINGSTON AVE,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,True,2017,41.75106476,-87.56258312
17,7/4/2017 23:31,070XX S PEORIA ST,THEFT,OVER $500,SIDEWALK,False,True,2017,41.7662868,-87.6468744
18,7/4/2017 23:30,081XX S MARSHFIELD AVE,BATTERY,AGGRAVATED DOMESTIC BATTERY: KNIFE/CUTTING INST,APARTMENT,False,True,2017,41.74589103,-87.66453806
