## Chicago Crime Data Analysis
This project is based on Chicago crime data downloaded from the [Chicago Data Portal](https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-Present/ijzp-q8t2). The data set includes crimes reported in Chicago from 2001 to the present.

In [0]:
%fs ls /mnt/isa460/data/chicago_crime

path,name,size
dbfs:/mnt/isa460/data/chicago_crime/Crimes2001_2021.csv,Crimes2001_2021.csv,1952853549
dbfs:/mnt/isa460/data/chicago_crime/parquet/,parquet/,0
dbfs:/mnt/isa460/data/chicago_crime/policestation.csv,policestation.csv,5723


In [0]:
# Read the raw crime CSV from the mounted storage folder. The first row
# supplies the column names and Spark infers each column's type.

df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/mnt/isa460/data/chicago_crime/Crimes2001_2021.csv")
)


In [0]:
# Print the column names and the types Spark inferred when reading the CSV.
df.printSchema()

In [0]:
# Show the raw DataFrame using the notebook's display helper.
display(df)

ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards,Boundaries - ZIP Codes,Police Districts,Police Beats
10224738,HY411648,09/05/2015 01:30:00 PM,043XX S WOOD ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,924,9,12,61,08B,1165074.0,1875917.0,2015,02/10/2018 03:50:01 PM,41.815117282,-87.669999562,"(41.815117282, -87.669999562)",29.0,14924.0,59.0,706.0,3.0,37.0,23.0,108.0
10224739,HY411615,09/04/2015 11:30:00 AM,008XX N CENTRAL AVE,0870,THEFT,POCKET-PICKING,CTA BUS,False,False,1511,15,29,25,06,1138875.0,1904869.0,2015,02/10/2018 03:50:01 PM,41.895080471,-87.765400451,"(41.895080471, -87.765400451)",4.0,4299.0,26.0,562.0,45.0,5.0,25.0,67.0
11646166,JC213529,09/01/2018 12:01:00 AM,082XX S INGLESIDE AVE,0810,THEFT,OVER $500,RESIDENCE,False,True,631,6,8,44,06,,,2018,04/06/2019 04:04:43 PM,,,,,,,,,,,
10224740,HY411595,09/05/2015 12:45:00 PM,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,1412,14,35,21,18,1152037.0,1920384.0,2015,02/10/2018 03:50:01 PM,41.937405765,-87.716649687,"(41.937405765, -87.716649687)",15.0,21538.0,22.0,216.0,12.0,39.0,7.0,168.0
10224741,HY411610,09/05/2015 01:00:00 PM,0000X N LARAMIE AVE,0560,ASSAULT,SIMPLE,APARTMENT,False,True,1522,15,28,25,08A,1141706.0,1900086.0,2015,02/10/2018 03:50:01 PM,41.881903443,-87.755121152,"(41.881903443, -87.755121152)",11.0,22216.0,26.0,696.0,23.0,32.0,25.0,81.0
10224742,HY411435,09/05/2015 10:55:00 AM,082XX S LOOMIS BLVD,0610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,False,False,614,6,21,71,05,1168430.0,1850165.0,2015,02/10/2018 03:50:01 PM,41.744378879,-87.658430635,"(41.744378879, -87.658430635)",18.0,21554.0,70.0,575.0,13.0,59.0,20.0,237.0
10224743,HY411629,09/04/2015 06:00:00 PM,021XX W CHURCHILL ST,0620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE-GARAGE,False,False,1434,14,32,24,05,1161628.0,1912157.0,2015,02/10/2018 03:50:01 PM,41.914635603,-87.681630909,"(41.914635603, -87.681630909)",16.0,22535.0,25.0,179.0,40.0,1.0,7.0,192.0
10224744,HY411605,09/05/2015 01:00:00 PM,025XX W CERMAK RD,0860,THEFT,RETAIL THEFT,GROCERY FOOD STORE,True,False,1034,10,25,31,06,1159734.0,1889313.0,2015,09/17/2015 11:37:18 AM,41.851988885,-87.689219118,"(41.851988885, -87.689219118)",11.0,14920.0,32.0,203.0,23.0,43.0,21.0,151.0
10224745,HY411654,09/05/2015 11:30:00 AM,031XX W WASHINGTON BLVD,0320,ROBBERY,STRONGARM - NO WEAPON,STREET,False,True,1222,12,27,27,03,1155536.0,1900515.0,2015,02/10/2018 03:50:01 PM,41.88281374,-87.704325717,"(41.88281374, -87.704325717)",41.0,21184.0,28.0,50.0,46.0,28.0,15.0,77.0
11645836,JC212333,05/01/2016 12:25:00 AM,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,824,8,15,63,11,,,2016,04/06/2019 04:04:43 PM,,,,,,,,,,,


In [0]:
# Count the rows in the raw DataFrame.
df.count()

## ETL

### Replace spaces in column names with `_` and convert the names to lowercase.

In [0]:
from pyspark.sql.functions import *

# Normalise every column name in one pass: spaces become underscores and
# letters are lower-cased.
# The comprehension variable is deliberately not called `col`, so the `col`
# function pulled in by the wildcard import above is not overwritten by a
# plain string (which would break later cells that call col(...)).
columns=df.columns
df=df.toDF(*[name.replace(" ", "_").lower() for name in columns])


### Change date from string type to timestamp type

In [0]:
from pyspark.sql.functions import to_timestamp
# Show the built-in documentation for to_timestamp (exploratory lookup).
help(to_timestamp)

In [0]:
from pyspark.sql.functions import to_timestamp, col

# Parse the crime date string (e.g. "09/05/2015 01:30:00 PM") into a timestamp.
# The target column is written as lower-case 'date' so it keeps the lower-case
# naming applied to every column earlier; writing to 'Date' here would rename
# the column back to mixed case.
crime_df = df.withColumn('date',to_timestamp(col('date'),'MM/dd/yyyy hh:mm:ss a'))
crime_df.printSchema()

In [0]:
# Show the DataFrame after the date column has been converted to a timestamp.
display(crime_df)

id,case_number,Date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location,historical_wards_2003-2015,zip_codes,community_areas,census_tracts,wards,boundaries_-_zip_codes,police_districts,police_beats
10224738,HY411648,2015-09-05T13:30:00.000+0000,043XX S WOOD ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,924,9,12,61,08B,1165074.0,1875917.0,2015,02/10/2018 03:50:01 PM,41.815117282,-87.669999562,"(41.815117282, -87.669999562)",29.0,14924.0,59.0,706.0,3.0,37.0,23.0,108.0
10224739,HY411615,2015-09-04T11:30:00.000+0000,008XX N CENTRAL AVE,0870,THEFT,POCKET-PICKING,CTA BUS,False,False,1511,15,29,25,06,1138875.0,1904869.0,2015,02/10/2018 03:50:01 PM,41.895080471,-87.765400451,"(41.895080471, -87.765400451)",4.0,4299.0,26.0,562.0,45.0,5.0,25.0,67.0
11646166,JC213529,2018-09-01T00:01:00.000+0000,082XX S INGLESIDE AVE,0810,THEFT,OVER $500,RESIDENCE,False,True,631,6,8,44,06,,,2018,04/06/2019 04:04:43 PM,,,,,,,,,,,
10224740,HY411595,2015-09-05T12:45:00.000+0000,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,1412,14,35,21,18,1152037.0,1920384.0,2015,02/10/2018 03:50:01 PM,41.937405765,-87.716649687,"(41.937405765, -87.716649687)",15.0,21538.0,22.0,216.0,12.0,39.0,7.0,168.0
10224741,HY411610,2015-09-05T13:00:00.000+0000,0000X N LARAMIE AVE,0560,ASSAULT,SIMPLE,APARTMENT,False,True,1522,15,28,25,08A,1141706.0,1900086.0,2015,02/10/2018 03:50:01 PM,41.881903443,-87.755121152,"(41.881903443, -87.755121152)",11.0,22216.0,26.0,696.0,23.0,32.0,25.0,81.0
10224742,HY411435,2015-09-05T10:55:00.000+0000,082XX S LOOMIS BLVD,0610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,False,False,614,6,21,71,05,1168430.0,1850165.0,2015,02/10/2018 03:50:01 PM,41.744378879,-87.658430635,"(41.744378879, -87.658430635)",18.0,21554.0,70.0,575.0,13.0,59.0,20.0,237.0
10224743,HY411629,2015-09-04T18:00:00.000+0000,021XX W CHURCHILL ST,0620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE-GARAGE,False,False,1434,14,32,24,05,1161628.0,1912157.0,2015,02/10/2018 03:50:01 PM,41.914635603,-87.681630909,"(41.914635603, -87.681630909)",16.0,22535.0,25.0,179.0,40.0,1.0,7.0,192.0
10224744,HY411605,2015-09-05T13:00:00.000+0000,025XX W CERMAK RD,0860,THEFT,RETAIL THEFT,GROCERY FOOD STORE,True,False,1034,10,25,31,06,1159734.0,1889313.0,2015,09/17/2015 11:37:18 AM,41.851988885,-87.689219118,"(41.851988885, -87.689219118)",11.0,14920.0,32.0,203.0,23.0,43.0,21.0,151.0
10224745,HY411654,2015-09-05T11:30:00.000+0000,031XX W WASHINGTON BLVD,0320,ROBBERY,STRONGARM - NO WEAPON,STREET,False,True,1222,12,27,27,03,1155536.0,1900515.0,2015,02/10/2018 03:50:01 PM,41.88281374,-87.704325717,"(41.88281374, -87.704325717)",41.0,21184.0,28.0,50.0,46.0,28.0,15.0,77.0
11645836,JC212333,2016-05-01T00:25:00.000+0000,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,824,8,15,63,11,,,2016,04/06/2019 04:04:43 PM,,,,,,,,,,,


### Store the transformed crime data in Parquet format

In [0]:
# Persist the transformed data as Parquet. The default save mode raises an
# error when the target path already exists, so this cell would fail on every
# re-run; "overwrite" replaces the previous output instead.
crime_df.write.mode("overwrite").parquet("/mnt/isa460/data/chicago_crime/parquet")

Check the stored Parquet files.

In [0]:
%fs ls /mnt/isa460/data/chicago_crime/parquet

path,name,size
dbfs:/mnt/isa460/data/chicago_crime/parquet/_SUCCESS,_SUCCESS,0
dbfs:/mnt/isa460/data/chicago_crime/parquet/_committed_1293081524776846885,_committed_1293081524776846885,1608
dbfs:/mnt/isa460/data/chicago_crime/parquet/_started_1293081524776846885,_started_1293081524776846885,0
dbfs:/mnt/isa460/data/chicago_crime/parquet/part-00000-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-18-1-c000.snappy.parquet,part-00000-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-18-1-c000.snappy.parquet,32041452
dbfs:/mnt/isa460/data/chicago_crime/parquet/part-00001-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-19-1-c000.snappy.parquet,part-00001-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-19-1-c000.snappy.parquet,31383868
dbfs:/mnt/isa460/data/chicago_crime/parquet/part-00002-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-20-1-c000.snappy.parquet,part-00002-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-20-1-c000.snappy.parquet,33145401
dbfs:/mnt/isa460/data/chicago_crime/parquet/part-00003-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-21-1-c000.snappy.parquet,part-00003-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-21-1-c000.snappy.parquet,33492057
dbfs:/mnt/isa460/data/chicago_crime/parquet/part-00004-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-22-1-c000.snappy.parquet,part-00004-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-22-1-c000.snappy.parquet,32638459
dbfs:/mnt/isa460/data/chicago_crime/parquet/part-00005-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-23-1-c000.snappy.parquet,part-00005-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-23-1-c000.snappy.parquet,32509281
dbfs:/mnt/isa460/data/chicago_crime/parquet/part-00006-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-24-1-c000.snappy.parquet,part-00006-tid-1293081524776846885-ef3d1cc3-fbcc-4147-b491-5a11354b2093-24-1-c000.snappy.parquet,32547814


# Basic Analysis

In [0]:
# Reload the crime data from the Parquet folder written during the ETL step.

crime_df = spark.read.format("parquet").load("/mnt/isa460/data/chicago_crime/parquet")

## number of crimes by year

## number of crimes by year and by month

### What are the top 10 primary types by number of reported crimes, in descending order of occurrence?

### What are the top 5 reported crimes by primary type for each year?

### Find the percentage of reported crimes that result in an arrest

### Find the percentage of reported crimes that result in an arrest, by year.

### What are the top 10 words appearing in the description of the crime?

# Working with joins

In [0]:
# The reported-crimes dataset carries only the district number, so the
# district name has to come from the police station dataset via a join.

# Load the police station data and keep only District and District Name.

station_df = (
    spark.read
    .csv('/mnt/isa460/data/chicago_crime/policestation.csv', header=True, inferSchema=True)
    .select('District', "District Name")
)
station_df.printSchema()

### join police station data with crime data

### Which district has the highest arrest rate?

### Create widget based on primary type

In [0]:
# Remove all existing widgets so the dropdown defined below starts from a clean state.
dbutils.widgets.removeAll()

In [0]:
# Register new_df as the temporary SQL view 'crime'.
# NOTE(review): new_df is not defined in any visible cell — presumably it is the
# result of joining the crime data with the police station data; confirm it
# exists before running this cell.
new_df.createOrReplaceTempView('crime')

In [0]:
# Collect the distinct primary types from the 'crime' view into a sorted Python list.
primary_type = sorted(
    row[0] for row in spark.sql("select distinct primary_type from crime").collect()
)

In [0]:
# Create a dropdown widget named "Type", defaulting to "THEFT", with one choice per primary type.
dbutils.widgets.dropdown("Type", "THEFT", list(map(str, primary_type)))

Find the day of the week with the most reported crimes for the selected primary type.

In [0]:
# For the primary type selected in the "Type" widget, count crimes per day of
# the week and list the busiest day first.
# dbutils.widgets.get is used to read the widget value in place of the
# deprecated getArgument helper.
display(
    crime_df
    .filter(col('primary_type')==dbutils.widgets.get("Type"))
    .groupBy(date_format('date','E').alias('week day'))
    .count()
    .orderBy('count', ascending=False)
)

week day,count
Fri,41410
Sat,41293
Mon,39842
Sun,39531
Tue,38972
Wed,38814
Thu,38433
