#Chicago Crime Data Analysis
This project is based on Chicago crime data downloaded from [Chicago Data Portal](https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-Present/ijzp-q8t2). This link inclues all crime in Chicago since 2001.

This data was downloaded from the site and save it as a parquet format. This analysis will focus on crimes between 2012 and 2022.

Download the dataset(ChicagoCrime2012_2022.parquet, PoliceStation.csv) from Canvas and store them in a folder /FileStore/tables/ChicagoCrime

In [0]:
%fs ls /FileStore/tables/ChicagoCrime

path,name,size,modificationTime
dbfs:/FileStore/tables/ChicagoCrime/ChicagoCrimes2012_2022.parquet,ChicagoCrimes2012_2022.parquet,183515898,1664232925000
dbfs:/FileStore/tables/ChicagoCrime/PoliceStation.csv,PoliceStation.csv,5723,1664233229000


### Import crime data

In [0]:
df=spark.read.parquet("/FileStore/tables/ChicagoCrime/ChicagoCrimes2012_2022.parquet")


In [0]:
df.count()

Out[2]: 7634737

In [0]:
display(df)

ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
10224738,HY411648,09/05/2015 01:30:00 PM,043XX S WOOD ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,924,9,12,61,08B,1165074.0,1875917.0,2015,02/10/2018 03:50:01 PM,41.815117282,-87.669999562,"(41.815117282, -87.669999562)"
10224739,HY411615,09/04/2015 11:30:00 AM,008XX N CENTRAL AVE,0870,THEFT,POCKET-PICKING,CTA BUS,False,False,1511,15,29,25,06,1138875.0,1904869.0,2015,02/10/2018 03:50:01 PM,41.895080471,-87.765400451,"(41.895080471, -87.765400451)"
11646166,JC213529,09/01/2018 12:01:00 AM,082XX S INGLESIDE AVE,0810,THEFT,OVER $500,RESIDENCE,False,True,631,6,8,44,06,,,2018,04/06/2019 04:04:43 PM,,,
10224740,HY411595,09/05/2015 12:45:00 PM,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,1412,14,35,21,18,1152037.0,1920384.0,2015,02/10/2018 03:50:01 PM,41.937405765,-87.716649687,"(41.937405765, -87.716649687)"
10224741,HY411610,09/05/2015 01:00:00 PM,0000X N LARAMIE AVE,0560,ASSAULT,SIMPLE,APARTMENT,False,True,1522,15,28,25,08A,1141706.0,1900086.0,2015,02/10/2018 03:50:01 PM,41.881903443,-87.755121152,"(41.881903443, -87.755121152)"
10224742,HY411435,09/05/2015 10:55:00 AM,082XX S LOOMIS BLVD,0610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,False,False,614,6,21,71,05,1168430.0,1850165.0,2015,02/10/2018 03:50:01 PM,41.744378879,-87.658430635,"(41.744378879, -87.658430635)"
10224743,HY411629,09/04/2015 06:00:00 PM,021XX W CHURCHILL ST,0620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE-GARAGE,False,False,1434,14,32,24,05,1161628.0,1912157.0,2015,02/10/2018 03:50:01 PM,41.914635603,-87.681630909,"(41.914635603, -87.681630909)"
10224744,HY411605,09/05/2015 01:00:00 PM,025XX W CERMAK RD,0860,THEFT,RETAIL THEFT,GROCERY FOOD STORE,True,False,1034,10,25,31,06,1159734.0,1889313.0,2015,09/17/2015 11:37:18 AM,41.851988885,-87.689219118,"(41.851988885, -87.689219118)"
10224745,HY411654,09/05/2015 11:30:00 AM,031XX W WASHINGTON BLVD,0320,ROBBERY,STRONGARM - NO WEAPON,STREET,False,True,1222,12,27,27,03,1155536.0,1900515.0,2015,02/10/2018 03:50:01 PM,41.88281374,-87.704325717,"(41.88281374, -87.704325717)"
11645836,JC212333,05/01/2016 12:25:00 AM,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,824,8,15,63,11,,,2016,04/06/2019 04:04:43 PM,,,


In [0]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: integer (nullable = true)
 |-- Y Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



##ETL

### Column name reformat
Replace space in column names as _, change the names to all lower cases letters.

In [0]:
from pyspark.sql.functions import col

columns=df.columns

for col in columns:
  new_col=col.replace(" ", "_").lower()
  df=df.withColumnRenamed(col, new_col)
  
display(df)

id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
10224738,HY411648,09/05/2015 01:30:00 PM,043XX S WOOD ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,924,9,12,61,08B,1165074.0,1875917.0,2015,02/10/2018 03:50:01 PM,41.815117282,-87.669999562,"(41.815117282, -87.669999562)"
10224739,HY411615,09/04/2015 11:30:00 AM,008XX N CENTRAL AVE,0870,THEFT,POCKET-PICKING,CTA BUS,False,False,1511,15,29,25,06,1138875.0,1904869.0,2015,02/10/2018 03:50:01 PM,41.895080471,-87.765400451,"(41.895080471, -87.765400451)"
11646166,JC213529,09/01/2018 12:01:00 AM,082XX S INGLESIDE AVE,0810,THEFT,OVER $500,RESIDENCE,False,True,631,6,8,44,06,,,2018,04/06/2019 04:04:43 PM,,,
10224740,HY411595,09/05/2015 12:45:00 PM,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,1412,14,35,21,18,1152037.0,1920384.0,2015,02/10/2018 03:50:01 PM,41.937405765,-87.716649687,"(41.937405765, -87.716649687)"
10224741,HY411610,09/05/2015 01:00:00 PM,0000X N LARAMIE AVE,0560,ASSAULT,SIMPLE,APARTMENT,False,True,1522,15,28,25,08A,1141706.0,1900086.0,2015,02/10/2018 03:50:01 PM,41.881903443,-87.755121152,"(41.881903443, -87.755121152)"
10224742,HY411435,09/05/2015 10:55:00 AM,082XX S LOOMIS BLVD,0610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,False,False,614,6,21,71,05,1168430.0,1850165.0,2015,02/10/2018 03:50:01 PM,41.744378879,-87.658430635,"(41.744378879, -87.658430635)"
10224743,HY411629,09/04/2015 06:00:00 PM,021XX W CHURCHILL ST,0620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE-GARAGE,False,False,1434,14,32,24,05,1161628.0,1912157.0,2015,02/10/2018 03:50:01 PM,41.914635603,-87.681630909,"(41.914635603, -87.681630909)"
10224744,HY411605,09/05/2015 01:00:00 PM,025XX W CERMAK RD,0860,THEFT,RETAIL THEFT,GROCERY FOOD STORE,True,False,1034,10,25,31,06,1159734.0,1889313.0,2015,09/17/2015 11:37:18 AM,41.851988885,-87.689219118,"(41.851988885, -87.689219118)"
10224745,HY411654,09/05/2015 11:30:00 AM,031XX W WASHINGTON BLVD,0320,ROBBERY,STRONGARM - NO WEAPON,STREET,False,True,1222,12,27,27,03,1155536.0,1900515.0,2015,02/10/2018 03:50:01 PM,41.88281374,-87.704325717,"(41.88281374, -87.704325717)"
11645836,JC212333,05/01/2016 12:25:00 AM,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,824,8,15,63,11,,,2016,04/06/2019 04:04:43 PM,,,


Modify format of column

###Change date from string type to timestamp type

In [0]:
from pyspark.sql.functions import to_timestamp

help(to_timestamp)

Help on function to_timestamp in module pyspark.sql.functions:

to_timestamp(col, format=None)
    Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.TimestampType`
    using the optionally specified format. Specify formats according to `datetime pattern`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format
    is omitted. Equivalent to ``col.cast("timestamp")``.
    
    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
    
    .. versionadded:: 2.2.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_timestamp(df.t).alias('dt')).collect()
    [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
    
    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_timestamp(df.t, 'yyyy-MM-dd HH:mm:ss').alias('dt')).collect()
    [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]



In [0]:
from pyspark.sql.functions import to_timestamp, col
crime_df = df.withColumn('date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a'))
crime_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- case_number: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- block: string (nullable = true)
 |-- iucr: string (nullable = true)
 |-- primary_type: string (nullable = true)
 |-- description: string (nullable = true)
 |-- location_description: string (nullable = true)
 |-- arrest: boolean (nullable = true)
 |-- domestic: boolean (nullable = true)
 |-- beat: integer (nullable = true)
 |-- district: integer (nullable = true)
 |-- ward: integer (nullable = true)
 |-- community_area: integer (nullable = true)
 |-- fbi_code: string (nullable = true)
 |-- x_coordinate: integer (nullable = true)
 |-- y_coordinate: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- updated_on: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- location: string (nullable = true)



In [0]:
display(crime_df)

id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
10224738,HY411648,2015-09-05T13:30:00.000+0000,043XX S WOOD ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,924,9,12,61,08B,1165074.0,1875917.0,2015,02/10/2018 03:50:01 PM,41.815117282,-87.669999562,"(41.815117282, -87.669999562)"
10224739,HY411615,2015-09-04T11:30:00.000+0000,008XX N CENTRAL AVE,0870,THEFT,POCKET-PICKING,CTA BUS,False,False,1511,15,29,25,06,1138875.0,1904869.0,2015,02/10/2018 03:50:01 PM,41.895080471,-87.765400451,"(41.895080471, -87.765400451)"
11646166,JC213529,2018-09-01T00:01:00.000+0000,082XX S INGLESIDE AVE,0810,THEFT,OVER $500,RESIDENCE,False,True,631,6,8,44,06,,,2018,04/06/2019 04:04:43 PM,,,
10224740,HY411595,2015-09-05T12:45:00.000+0000,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,1412,14,35,21,18,1152037.0,1920384.0,2015,02/10/2018 03:50:01 PM,41.937405765,-87.716649687,"(41.937405765, -87.716649687)"
10224741,HY411610,2015-09-05T13:00:00.000+0000,0000X N LARAMIE AVE,0560,ASSAULT,SIMPLE,APARTMENT,False,True,1522,15,28,25,08A,1141706.0,1900086.0,2015,02/10/2018 03:50:01 PM,41.881903443,-87.755121152,"(41.881903443, -87.755121152)"
10224742,HY411435,2015-09-05T10:55:00.000+0000,082XX S LOOMIS BLVD,0610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,False,False,614,6,21,71,05,1168430.0,1850165.0,2015,02/10/2018 03:50:01 PM,41.744378879,-87.658430635,"(41.744378879, -87.658430635)"
10224743,HY411629,2015-09-04T18:00:00.000+0000,021XX W CHURCHILL ST,0620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE-GARAGE,False,False,1434,14,32,24,05,1161628.0,1912157.0,2015,02/10/2018 03:50:01 PM,41.914635603,-87.681630909,"(41.914635603, -87.681630909)"
10224744,HY411605,2015-09-05T13:00:00.000+0000,025XX W CERMAK RD,0860,THEFT,RETAIL THEFT,GROCERY FOOD STORE,True,False,1034,10,25,31,06,1159734.0,1889313.0,2015,09/17/2015 11:37:18 AM,41.851988885,-87.689219118,"(41.851988885, -87.689219118)"
10224745,HY411654,2015-09-05T11:30:00.000+0000,031XX W WASHINGTON BLVD,0320,ROBBERY,STRONGARM - NO WEAPON,STREET,False,True,1222,12,27,27,03,1155536.0,1900515.0,2015,02/10/2018 03:50:01 PM,41.88281374,-87.704325717,"(41.88281374, -87.704325717)"
11645836,JC212333,2016-05-01T00:25:00.000+0000,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,824,8,15,63,11,,,2016,04/06/2019 04:04:43 PM,,,


# Basic Analysis

### number of crimes by year

In [0]:
display(crime_df.groupBy("year").count().orderBy("year"))

year,count
2012,336247
2013,307435
2014,275708
2015,264696
2016,269744
2017,268979
2018,268668
2019,261119
2020,211872
2021,207862


### number of crimes by year and by month

In [0]:
from pyspark.sql.functions import month

display(crime_df.filter(year('date')>=2015).groupBy(year('date').alias("Year"), month('date').alias("Month")).count().orderBy("Year", "Month"))

Year,Month,count
2015,1,20957
2015,2,16401
2015,3,21684
2015,4,21729
2015,5,23708
2015,6,23191
2015,7,24234
2015,8,24822
2015,9,23130
2015,10,23100


### What are the top 10 number of reported crimes by primary type, in descending order of occurence?

In [0]:
display(crime_df.groupBy('primary_type').count().sort(col('count').desc()).limit(10))

primary_type,count
THEFT,639592
BATTERY,522098
CRIMINAL DAMAGE,306333
ASSAULT,204424
NARCOTICS,191729
DECEPTIVE PRACTICE,182875
OTHER OFFENSE,175689
BURGLARY,137808
MOTOR VEHICLE THEFT,122697
ROBBERY,107957


### What are the top 5 reported crimes by primary type for each year?

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, desc

display(crime_df.groupBy(year('date').alias("year"), 'primary_type').count().withColumn("rank", rank().over(Window.partitionBy("year").orderBy(desc("count")))).filter(col("rank")<=5))

year,primary_type,count,rank
2012,THEFT,75465,1
2012,BATTERY,59135,2
2012,CRIMINAL DAMAGE,35855,3
2012,NARCOTICS,35489,4
2012,BURGLARY,22845,5
2013,THEFT,71533,1
2013,BATTERY,54005,2
2013,NARCOTICS,34127,3
2013,CRIMINAL DAMAGE,30854,4
2013,OTHER OFFENSE,17998,5


###Find the percentage of reported crimes that results in an arrest

In [0]:
crime_df.filter(col('arrest')=='true').count()/crime_df.count()

Out[38]: 0.21916611469802555

### find the percentage of reported crimes that results in arrest by year.

In [0]:
from pyspark.sql.functions import col, count

display(crime_df.groupBy('year').pivot("arrest").count().orderBy('year').withColumn("percent_arrest", col('true')/(col('false')+col('true'))))


year,false,true,percent_arrest
2012,245591,90656,0.2696113273873075
2013,220904,86531,0.2814611218631581
2014,196089,79619,0.2887801587186444
2015,194670,70026,0.2645525432949497
2016,216729,53015,0.1965381991814461
2017,216350,52629,0.1956621148862922
2018,214828,53840,0.2003960278112763
2019,204948,56171,0.2151164794595567
2020,177839,34033,0.1606300030206917
2021,182023,25839,0.1243084354042586


In [0]:
import pandas as pd

crimeByYear=crime_df.groupBy('year').agg(count("year").alias("crime_count")).orderBy("year").toPandas()
arrestByYear=crime_df.filter(col("arrest")=="true").groupBy('year').agg(count("year").alias("arrest_count")).orderBy("year").select("arrest_count").toPandas()

crime=pd.concat([crimeByYear, arrestByYear], axis=1)
crime['arrest_rate']=crime['arrest_count']/crime['crime_count']

display(spark.createDataFrame(crime))

year,crime_count,arrest_count,arrest_rate
2012,336247,90656,0.2696113273873075
2013,307435,86531,0.2814611218631581
2014,275708,79619,0.2887801587186444
2015,264696,70026,0.2645525432949497
2016,269744,53015,0.1965381991814461
2017,268979,52629,0.1956621148862922
2018,268668,53840,0.2003960278112763
2019,261119,56171,0.2151164794595567
2020,211872,34033,0.1606300030206917
2021,207862,25839,0.1243084354042586


### What are the top 10 words appearing in the deacription of the crime?

In [0]:
from pyspark.sql.functions import *
crime_df.withColumn("word", explode(split("description", " "))).groupBy("word").count().orderBy(desc("count")).show()

#Working with joins

In [0]:
#The reported crimes dataset has only the district number. Add the district name by joining with the police station dataset.

# load the policy dataset, only keep District and District Name

station_df = spark.read.csv('/FileStore/tables/ChicagoCrime/PoliceStation.csv',header=True, inferSchema=True)
station_df=station_df.select('District', "District Name")
station_df.printSchema()

root
 |-- District: string (nullable = true)
 |-- District Name: string (nullable = true)



In [0]:
#Join police staion with crime data.

#new_df=crime_df.join(station_df, crime_df['district']==station_df['district'], 'inner')

# if both fields have the same name, the following join can be used:

new_df=crime_df.join(station_df, 'district', 'inner')

In [0]:
display(new_df)

district,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location,District Name
9,10224738,HY411648,2015-09-05T13:30:00.000+0000,043XX S WOOD ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,924,12,61,08B,1165074.0,1875917.0,2015,02/10/2018 03:50:01 PM,41.815117282,-87.669999562,"(41.815117282, -87.669999562)",Deering
15,10224739,HY411615,2015-09-04T11:30:00.000+0000,008XX N CENTRAL AVE,0870,THEFT,POCKET-PICKING,CTA BUS,False,False,1511,29,25,06,1138875.0,1904869.0,2015,02/10/2018 03:50:01 PM,41.895080471,-87.765400451,"(41.895080471, -87.765400451)",Austin
6,11646166,JC213529,2018-09-01T00:01:00.000+0000,082XX S INGLESIDE AVE,0810,THEFT,OVER $500,RESIDENCE,False,True,631,8,44,06,,,2018,04/06/2019 04:04:43 PM,,,,Gresham
14,10224740,HY411595,2015-09-05T12:45:00.000+0000,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,1412,35,21,18,1152037.0,1920384.0,2015,02/10/2018 03:50:01 PM,41.937405765,-87.716649687,"(41.937405765, -87.716649687)",Shakespeare
15,10224741,HY411610,2015-09-05T13:00:00.000+0000,0000X N LARAMIE AVE,0560,ASSAULT,SIMPLE,APARTMENT,False,True,1522,28,25,08A,1141706.0,1900086.0,2015,02/10/2018 03:50:01 PM,41.881903443,-87.755121152,"(41.881903443, -87.755121152)",Austin
6,10224742,HY411435,2015-09-05T10:55:00.000+0000,082XX S LOOMIS BLVD,0610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,False,False,614,21,71,05,1168430.0,1850165.0,2015,02/10/2018 03:50:01 PM,41.744378879,-87.658430635,"(41.744378879, -87.658430635)",Gresham
14,10224743,HY411629,2015-09-04T18:00:00.000+0000,021XX W CHURCHILL ST,0620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE-GARAGE,False,False,1434,32,24,05,1161628.0,1912157.0,2015,02/10/2018 03:50:01 PM,41.914635603,-87.681630909,"(41.914635603, -87.681630909)",Shakespeare
10,10224744,HY411605,2015-09-05T13:00:00.000+0000,025XX W CERMAK RD,0860,THEFT,RETAIL THEFT,GROCERY FOOD STORE,True,False,1034,25,31,06,1159734.0,1889313.0,2015,09/17/2015 11:37:18 AM,41.851988885,-87.689219118,"(41.851988885, -87.689219118)",Ogden
12,10224745,HY411654,2015-09-05T11:30:00.000+0000,031XX W WASHINGTON BLVD,0320,ROBBERY,STRONGARM - NO WEAPON,STREET,False,True,1222,27,27,03,1155536.0,1900515.0,2015,02/10/2018 03:50:01 PM,41.88281374,-87.704325717,"(41.88281374, -87.704325717)",Near West
8,11645836,JC212333,2016-05-01T00:25:00.000+0000,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,824,15,63,11,,,2016,04/06/2019 04:04:43 PM,,,,Chicago Lawn


In [0]:
# rename District Name to be consistent with other columns

new_df=new_df.withColumnRenamed('District Name', 'district_name')

display(new_df)

district,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location,district_name
9,10224738,HY411648,2015-09-05T13:30:00.000+0000,043XX S WOOD ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,924,12,61,08B,1165074.0,1875917.0,2015,02/10/2018 03:50:01 PM,41.815117282,-87.669999562,"(41.815117282, -87.669999562)",Deering
15,10224739,HY411615,2015-09-04T11:30:00.000+0000,008XX N CENTRAL AVE,0870,THEFT,POCKET-PICKING,CTA BUS,False,False,1511,29,25,06,1138875.0,1904869.0,2015,02/10/2018 03:50:01 PM,41.895080471,-87.765400451,"(41.895080471, -87.765400451)",Austin
6,11646166,JC213529,2018-09-01T00:01:00.000+0000,082XX S INGLESIDE AVE,0810,THEFT,OVER $500,RESIDENCE,False,True,631,8,44,06,,,2018,04/06/2019 04:04:43 PM,,,,Gresham
14,10224740,HY411595,2015-09-05T12:45:00.000+0000,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,1412,35,21,18,1152037.0,1920384.0,2015,02/10/2018 03:50:01 PM,41.937405765,-87.716649687,"(41.937405765, -87.716649687)",Shakespeare
15,10224741,HY411610,2015-09-05T13:00:00.000+0000,0000X N LARAMIE AVE,0560,ASSAULT,SIMPLE,APARTMENT,False,True,1522,28,25,08A,1141706.0,1900086.0,2015,02/10/2018 03:50:01 PM,41.881903443,-87.755121152,"(41.881903443, -87.755121152)",Austin
6,10224742,HY411435,2015-09-05T10:55:00.000+0000,082XX S LOOMIS BLVD,0610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,False,False,614,21,71,05,1168430.0,1850165.0,2015,02/10/2018 03:50:01 PM,41.744378879,-87.658430635,"(41.744378879, -87.658430635)",Gresham
14,10224743,HY411629,2015-09-04T18:00:00.000+0000,021XX W CHURCHILL ST,0620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE-GARAGE,False,False,1434,32,24,05,1161628.0,1912157.0,2015,02/10/2018 03:50:01 PM,41.914635603,-87.681630909,"(41.914635603, -87.681630909)",Shakespeare
10,10224744,HY411605,2015-09-05T13:00:00.000+0000,025XX W CERMAK RD,0860,THEFT,RETAIL THEFT,GROCERY FOOD STORE,True,False,1034,25,31,06,1159734.0,1889313.0,2015,09/17/2015 11:37:18 AM,41.851988885,-87.689219118,"(41.851988885, -87.689219118)",Ogden
12,10224745,HY411654,2015-09-05T11:30:00.000+0000,031XX W WASHINGTON BLVD,0320,ROBBERY,STRONGARM - NO WEAPON,STREET,False,True,1222,27,27,03,1155536.0,1900515.0,2015,02/10/2018 03:50:01 PM,41.88281374,-87.704325717,"(41.88281374, -87.704325717)",Near West
8,11645836,JC212333,2016-05-01T00:25:00.000+0000,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,824,15,63,11,,,2016,04/06/2019 04:04:43 PM,,,,Chicago Lawn


### Which district has the highest arrest rate?

In [0]:
display(new_df.filter(col('arrest')=='true').groupBy('district_name').count().sort(col('count').desc()))

district_name,count
Harrison,77555
Gresham,41540
Englewood,41501
Ogden,37360
Austin,35730
Grand Central,35122
Chicago Lawn,34655
South Chicago,34223
Calumet,30823
Deering,29088


## Crime Map for 2021
We want to show certain crime on a map for particular day in year 2021

In [0]:
# remove existing widgets

dbutils.widgets.removeAll()

In [0]:
new_df.createOrReplaceTempView('crimeTable')

### create a list to store all date in 2022

In [0]:
pd_date=new_df.filter(col('year')==2021).select(date_format("Date", "yyyy-MM-dd").alias('date')).distinct().orderBy('date').toPandas()
pd_date=pd_date.sort_values(by=['date'])
date=list(pd_date['date'])
date

Out[103]: ['2021-01-01',
 '2021-01-02',
 '2021-01-03',
 '2021-01-04',
 '2021-01-05',
 '2021-01-06',
 '2021-01-07',
 '2021-01-08',
 '2021-01-09',
 '2021-01-10',
 '2021-01-11',
 '2021-01-12',
 '2021-01-13',
 '2021-01-14',
 '2021-01-15',
 '2021-01-16',
 '2021-01-17',
 '2021-01-18',
 '2021-01-19',
 '2021-01-20',
 '2021-01-21',
 '2021-01-22',
 '2021-01-23',
 '2021-01-24',
 '2021-01-25',
 '2021-01-26',
 '2021-01-27',
 '2021-01-28',
 '2021-01-29',
 '2021-01-30',
 '2021-01-31',
 '2021-02-01',
 '2021-02-02',
 '2021-02-03',
 '2021-02-04',
 '2021-02-05',
 '2021-02-06',
 '2021-02-07',
 '2021-02-08',
 '2021-02-09',
 '2021-02-10',
 '2021-02-11',
 '2021-02-12',
 '2021-02-13',
 '2021-02-14',
 '2021-02-15',
 '2021-02-16',
 '2021-02-17',
 '2021-02-18',
 '2021-02-19',
 '2021-02-20',
 '2021-02-21',
 '2021-02-22',
 '2021-02-23',
 '2021-02-24',
 '2021-02-25',
 '2021-02-26',
 '2021-02-27',
 '2021-02-28',
 '2021-03-01',
 '2021-03-02',
 '2021-03-03',
 '2021-03-04',
 '2021-03-05',
 '2021-03-06',
 '2021-03-07',


### Create a list to store top 10 type of crime in 2021

In [0]:
from pyspark.sql.functions import *
pd_type=new_df.filter(col('year')==2021).groupBy("primary_type").count().orderBy(desc('count')).limit(10).toPandas()
pd_type=pd_type.sort_values(by=['primary_type'])
type=list(pd_type['primary_type'])
type

Out[104]: ['ASSAULT',
 'BATTERY',
 'BURGLARY',
 'CRIMINAL DAMAGE',
 'DECEPTIVE PRACTICE',
 'MOTOR VEHICLE THEFT',
 'OTHER OFFENSE',
 'ROBBERY',
 'THEFT',
 'WEAPONS VIOLATION']

### create dropdown list for date and primary type

In [0]:
dbutils.widgets.dropdown("Date", "2021-01-01", [str(x) for x in date])

dbutils.widgets.dropdown("Type", "ASSAULT", [str(x) for x in type])

### display number of crimes by week day for selected date and crime type

In [0]:
display(new_df.filter(col('primary_type')==getArgument("Type")).groupBy(date_format('date','E').alias('week_day')).count().orderBy('week_day'))

week_day,count
Fri,18897
Mon,17507
Sat,17831
Sun,17399
Thu,17198
Tue,16861
Wed,17001


## Display data on a map using folium

[see this link for more detail](https://python-visualization.github.io/folium/quickstart.html#Getting-Started)

In [0]:
display(new_df)

district,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location,district_name
9,10224738,HY411648,2015-09-05T13:30:00.000+0000,043XX S WOOD ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,924,12,61,08B,1165074.0,1875917.0,2015,02/10/2018 03:50:01 PM,41.815117282,-87.669999562,"(41.815117282, -87.669999562)",Deering
15,10224739,HY411615,2015-09-04T11:30:00.000+0000,008XX N CENTRAL AVE,0870,THEFT,POCKET-PICKING,CTA BUS,False,False,1511,29,25,06,1138875.0,1904869.0,2015,02/10/2018 03:50:01 PM,41.895080471,-87.765400451,"(41.895080471, -87.765400451)",Austin
6,11646166,JC213529,2018-09-01T00:01:00.000+0000,082XX S INGLESIDE AVE,0810,THEFT,OVER $500,RESIDENCE,False,True,631,8,44,06,,,2018,04/06/2019 04:04:43 PM,,,,Gresham
14,10224740,HY411595,2015-09-05T12:45:00.000+0000,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,1412,35,21,18,1152037.0,1920384.0,2015,02/10/2018 03:50:01 PM,41.937405765,-87.716649687,"(41.937405765, -87.716649687)",Shakespeare
15,10224741,HY411610,2015-09-05T13:00:00.000+0000,0000X N LARAMIE AVE,0560,ASSAULT,SIMPLE,APARTMENT,False,True,1522,28,25,08A,1141706.0,1900086.0,2015,02/10/2018 03:50:01 PM,41.881903443,-87.755121152,"(41.881903443, -87.755121152)",Austin
6,10224742,HY411435,2015-09-05T10:55:00.000+0000,082XX S LOOMIS BLVD,0610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,False,False,614,21,71,05,1168430.0,1850165.0,2015,02/10/2018 03:50:01 PM,41.744378879,-87.658430635,"(41.744378879, -87.658430635)",Gresham
14,10224743,HY411629,2015-09-04T18:00:00.000+0000,021XX W CHURCHILL ST,0620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE-GARAGE,False,False,1434,32,24,05,1161628.0,1912157.0,2015,02/10/2018 03:50:01 PM,41.914635603,-87.681630909,"(41.914635603, -87.681630909)",Shakespeare
10,10224744,HY411605,2015-09-05T13:00:00.000+0000,025XX W CERMAK RD,0860,THEFT,RETAIL THEFT,GROCERY FOOD STORE,True,False,1034,25,31,06,1159734.0,1889313.0,2015,09/17/2015 11:37:18 AM,41.851988885,-87.689219118,"(41.851988885, -87.689219118)",Ogden
12,10224745,HY411654,2015-09-05T11:30:00.000+0000,031XX W WASHINGTON BLVD,0320,ROBBERY,STRONGARM - NO WEAPON,STREET,False,True,1222,27,27,03,1155536.0,1900515.0,2015,02/10/2018 03:50:01 PM,41.88281374,-87.704325717,"(41.88281374, -87.704325717)",Near West
8,11645836,JC212333,2016-05-01T00:25:00.000+0000,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,824,15,63,11,,,2016,04/06/2019 04:04:43 PM,,,,Chicago Lawn


In [0]:
%sh pip install folium

Collecting folium
  Downloading folium-0.12.1.post1-py2.py3-none-any.whl (95 kB)
Collecting branca>=0.3.0
  Downloading branca-0.5.0-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.5.0 folium-0.12.1.post1
You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.


In [0]:
# create a dataframe for crime data in 2021

df_2021=new_df.where('year=2021')

df_2021.count()

Out[109]: 207848

In [0]:
import folium
from pyspark.sql.functions import *

pd=df_2021.filter(col('latitude').isNotNull()).filter(date_format("Date", "yyyy-MM-dd")==getArgument("Date")).filter(col("primary_type")==getArgument("Type")).select("description", "latitude", 'longitude').toPandas()

description=pd['description']
latitude=pd['latitude']
longitude=pd['longitude']

m = folium.Map(location=[41.815, -87.669], zoom_start=12)

for i in range(0, len(description)):
     folium.Marker([latitude[i], longitude[i]], popup=description[i],
                   icon=folium.Icon(color="blue")).add_to(m)

display(m)