## Chicago Crime Data Analysis
This project is based on Chicago crime data downloaded from the [Chicago Data Portal](https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-Present/ijzp-q8t2). The data set includes crimes reported in Chicago from 2001 to the present.

In [0]:
%fs ls /mnt/isa460/data/chicago_crime

In [0]:
# Import the CSV data from the S3 storage folder.
# The header row supplies the column names and Spark infers the column types.
csv_path = "/mnt/isa460/data/chicago_crime/Crimes2001_2021.csv"
csv_options = {"header": True, "inferSchema": True}
df = spark.read.csv(csv_path, **csv_options)


In [0]:
# Print the column names and types of the raw DataFrame.
df.printSchema()

In [0]:
# Render the raw DataFrame with the notebook's display helper.
display(df)

In [0]:
# Count the rows of the raw DataFrame.
df.count()

## ETL

### Replace spaces in column names with `_` and convert the names to lowercase.

In [0]:
from pyspark.sql.functions import *

# Normalize the column names: spaces become underscores and all letters are
# lower-cased. The rename is done in a single toDF() call, and no loop
# variable named `col` is left behind, so the `col` function brought in by
# the star import above is not overwritten with a plain string.
columns = df.columns
df = df.toDF(*[name.replace(" ", "_").lower() for name in columns])


### Change date from string type to timestamp type

In [0]:
from pyspark.sql.functions import to_timestamp
# Show the built-in documentation of to_timestamp.
help(to_timestamp)

In [0]:
from pyspark.sql.functions import to_timestamp, col
# Parse the date strings (e.g. "01/31/2021 11:45:00 PM") into timestamps.
# The column is read and written under its lower-case name `date`: the ETL
# step above lower-cased every column name, and writing the result back as
# 'Date' would re-capitalize the column and would fail outright when
# case-sensitive column resolution is enabled.
crime_df = df.withColumn('date', to_timestamp(col('date'), 'MM/dd/yyyy hh:mm:ss a'))
crime_df.printSchema()

In [0]:
# Render the DataFrame after the timestamp conversion.
display(crime_df)

### Store the transformed crime data in Parquet format

In [0]:
# Persist the transformed data as Parquet. Overwrite mode makes the cell
# re-runnable: with the default save mode the write raises an error as soon
# as the target path already exists.
crime_df.write.mode("overwrite").parquet("/mnt/isa460/data/chicago_crime/parquet")

Check the stored Parquet files.

In [0]:
%fs ls /mnt/isa460/data/chicago_crime/parquet

# Basic Analysis

In [0]:
# Reload the crime data from its Parquet copy.
parquet_path = "/mnt/isa460/data/chicago_crime/parquet"
crime_df = spark.read.parquet(parquet_path)

## Number of crimes by year

In [0]:
# Count reported crimes per value of the `year` column, in ascending year order.
crimes_per_year = crime_df.groupBy("year").count()
display(crimes_per_year.orderBy("year"))

## Number of crimes by year and by month (2015 onward)

In [0]:
# Monthly crime counts, restricted to incidents dated 2015 or later.
# (Group `crime_df` directly instead of `recent_crimes` to cover every year.)
recent_crimes = crime_df.filter(year('date') >= 2015)
monthly_counts = recent_crimes.groupBy(
    year('date').alias("Year"),
    month('date').alias("Month"),
).count()
display(monthly_counts.orderBy("Year", "Month"))

### What are the top 10 primary types by number of reported crimes, in descending order of occurrence?

In [0]:
# Ten most frequently reported primary crime types, most frequent first.
type_counts = crime_df.groupBy('primary_type').count()
display(type_counts.orderBy(col('count').desc()).limit(10))

### What are the top 5 reported crimes by primary type for each year?

In [0]:
from pyspark.sql.window import Window

# Rank the primary types within each year by crime count and keep ranks 1-5.
# rank() assigns tied counts the same rank, so a year can yield more than
# five rows.
yearly_type_counts = crime_df.groupBy(year('date').alias("year"), 'primary_type').count()
by_year_count_desc = Window.partitionBy("year").orderBy(desc("count"))
ranked_types = yearly_type_counts.withColumn("rank", rank().over(by_year_count_desc))
display(ranked_types.filter(col("rank") <= 5))

### Find the percentage of reported crimes that result in an arrest

In [0]:
# Share of all reported crimes whose `arrest` flag equals 'true',
# expressed as a fraction (arrests / all crimes).
arrest_total = crime_df.filter(col('arrest') == 'true').count()
crime_total = crime_df.count()
arrest_total / crime_total

### Find the percentage of reported crimes that result in an arrest, by year.

In [0]:
from pyspark.sql.functions import col

# Pivot the arrest flag into `true` / `false` count columns per year, then
# derive the arrest share as true / (false + true).
arrest_pivot = crime_df.groupBy('year').pivot("arrest").count().orderBy('year')
reported_total = col('false') + col('true')
display(arrest_pivot.withColumn("percent_arrest", col('true') / reported_total))


In [0]:
import pandas as pd

# Arrest rate per year, computed on the driver with pandas.
#
# Both yearly aggregates keep their `year` key and are merged on it. Gluing
# the two frames together side by side by row position would silently pair
# counts from different years as soon as one year has crimes but no arrests.
crimeByYear = (
    crime_df.groupBy('year')
    .agg(count("year").alias("crime_count"))
    .orderBy("year")
    .toPandas()
)
arrestByYear = (
    crime_df.filter(col("arrest") == "true")
    .groupBy('year')
    .agg(count("year").alias("arrest_count"))
    .toPandas()
)

# Left merge keeps every year that has crimes; a year without arrests gets 0.
crime = crimeByYear.merge(arrestByYear, on="year", how="left")
crime['arrest_count'] = crime['arrest_count'].fillna(0).astype('int64')
crime['arrest_rate'] = crime['arrest_count'] / crime['crime_count']

display(spark.createDataFrame(crime))

### What are the top 10 words appearing in the description of the crime?

In [0]:
from pyspark.sql.functions import *
# Split each description on runs of whitespace, drop empty tokens, and print
# the ten most frequent words. show() without an argument prints 20 rows,
# which does not match the "top 10" question.
description_words = crime_df.withColumn("word", explode(split("description", r"\s+")))
word_counts = description_words.filter(col("word") != "").groupBy("word").count()
word_counts.orderBy(desc("count")).show(10)

# Working with joins

In [0]:
# The reported-crimes data set only has the district number. The district
# name is added by joining with the police station data set.

# Load the police station data set and keep only District and District Name.
station_csv_path = '/mnt/isa460/data/chicago_crime/policestation.csv'
station_raw = spark.read.csv(station_csv_path, header=True, inferSchema=True)
station_df = station_raw.select('District', "District Name")
station_df.printSchema()

In [0]:
# Join the police station data to the crime data on the district number.
# Both inputs carry a district column, so the station-side copy is dropped
# after the join. Keeping both would leave two columns with the same
# (case-insensitive) name in new_df and make any later reference to
# `district` ambiguous.
new_df = (
    crime_df.join(station_df, crime_df['district'] == station_df['district'], 'inner')
    .drop(station_df['district'])
)

In [0]:
# Render the crime data joined with the police station data.
display(new_df)

### Which district has the highest arrest rate?

In [0]:
from pyspark.sql.functions import *
# Arrest rate per district = crimes with an arrest / all crimes in that district.
# Counting arrests alone would rank districts by arrest volume, which mostly
# follows crime volume and does not answer the arrest-rate question.
district_stats = new_df.groupBy('District Name').agg(
    count(lit(1)).alias('crime_count'),
    avg(when(col('Arrest') == 'true', 1).otherwise(0)).alias('arrest_rate'),
)
district_stats.orderBy(desc('arrest_rate')).show()

## Show crimes of a selected type on a map for a particular day

In [0]:
# Remove all existing notebook widgets.
dbutils.widgets.removeAll()

In [0]:
# Register crime_df as the temporary view `crime`, replacing any existing view of that name.
crime_df.createOrReplaceTempView('crime')

## Create a list of all dates in 2021

In [0]:
# Collect every distinct calendar day (formatted yyyy-MM-dd) on which a crime
# with year == 2021 was recorded, as a sorted Python list.
days_2021 = crime_df.filter(col('year') == 2021).select(
    date_format("Date", "yyyy-MM-dd").alias('date')
)
pd_date = days_2021.distinct().orderBy('date').toPandas()
pd_date = pd_date.sort_values(by=['date'])
date = pd_date['date'].tolist()
date

## Create a list of the top 10 crime types in 2021

In [0]:
from pyspark.sql.functions import *
# Ten most frequent primary types among crimes with year == 2021, listed
# alphabetically.
# NOTE(review): the name `type` shadows the Python builtin; it is kept here
# because the dropdown cell further down reads it.
crimes_2021 = crime_df.filter(col('year') == 2021)
pd_type = crimes_2021.groupBy("primary_type").count().orderBy(desc('count')).limit(10).toPandas()
pd_type = pd_type.sort_values(by=['primary_type'])
type = pd_type['primary_type'].tolist()
type

## Create dropdown lists for date and primary type

In [0]:
# Build the dropdown choices from the lists computed above.
date_choices = [str(x) for x in date]
type_choices = [str(x) for x in type]

# A dropdown default has to be one of its choices. The preferred defaults are
# hard-coded while the choices come from the data, so fall back to the first
# available choice when the preferred value is not present.
default_date = "2021-10-03" if "2021-10-03" in date_choices else date_choices[0]
default_type = "ASSAULT" if "ASSAULT" in type_choices else type_choices[0]

dbutils.widgets.dropdown("Date", default_date, date_choices)

dbutils.widgets.dropdown("Type", default_type, type_choices)

Find the day of the week with the most reported crimes for the selected primary type.

In [0]:
# Crime counts per day-of-week label (date pattern 'E') for the primary type
# selected in the "Type" widget, highest count first.
selected_type = getArgument("Type")
crimes_of_type = crime_df.filter(col('primary_type') == selected_type)
weekday_counts = crimes_of_type.groupBy(date_format('date', 'E').alias('week day')).count()
display(weekday_counts.orderBy('count', ascending=False))

## Display data on a map using folium

[see this link for more detail](https://python-visualization.github.io/folium/quickstart.html#Getting-Started)

In [0]:
# Render the crime DataFrame.
display(crime_df)

In [0]:
import folium
from pyspark.sql.functions import *

# Plot the crimes of the selected type on the selected day as map markers.
selected_date = getArgument("Date")
selected_type = getArgument("Type")

# The result is named points_pd rather than `pd`, so that the pandas module
# alias imported earlier in the notebook is not overwritten. Rows are kept
# only when both coordinates are present, since a marker needs both.
points_pd = (
    crime_df.filter(date_format("Date", "yyyy-MM-dd") == selected_date)
    .filter(col("primary_type") == selected_type)
    .filter(col("latitude").isNotNull() & col("longitude").isNotNull())
    .select("description", "latitude", "longitude")
    .toPandas()
)

# Map centered on fixed coordinates with a fixed initial zoom level.
m = folium.Map(location=[41.876, -87.628], zoom_start=12)

# One blue marker per crime; the crime description is shown in the popup.
for text, lat, lon in points_pd.itertuples(index=False, name=None):
    folium.Marker([lat, lon], popup=text,
                  icon=folium.Icon(color="blue")).add_to(m)

display(m)