### Chapter 4 Exercises

In [10]:
import polars as pl

pl.Config.set_fmt_str_lengths(50)

polars.config.Config

1.	Use Polars to read the Parquet file nypd_crimes_from_2018 from the dataset folder. 
Filter the dataset for the dates of 31st December 2019 and 1st January 2020.
We will use this filtered dataset for the rest of the exercises

In [11]:
nypd = pl.read_parquet("../datasets/nypd_crimes_from_2018.parquet")

nypd = nypd.with_columns(
    pl.col("complaint_datetime").str.to_datetime("%Y-%m-%d %H:%M:%S")
)

In [12]:
from datetime import datetime

nypd_filtered = nypd.filter(
    pl.col("complaint_datetime").is_between(
        datetime(2019, 12, 31), datetime(2020, 1, 2)
    )
)

2.	Take the dataset from question 1, group it dataset by crime_category, and add 2 list columns: one with a list of precincts, and another with a list of boroughs. Do not worry about duplicated values in the lists.

In [14]:
nypd_grouped = nypd_filtered.group_by("crime_category").agg("precinct", "borough")

nypd_grouped

crime_category,precinct,borough
str,list[i64],list[str]
"""MISDEMEANOR""","[19, 48, … 44]","[""MANHATTAN"", ""BRONX"", … ""BRONX""]"
"""VIOLATION""","[20, 42, … 73]","[""MANHATTAN"", ""BRONX"", … ""BROOKLYN""]"
"""FELONY""","[105, 114, … 72]","[""QUEENS"", ""QUEENS"", … ""BROOKLYN""]"


3.	Start with the grouped dataframe of question 2, then add the following columns:

- 'borough_unique' with the list of boroughs without duplicated values
- 'precinct_count' with the count of precincts (you need to remove duplicated values first)
- 'top_borough' with the borough having the most crimes (hint: check out the 'mode' method)

In [15]:
nypd_grouped.with_columns(
    borough_unique=pl.col("borough").list.unique(),
    precinct_count=pl.col("precinct").list.unique().list.len(),
    top_borough=pl.col("borough").list.eval(pl.element().mode()),
)

crime_category,precinct,borough,borough_unique,precinct_count,top_borough
str,list[i64],list[str],list[str],u32,list[str]
"""MISDEMEANOR""","[19, 48, … 44]","[""MANHATTAN"", ""BRONX"", … ""BRONX""]","[""BROOKLYN"", ""BRONX"", … ""MANHATTAN""]",77,"[""BROOKLYN""]"
"""VIOLATION""","[20, 42, … 73]","[""MANHATTAN"", ""BRONX"", … ""BROOKLYN""]","[""QUEENS"", ""STATEN ISLAND"", … ""BROOKLYN""]",74,"[""QUEENS""]"
"""FELONY""","[105, 114, … 72]","[""QUEENS"", ""QUEENS"", … ""BROOKLYN""]","[""MANHATTAN"", ""QUEENS"", … ""BRONX""]",76,"[""BROOKLYN""]"


4.	Go back to the filtered dataframe of question 1, then filter it to keep only rows where crime_description contains the word 'COMMERCIAL'

In [16]:
nypd_filtered.filter(pl.col("crime_description").str.contains("COMMERCIAL"))

complaint_datetime,complaint_id,precinct,borough,crime_category,crime_description
datetime[μs],str,i64,str,str,str
2019-12-31 05:45:00,"""102079547""",18,"""MANHATTAN""","""FELONY""","""BURGLARY,COMMERCIAL,DAY"""
2020-01-01 02:30:00,"""268949120""",114,"""QUEENS""","""FELONY""","""BURGLARY,COMMERCIAL,NIGHT"""
2020-01-01 08:35:00,"""938871036""",13,"""MANHATTAN""","""FELONY""","""BURGLARY,COMMERCIAL,DAY"""
2019-12-31 15:15:00,"""207059189""",6,"""MANHATTAN""","""FELONY""","""BURGLARY,COMMERCIAL,DAY"""
2020-01-01 06:30:00,"""526074457""",109,"""QUEENS""","""FELONY""","""ROBBERY,COMMERCIAL UNCLASSIFIED"""
…,…,…,…,…,…
2019-12-31 04:14:00,"""600601396""",6,"""MANHATTAN""","""FELONY""","""BURGLARY,COMMERCIAL,NIGHT"""
2019-12-31 08:45:00,"""703654894""",25,"""MANHATTAN""","""FELONY""","""BURGLARY,COMMERCIAL,DAY"""
2019-12-31 04:20:00,"""937278379""",78,"""BROOKLYN""","""FELONY""","""BURGLARY,COMMERCIAL,NIGHT"""
2019-12-31 01:40:00,"""628817315""",14,"""MANHATTAN""","""FELONY""","""BURGLARY,COMMERCIAL,NIGHT"""


5.	Use the filtered dataframe of question 1, group it by hour using truncate, sort it, then add the following columns

- 'crime_count' with the count of crimes in the hour
- 'crime_count_one_hour_earlier' with the count of crimes one hour earlier
- 'crime_count_24_hours_earlier' with the count of crimes 24 hours earlier
- 'crime_count_difference_one_hour' with the change in the count of crimes vs one hour earlier

Assume that the dataset has data for every hour

In [17]:
(
    nypd_filtered.group_by(pl.col("complaint_datetime").dt.truncate("1h"))
    .len(name="count")
    .sort("complaint_datetime")
    .rename({"count": "crime_count"})
    .with_columns(
        crime_count_one_hour_earlier=pl.col("crime_count").shift(1),
        crime_count_24_hours_earlier=pl.col("crime_count").shift(24),
        crime_count_difference_one_hour=pl.col("crime_count").diff(1),
    )
)

complaint_datetime,crime_count,crime_count_one_hour_earlier,crime_count_24_hours_earlier,crime_count_difference_one_hour
datetime[μs],u32,u32,u32,i64
2019-12-31 00:00:00,50,,,
2019-12-31 01:00:00,38,50,,-12
2019-12-31 02:00:00,18,38,,-20
2019-12-31 03:00:00,18,18,,0
2019-12-31 04:00:00,12,18,,-6
…,…,…,…,…
2020-01-01 20:00:00,57,51,60,6
2020-01-01 21:00:00,43,57,49,-14
2020-01-01 22:00:00,36,43,69,-7
2020-01-01 23:00:00,43,36,64,7


6.	Use the filtered dataframe of question 1, group it by hour using truncate, sort it, then add the following columns

- 'crime_count' with the count of crimes in the hour
- 'cumulative_crime_count' with the cumulative count of crimes
- 'cumulative_average_crime' with the cumulative average of the crime count
- 'cumulative_median_crime' with the cumulative median of the crime count

Assume that the dataset has data for every hour

In [18]:
(
    nypd_filtered.group_by(pl.col("complaint_datetime").dt.truncate("1h"))
    .len(name="count")
    .sort("complaint_datetime")
    .rename({"count": "crime_count"})
    .with_columns(
        cumulative_crime_count=pl.col("crime_count").cum_sum(),
        cumulative_average_crime=pl.col("crime_count").cumulative_eval(
            pl.element().mean()
        ),
        cumulative_median_crime=pl.col("crime_count").cumulative_eval(
            pl.element().median()
        ),
    )
)

complaint_datetime,crime_count,cumulative_crime_count,cumulative_average_crime,cumulative_median_crime
datetime[μs],u32,u32,f64,f64
2019-12-31 00:00:00,50,50,50.0,50.0
2019-12-31 01:00:00,38,88,44.0,44.0
2019-12-31 02:00:00,18,106,35.333333,38.0
2019-12-31 03:00:00,18,124,31.0,28.0
2019-12-31 04:00:00,12,136,27.2,18.0
…,…,…,…,…
2020-01-01 20:00:00,57,2835,63.0,55.0
2020-01-01 21:00:00,43,2878,62.565217,54.5
2020-01-01 22:00:00,36,2914,62.0,54.0
2020-01-01 23:00:00,43,2957,61.604167,53.5


7.	Use the filtered dataframe of question 1, group it by hour using truncate, sort it, then add the following columns

- 'crime_count' with the number of crimes in the hour
- 'rolling_5hours_crime_count' with the number of crimes for the past 5 hours
- 'rolling_5hours_average_crime' with the average number of crimes of the past 5 hours
- 'rolling_5hours_median_crime' with the median number of crimes of the past 5 hours
- 'crimes_past_5hours' with a list of the number of crimes for the past 5 hours

Assume that the dataset has data for every hour

In [20]:
(
    nypd_filtered.group_by(pl.col("complaint_datetime").dt.truncate("1h"))
    .len(name="count")
    .sort("complaint_datetime")
    .rename({"count": "crime_count"})
    .group_by_rolling(index_column="complaint_datetime", period="5h")
    .agg(
        rolling_5hours_crime_count=pl.col("crime_count").sum(),
        rolling_5hours_average_crime=pl.col("crime_count").mean(),
        rolling_5hours_median_crime=pl.col("crime_count").median(),
        crimes_past_5hours=pl.col("crime_count"),
    )
)

  nypd_filtered.group_by(pl.col("complaint_datetime").dt.truncate("1h"))


complaint_datetime,rolling_5hours_crime_count,rolling_5hours_average_crime,rolling_5hours_median_crime,crimes_past_5hours
datetime[μs],u32,f64,f64,list[u32]
2019-12-31 00:00:00,50,50.0,50.0,[50]
2019-12-31 01:00:00,88,44.0,44.0,"[50, 38]"
2019-12-31 02:00:00,106,35.333333,38.0,"[50, 38, 18]"
2019-12-31 03:00:00,124,31.0,28.0,"[50, 38, … 18]"
2019-12-31 04:00:00,136,27.2,18.0,"[50, 38, … 12]"
…,…,…,…,…
2020-01-01 20:00:00,308,61.6,60.0,"[61, 60, … 57]"
2020-01-01 21:00:00,290,58.0,57.0,"[60, 79, … 43]"
2020-01-01 22:00:00,266,53.2,51.0,"[79, 51, … 36]"
2020-01-01 23:00:00,230,46.0,43.0,"[51, 57, … 43]"
