In [1]:
%config IPCompleter.greedy=True

In [2]:
import os
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import math

In [3]:
root_path = './'

## SF Covid-19 Dataset

In [4]:
cases_df = pd.read_csv(root_path + '/data/COVID-19_Cases_Summarized_by_Date__Transmission_and_Case_Disposition.csv')

In [5]:
cases_df.isna().sum()

Date                     0
Transmission Category    0
Case Disposition         0
Case Count               0
dtype: int64

#### We can verify in above query, 0 MISSING values were found 

## Pre-processing steps for SF Covid-19 Dataset

1.   Converting Date column into date-time dtype



In [6]:
cases_df["Date"] = pd.to_datetime(cases_df["Date"], format="%Y/%m/%d")

In [7]:
# Verify by uncommenting and toggling with ascending=True/False
# cases_df.sort_values(by="Date", ascending=False)

## Crime Dataset

In [8]:
crime_df = pd.read_csv(root_path + '/data/Police_Department_Incident_Reports__2018_to_Present.csv')

In [9]:
crime_df["Police District"].value_counts()

Central       53323
Northern      46002
Mission       45971
Southern      42958
Tenderloin    33563
Bayview       27926
Ingleside     24320
Taraval       23231
Richmond      20263
Park          16076
Out of SF      9303
Name: Police District, dtype: int64

In [10]:
crime_df.shape

(342936, 36)

In [11]:
crime_df.isna().sum()

Incident Datetime                                            0
Incident Date                                                0
Incident Time                                                0
Incident Year                                                0
Incident Day of Week                                         0
Report Datetime                                              0
Row ID                                                       0
Incident ID                                                  0
Incident Number                                              0
CAD Number                                               79225
Report Type Code                                             0
Report Type Description                                      0
Filed Online                                            269212
Incident Code                                                0
Incident Category                                          162
Incident Subcategory                                   

## Pre-processing steps for Crime Dataset:


### Considering only non-null columns and Incident columns

In [12]:
columns_to_consider = ["Incident Datetime", 
                       "Incident Date", 
                       "Incident Time", 
                       "Incident Year", 
                       "Incident Day of Week", 
                       "Report Datetime", 
                       "Row ID", 
                       "Incident ID", 
                       "Incident Number", 
                       "Report Type Code", 
                       "Report Type Description", 
                       "Incident Code", 
                       "Incident Category", 
                       "Incident Subcategory", 
                       "Incident Description", 
                       "Resolution", 
                       "Police District"]

In [13]:
crime_df = crime_df[columns_to_consider]

In [14]:
crime_df = crime_df[crime_df["Incident Category"].notna()]

In [15]:
# Verify whhether nulls are removed through the above query
# crime_df.isnull().sum()

In [16]:
crime_df["Incident Date"] = pd.to_datetime(crime_df["Incident Date"], format="%Y/%m/%d")

In [17]:
crime_df["Incident Datetime"] = pd.to_datetime(crime_df["Incident Datetime"], format="%Y/%m/%d %I:%M:%S %p")

In [18]:
crime_df.dtypes

Incident Datetime          datetime64[ns]
Incident Date              datetime64[ns]
Incident Time                      object
Incident Year                       int64
Incident Day of Week               object
Report Datetime                    object
Row ID                              int64
Incident ID                         int64
Incident Number                     int64
Report Type Code                   object
Report Type Description            object
Incident Code                       int64
Incident Category                  object
Incident Subcategory               object
Incident Description               object
Resolution                         object
Police District                    object
dtype: object

## Chi-Square testing Before Lockdown/After Lockdown

In [192]:
from dateutil.relativedelta import relativedelta

earliest_start_date = crime_df["Incident Date"].min()

lockdown_effective_date = datetime.datetime(2020, 3, 17)
first_corona_date = cases_df["Date"].min()
# earliest_start_date = first_corona_date
latest_date = cases_df["Date"].max()
previous_year_start = lockdown_effective_date - relativedelta(years=1)
previous_year_end = latest_date - relativedelta(years=1)

In [214]:
before_lockdown_crime = crime_df[(crime_df["Incident Date"] >= previous_year_start) & (crime_df["Incident Date"] <= previous_year_end)]
after_lockdown_crime = crime_df[(crime_df["Incident Date"] >= lockdown_effective_date) & (crime_df["Incident Date"] <= latest_date)]

# Avoiding double counting same incident-category
before_lockdown_crime = before_lockdown_crime[["Incident ID", "Incident Category"]].drop_duplicates()
after_lockdown_crime = after_lockdown_crime[["Incident ID", "Incident Category"]].drop_duplicates()

before_days = (previous_year_end - previous_year_start).days + 1
after_days = (latest_date - lockdown_effective_date).days + 1

# Creating dictionary to hold mean crime types before and after lockdown
before_mean_crimes = before_lockdown_crime["Incident Category"].value_counts().divide(before_days).to_dict()
after_mean_crimes = after_lockdown_crime["Incident Category"].value_counts().divide(after_days).to_dict()

In [215]:
# There are some crimes that never occur, neither before lockdown, nor after lockdown
# because of which the expected value becomes 0
# to avoid division by 0 error we are considering only those crime-types where crime has occured
crimes_to_consider = []
for crime in crime_types:
    if before_mean_crimes.get(crime, 0.0) == 0.0 and after_mean_crimes.get(crime, 0.0) == 0.0:
        continue
    crimes_to_consider.append(crime)

In [216]:
chi_square_matrix = []
i = 0
j = 0
for crime in crimes_to_consider:
    row = []
    for crime_dict in [before_mean_crimes, after_mean_crimes]:
        row.append(crime_dict.get(crime, 0.0))
    
    chi_square_matrix.append(row)

row_totals = [0.0 for i in range(len(chi_square_matrix))]
column_totals = [0.0 for i in range(2)]

for i in range(len(chi_square_matrix)):
    row_sum = 0.0
    for j in range(2):
        column_totals[j] = column_totals[j] + chi_square_matrix[i][j]
        row_sum += chi_square_matrix[i][j]
    
    row_totals[i] = row_sum

total = float(np.sum(np.array(column_totals)))

In [217]:
q_statistic = 0.0
for i in range(len(chi_square_matrix)):
    for j in range(2):
        expected = row_totals[i] * column_totals[j]/total
        if expected == 0.0:
            continue
        observed = chi_square_matrix[i][j]
        q_statistic = q_statistic + (((expected - observed)**2) / expected)
        
degrees_of_freedom = (len(crime_types) - 1) * (2 - 1)

### For fetching p-value from Chi-square distribution we are using scipy library

In [218]:
from scipy import stats
p_value = 1 - stats.chi2.cdf(q_statistic, degrees_of_freedom)

In [219]:
p_value

0.9976766497209638

In [210]:
q_statistic

26.311525071340164

### For such a high p-value observed (0.99) which is greater than significance threshold (0.05)
### we FAIL-TO-REJECT Null Hypothesis and conclude that crime is independent of before and after lockdown changes

In [211]:
from scipy.stats import chi2_contingency
arr = np.array(chi_square_matrix)
chi2_contingency(arr)

(26.311525071340164,
 0.9936668488526602,
 47,
 array([[8.48792268e+00, 5.04268956e+00],
        [2.20199502e+00, 1.30820906e+00],
        [2.47852463e+01, 1.47249578e+01],
        [1.20853680e+01, 7.17993808e+00],
        [4.25036249e+00, 2.52514771e+00],
        [7.47654125e+00, 4.44182610e+00],
        [1.04953228e+02, 6.23528945e+01],
        [2.88947835e+01, 1.71664409e+01],
        [1.21365772e+01, 7.21036155e+00],
        [2.29161110e+01, 1.36145012e+01],
        [9.78095464e+00, 5.81088209e+00],
        [2.29801225e+01, 1.36525306e+01],
        [4.48080383e+00, 2.66205331e+00],
        [1.35704345e+00, 8.06221861e-01],
        [2.77937860e+01, 1.65123364e+01],
        [1.86273416e+01, 1.10665359e+01],
        [5.73542890e+00, 3.40742824e+00],
        [3.41821321e+00, 2.03076639e+00],
        [3.37980632e+00, 2.00794879e+00],
        [1.36600505e+01, 8.11545967e+00],
        [9.07682833e+00, 5.39255943e+00],
        [8.61594565e+00, 5.11874823e+00],
        [7.09247235e+00, 4.21