In [1]:
%config IPCompleter.greedy=True

In [2]:
import os
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import math

In [3]:
root_path = './'

## SF Covid-19 Dataset

In [4]:
cases_df = pd.read_csv(root_path + '/data/COVID-19_Cases_Summarized_by_Date__Transmission_and_Case_Disposition.csv')

In [5]:
cases_df.isna().sum()

Date                     0
Transmission Category    0
Case Disposition         0
Case Count               0
dtype: int64

#### We can verify in above query, 0 MISSING values were found 

## Pre-processing steps for SF Covid-19 Dataset

1.   Converting Date column into date-time dtype



In [6]:
cases_df["Date"] = pd.to_datetime(cases_df["Date"], format="%Y/%m/%d")

In [7]:
# Verify by uncommenting and toggling with ascending=True/False
# cases_df.sort_values(by="Date", ascending=False)

## Crime Dataset

In [8]:
crime_df = pd.read_csv(root_path + '/data/Police_Department_Incident_Reports__2018_to_Present.csv')

In [9]:
crime_df["Police District"].value_counts()

Central       53323
Northern      46002
Mission       45971
Southern      42958
Tenderloin    33563
Bayview       27926
Ingleside     24320
Taraval       23231
Richmond      20263
Park          16076
Out of SF      9303
Name: Police District, dtype: int64

In [10]:
crime_df.shape

(342936, 36)

In [11]:
crime_df.isna().sum()

Incident Datetime                                            0
Incident Date                                                0
Incident Time                                                0
Incident Year                                                0
Incident Day of Week                                         0
Report Datetime                                              0
Row ID                                                       0
Incident ID                                                  0
Incident Number                                              0
CAD Number                                               79225
Report Type Code                                             0
Report Type Description                                      0
Filed Online                                            269212
Incident Code                                                0
Incident Category                                          162
Incident Subcategory                                   

## Pre-processing steps for Crime Dataset:


### Considering only non-null columns and Incident columns

In [12]:
columns_to_consider = ["Incident Datetime", 
                       "Incident Date", 
                       "Incident Time", 
                       "Incident Year", 
                       "Incident Day of Week", 
                       "Report Datetime", 
                       "Row ID", 
                       "Incident ID", 
                       "Incident Number", 
                       "Report Type Code", 
                       "Report Type Description", 
                       "Incident Code", 
                       "Incident Category", 
                       "Incident Subcategory", 
                       "Incident Description", 
                       "Resolution", 
                       "Police District"]

In [13]:
crime_df = crime_df[columns_to_consider]

In [14]:
crime_df = crime_df[crime_df["Incident Category"].notna()]

In [15]:
# Verify whhether nulls are removed through the above query
# crime_df.isnull().sum()

In [16]:
crime_df["Incident Date"] = pd.to_datetime(crime_df["Incident Date"], format="%Y/%m/%d")

In [17]:
crime_df["Incident Datetime"] = pd.to_datetime(crime_df["Incident Datetime"], format="%Y/%m/%d %I:%M:%S %p")

## Chi-Square testing Before Lockdown/After Lockdown

In [285]:
from dateutil.relativedelta import relativedelta

earliest_start_date = crime_df["Incident Date"].min()

lockdown_effective_date = datetime.datetime(2020, 3, 17)
first_corona_date = cases_df["Date"].min()
# earliest_start_date = first_corona_date
latest_date = cases_df["Date"].max()
previous_year_start = lockdown_effective_date - relativedelta(years=1)
previous_year_end = latest_date - relativedelta(years=1)

t = crime_df[["Incident ID", "Incident Category"]].drop_duplicates()
t = t["Incident Category"].value_counts()
most_frequent_crimes = list(t[:2].index)

most_frequent_crimes = ["Larceny Theft", "Malicious Mischief"]

In [286]:
before_lockdown_crime = crime_df[(crime_df["Incident Date"] >= previous_year_start) & (crime_df["Incident Date"] <= previous_year_end)]
after_lockdown_crime = crime_df[(crime_df["Incident Date"] >= lockdown_effective_date) & (crime_df["Incident Date"] <= latest_date)]

# Avoiding double counting same incident-category
before_lockdown_crime = before_lockdown_crime[["Incident ID", "Incident Category"]].drop_duplicates()
after_lockdown_crime = after_lockdown_crime[["Incident ID", "Incident Category"]].drop_duplicates()

before_days = (previous_year_end - previous_year_start).days + 1
after_days = (latest_date - lockdown_effective_date).days + 1

# Creating dictionary to hold mean crime types before and after lockdown
before_mean_crimes = before_lockdown_crime["Incident Category"].value_counts().divide(before_days).to_dict()
after_mean_crimes = after_lockdown_crime["Incident Category"].value_counts().divide(after_days).to_dict()

In [287]:
# There are some crimes that never occur, neither before lockdown, nor after lockdown
# because of which the expected value becomes 0
# to avoid division by 0 error we are considering only those crime-types where crime has occured
crimes_to_consider = []
for crime in most_frequent_crimes:
    if before_mean_crimes.get(crime, 0.0) == 0.0 and after_mean_crimes.get(crime, 0.0) == 0.0:
        continue
    crimes_to_consider.append(crime)

In [288]:
crimes_to_consider

['Larceny Theft', 'Malicious Mischief']

In [298]:
chi_square_matrix = []
i = 0
j = 0
for crime_dict in [before_mean_crimes, after_mean_crimes]:
    row = []
    for crime in crimes_to_consider:
        row.append(crime_dict.get(crime, 0.0))
    
    chi_square_matrix.append(row)

row_totals = [0.0 for i in range(len(chi_square_matrix))]
column_totals = [0.0 for i in range(2)]

for i in range(2):
    row_sum = 0.0
    for j in range(len(chi_square_matrix)):
        column_totals[j] = column_totals[j] + chi_square_matrix[i][j]
        row_sum += chi_square_matrix[i][j]
    
    row_totals[i] = row_sum

total = float(np.sum(np.array(column_totals)))

In [299]:
q_statistic = 0.0
for i in range(2):
    for j in range(len(chi_square_matrix)):
        expected = row_totals[i] * column_totals[j]/total
        observed = chi_square_matrix[i][j]
        print(str(expected) + " " + str(observed))
        q_statistic = q_statistic + (((expected - observed)**2) / expected)
        
degrees_of_freedom = (len(crime_types) - 1) * (2 - 1)

110.8489788045607 116.51020408163265
29.355102828092377 23.693877551020407
56.4571436444189 50.795918367346935
14.951019620887223 20.612244897959183


In [300]:
chi_square_matrix

[[116.51020408163265, 23.693877551020407],
 [50.795918367346935, 20.612244897959183]]

In [301]:
q_statistic

4.0922217631055355

In [302]:
row_totals

[140.20408163265307, 71.40816326530611]

In [303]:
column_totals

[167.30612244897958, 44.30612244897959]

In [304]:
total

211.61224489795916

### For fetching p-value from Chi-square distribution we are using scipy library

In [305]:
from scipy import stats
p_value = 1 - stats.chi2.cdf(q_statistic, degrees_of_freedom)

In [327]:
print(str(p_value) + " " + str(q_statistic))

1.0 4.0922217631055355


### For such a high p-value observed (0.99) which is greater than significance threshold (0.05)
### we FAIL-TO-REJECT Null Hypothesis and conclude that crime is independent of before and after lockdown changes

In [258]:
from scipy.stats import chi2_contingency
arr = np.array(chi_square_matrix)
chi2_contingency(arr)

(3.4012920000759284,
 0.06514537433648841,
 1,
 array([[110.8489788 ,  56.45714364],
        [ 29.35510283,  14.95101962]]))

In [320]:
cases_df["Case Disposition"].value_counts()

Confirmed    154
Death         28
Name: Case Disposition, dtype: int64

In [323]:
cases_df["Transmission Category"].value_counts()

Community       65
From Contact    65
Unknown         52
Name: Transmission Category, dtype: int64

In [324]:
cases_df.groupby("Transmission Category").sum()

Unnamed: 0_level_0,Case Count
Transmission Category,Unnamed: 1_level_1
Community,859
From Contact,466
Unknown,460


In [325]:
lockdown_effective_date

datetime.datetime(2020, 3, 17, 0, 0)

In [326]:
latest_date

Timestamp('2020-05-04 00:00:00')