In [1]:
%config IPCompleter.greedy=True

In [2]:
import os
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

In [3]:
# Base directory that the data-file paths in the loading cells are built from.
root_path = './'

## SF Covid-19 Dataset

In [4]:
# Load the SF COVID-19 case summary. os.path.join builds the path from its
# components, avoiding the doubled separator ('.//data/...') that plain
# string concatenation of root_path and '/data/...' produced.
cases_df = pd.read_csv(
    os.path.join(
        root_path,
        'data',
        'COVID-19_Cases_Summarized_by_Date__Transmission_and_Case_Disposition.csv',
    )
)

In [5]:
# Count the missing values in every column of the COVID-19 cases frame.
cases_df.isnull().sum()

Date                     0
Transmission Category    0
Case Disposition         0
Case Count               0
dtype: int64

#### The query above confirms that the dataset contains 0 missing values

## Pre-processing steps for SF Covid-19 Dataset

1.   Converting Date column into date-time dtype



In [6]:
# Parse the 'Date' strings (year/month/day) into datetime64 values.
cases_df["Date"] = cases_df["Date"].pipe(pd.to_datetime, format="%Y/%m/%d")

In [7]:
# Verify by uncommenting and toggling with ascending=True/False
# cases_df.sort_values(by="Date", ascending=False)

## Crime Dataset

In [8]:
# Load the police incident reports. os.path.join builds the path from its
# components, avoiding the doubled separator ('.//data/...') that plain
# string concatenation of root_path and '/data/...' produced.
crime_df = pd.read_csv(
    os.path.join(
        root_path,
        'data',
        'Police_Department_Incident_Reports__2018_to_Present.csv',
    )
)

In [9]:
# (rows, columns) of the incident-report frame as loaded from the CSV.
crime_df.shape

(342936, 36)

In [10]:
# Count the missing values in every column of the incident-report frame.
crime_df.isnull().sum()

Incident Datetime                                            0
Incident Date                                                0
Incident Time                                                0
Incident Year                                                0
Incident Day of Week                                         0
Report Datetime                                              0
Row ID                                                       0
Incident ID                                                  0
Incident Number                                              0
CAD Number                                               79225
Report Type Code                                             0
Report Type Description                                      0
Filed Online                                            269212
Incident Code                                                0
Incident Category                                          162
Incident Subcategory                                   

## Pre-processing steps for Crime Dataset:


### Considering only non-null columns and Incident columns

In [11]:
# Columns kept for the analysis, in the order they should appear in the frame.
columns_to_consider = [
    # When the incident happened / was reported
    "Incident Datetime", "Incident Date", "Incident Time",
    "Incident Year", "Incident Day of Week", "Report Datetime",
    # Identifiers
    "Row ID", "Incident ID", "Incident Number",
    # Report type
    "Report Type Code", "Report Type Description",
    # What happened
    "Incident Code", "Incident Category", "Incident Subcategory",
    "Incident Description",
    # Outcome and location
    "Resolution", "Police District",
]

In [12]:
# Restrict the frame to the selected columns (all rows are kept).
crime_df = crime_df.loc[:, columns_to_consider]

In [13]:
# Drop the rows that have no incident category. The explicit .copy() makes
# the result an independent frame, so assigning columns on it in later cells
# does not trigger pandas' SettingWithCopyWarning.
crime_df = crime_df[crime_df["Incident Category"].notna()].copy()

In [14]:
# Verify whether nulls are removed through the above query
# crime_df.isnull().sum()

In [15]:
# Parse the 'Incident Date' strings (year/month/day) into datetime64 values.
crime_df["Incident Date"] = crime_df["Incident Date"].pipe(pd.to_datetime, format="%Y/%m/%d")

In [16]:
# Parse the 12-hour-clock 'Incident Datetime' strings into datetime64 values.
crime_df["Incident Datetime"] = crime_df["Incident Datetime"].pipe(
    pd.to_datetime, format="%Y/%m/%d %I:%M:%S %p"
)

In [17]:
# Inspect the column dtypes; the two converted columns should show datetime64[ns].
crime_df.dtypes

Incident Datetime          datetime64[ns]
Incident Date              datetime64[ns]
Incident Time                      object
Incident Year                       int64
Incident Day of Week               object
Report Datetime                    object
Row ID                              int64
Incident ID                         int64
Incident Number                     int64
Report Type Code                   object
Report Type Description            object
Incident Code                       int64
Incident Category                  object
Incident Subcategory               object
Incident Description               object
Resolution                         object
Police District                    object
dtype: object

## Visualizations

In [18]:
# Preview the first row of the cleaned incident-report frame.
crime_df.iloc[:1]

Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Report Datetime,Row ID,Incident ID,Incident Number,Report Type Code,Report Type Description,Incident Code,Incident Category,Incident Subcategory,Incident Description,Resolution,Police District
0,2020-02-03 14:45:00,2020-02-03,14:45,2020,Monday,2020/02/03 05:50:00 PM,89881675000,898816,200085557,II,Initial,75000,Missing Person,Missing Person,Found Person,Open or Active,Taraval


In [19]:
def convert_agg_df_into_flatten_df(df):
    """Return ``df`` with its index reset and its column labels flattened.

    After ``reset_index()``, every tuple column label (as produced by a
    multi-level aggregation) is turned into one string by joining its levels
    with a single space and stripping surrounding whitespace, e.g.
    ``('v', 'sum') -> 'v sum'`` and ``('k', '') -> 'k'``.

    Labels that are not tuples (a frame whose columns are already flat) are
    left untouched; joining a plain string character by character would turn
    ``'count'`` into ``'c o u n t'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to flatten; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh integer index and flat column labels.
    """
    flat_df = df.reset_index()
    flat_df.columns = [
        # str() lets non-string levels (e.g. integers) be joined as well.
        ' '.join(str(level) for level in col).strip() if isinstance(col, tuple) else col
        for col in flat_df.columns.values
    ]

    return flat_df

# df = crime_df.groupby(['Incident Date'])['Incident Date'].agg(["count"]).reset_index()
# df.head()

#### The unique first reports below are the unique crimes that occurred (we filter on the report types listed below because we want to avoid counting the same crime twice through other report types, i.e. supplemental reports)

In [20]:
# Report types that are treated as the first report of a crime.
first_report_types = ['Initial', 'Initial Supplement', 'Vehicle Initial', 'Coplogic Initial']
is_first_report = crime_df["Report Type Description"].isin(first_report_types)
unique_first_reports = crime_df[is_first_report]

# Number of reports kept per report type.
unique_first_reports["Report Type Description"].value_counts()

Initial               213832
Coplogic Initial       65508
Initial Supplement     27882
Vehicle Initial        16277
Name: Report Type Description, dtype: int64

In [21]:
# Number of first reports per incident date, as a two-column frame
# ('date', 'count') ordered chronologically.
#
# The column names are set explicitly with rename_axis / reset_index(name=...)
# rather than by renaming "index" and "Incident Date": value_counts() labels
# its result differently across pandas versions (pandas >= 2.0 names the
# values "count" and the index after the column), so the rename-based version
# ends up with two "count" columns and no "date" column there.
df = (
    unique_first_reports['Incident Date']
    .value_counts()
    .rename_axis("date")
    .reset_index(name="count")
    .sort_values(by=["date"])
)
df.head()

Unnamed: 0,date,count
14,2018-01-01,478
453,2018-01-02,380
265,2018-01-03,405
107,2018-01-04,433
156,2018-01-05,424


# Required Inference 2

In [22]:
from datetime import timedelta

## Inferences for Deaths

In [23]:
# Keep only the rows whose disposition is 'Death', then total them per date.
is_death = cases_df['Case Disposition'].eq('Death')
death_cases = cases_df[is_death]
deaths_per_day = death_cases.groupby('Date')['Case Count'].sum()
death_counts = deaths_per_day.reset_index()

In [24]:
# Most recent date that has a death record.
latest_date = death_counts['Date'].sort_values(ascending=False).iloc[0]

# Boundaries one and two weeks before that date.
one_week = timedelta(days=7)
last_week_date = latest_date - one_week
second_last_week_date = latest_date - 2 * one_week

In [25]:
# Death records that fall after last_week_date, i.e. within the most recent week.
last_week_death_counts = death_counts[death_counts['Date'] > last_week_date]

# X holds one count per calendar day for the 7 days following last_week_date;
# a day with no record gets 0. A single reindex over the date range replaces
# the per-day Python loop, which rebuilt a set of dates and re-filtered the
# frame on every iteration.
last_week_days = pd.date_range(start=last_week_date + timedelta(days=1), periods=7, freq='D')
X = (
    last_week_death_counts.set_index('Date')['Case Count']
    .reindex(last_week_days, fill_value=0)
    .to_numpy()
)

In [26]:
# Death records in the week before last: after second_last_week_date, up to
# and including last_week_date.
second_last_week_death_counts = death_counts.loc[
    (death_counts['Date'] > second_last_week_date) & (death_counts['Date'] <= last_week_date)
]

# Y holds one count per calendar day for the 7 days following
# second_last_week_date; a day with no record gets 0. A single reindex over
# the date range replaces the per-day Python loop, which rebuilt a set of
# dates and re-filtered the frame on every iteration.
second_last_week_days = pd.date_range(
    start=second_last_week_date + timedelta(days=1), periods=7, freq='D'
)
Y = (
    second_last_week_death_counts.set_index('Date')['Case Count']
    .reindex(second_last_week_days, fill_value=0)
    .to_numpy()
)

### Wald's Test
#### One sample test
Hypotheses:  
$H_0: \mu = \mu_0$  
$H_1: \mu \neq \mu_0$

Assuming the original distribution is Poisson($\lambda$), we get  
$\hat\mu_{MLE} = \hat\lambda_{MLE} = \bar X$ and  
$\hat se(\hat\mu) = \sqrt{Var(\bar X)} = \sqrt{\frac{Var(X)}{n}} = \sqrt{\frac{\hat\lambda_{MLE}}{n}} = \sqrt{\frac{\bar X}{n}}$

In [27]:
# One-sample Wald test inputs: X is the sample, the mean of Y is the
# hypothesised value.
n = X.size

mu_hat = X.mean()
mu_0 = Y.mean()
# Under the Poisson assumption the variance is estimated by the sample mean.
sample_variance = mu_hat
se_hat = np.sqrt(sample_variance / n)

In [28]:
# Wald statistic: absolute standardised distance between estimate and mu_0.
w = np.absolute((mu_hat - mu_0) / se_hat)

In [29]:
# Display the one-sample Wald statistic computed in the previous cell.
w

1.3416407864998738

For $\alpha = 0.05$, $Z_{\alpha/2} = 1.96$  
Since the Wald statistic satisfies $|w| \leq Z_{\alpha/2}$, we fail to reject the null hypothesis $H_0$: the data do not provide significant evidence that the mean differs from $\mu_0$.

#### Two sample test
Hypotheses:  
$H_0: \mu_x = \mu_y$  
$H_1: \mu_x \neq \mu_y$

Similar to one sample test, assuming the original distributions are Poisson($\lambda_x$) and Poisson($\lambda_y$) distributed, we get  
$\hat \mu_x = \bar X$ , $\hat \mu_y = \bar Y$ and  
$\hat se(\hat \mu_x) = \sqrt{\frac{\bar X}{n}}$ , $\hat se(\hat \mu_y) = \sqrt{\frac{\bar Y}{m}}$

In [30]:
# Two-sample Wald test inputs: sizes and means of both samples.
n = X.size
m = Y.size

mu_x = X.mean()
mu_y = Y.mean()
# Under the Poisson assumption each variance is estimated by its sample mean.
sample_variance_x = mu_x
sample_variance_y = mu_y
se_hat = np.sqrt(sample_variance_x / n + sample_variance_y / m)

In [31]:
# Wald statistic: absolute standardised difference of the two sample means.
w = np.absolute((mu_x - mu_y) / se_hat)

In [32]:
# Display the two-sample Wald statistic computed in the previous cell.
w

1.133893419027682

For $\alpha = 0.05$, $Z_{\alpha/2} = 1.96$  
Since the Wald statistic satisfies $|w| \leq Z_{\alpha/2}$, we fail to reject the null hypothesis $H_0$: the data do not provide significant evidence that the two means differ.

### T Test
#### One sample test
Hypotheses:  
$H_0: \mu = \mu_0$  
$H_1: \mu \neq \mu_0$

In [33]:
# One-sample t test inputs: X is the sample, the mean of Y is the
# hypothesised value.
n = X.size

X_bar = X.mean()
mu_0 = Y.mean()
# NOTE(review): this variance divides by n; the textbook t statistic uses the
# n-1 (Bessel-corrected) estimator. Confirm which convention is intended.
sample_variance = 1/n * ((X - X_bar) ** 2).sum()
se_hat = np.sqrt(sample_variance / n)

In [34]:
# t statistic: absolute standardised distance between sample mean and mu_0.
t = np.absolute((X_bar - mu_0) / se_hat)
t

1.6201851746019653

For $\alpha = 0.05$ and $df = 6$, $t_{df,\alpha/2} = 2.447$  
Since the t statistic satisfies $|t| \leq t_{n-1, \alpha/2}$, we fail to reject the null hypothesis $H_0$: the data do not provide significant evidence that the mean differs from $\mu_0$.

### Two sample test (Paired)
Hypotheses:  
$H_0: \mu_x = \mu_y$  
$H_1: \mu_x \neq \mu_y$  
$D_i = X_i - Y_i$  

In [35]:
# Day-by-day differences between the two weeks (paired samples).
D = np.subtract(X, Y)

In [36]:
# Paired t test inputs, computed on the differences D.
n = D.size

D_bar = D.mean()
# NOTE(review): this variance divides by n; the textbook t statistic uses the
# n-1 (Bessel-corrected) estimator. Confirm which convention is intended.
sample_variance_d = 1/n * ((D - D_bar) ** 2).sum()
se_hat = np.sqrt(sample_variance_d / n)

In [37]:
# t statistic: absolute standardised mean difference.
t = np.absolute(D_bar / se_hat)
t

1.5566235649883124

For $\alpha = 0.05$ and $df = 6$, $t_{df,\alpha/2} = 2.447$  
Since the t statistic satisfies $|t| \leq t_{n-1, \alpha/2}$, we fail to reject the null hypothesis $H_0$: the data do not provide significant evidence that the two means differ.

### Two sample test (Unpaired)
Hypotheses:  
$H_0: \mu_x = \mu_y$  
$H_1: \mu_x \neq \mu_y$  
$\bar D= \bar X - \bar Y$  

In [38]:
# Unpaired two-sample t test inputs: sizes, means and variances of both samples.
n = X.size
m = Y.size

X_bar = X.mean()
Y_bar = Y.mean()
# NOTE(review): these variances divide by n and m; the textbook t statistic
# uses the Bessel-corrected estimators. Confirm which convention is intended.
sample_variance_x = 1/n * ((X - X_bar) ** 2).sum()
sample_variance_y = 1/m * ((Y - Y_bar) ** 2).sum()
se_hat = np.sqrt(sample_variance_x / n + sample_variance_y / m)

In [39]:
# t statistic: absolute standardised difference of the two sample means.
t = np.absolute((X_bar - Y_bar) / se_hat)
t

1.3612278194595453

For $\alpha = 0.05$ and $df = n+m-2 = 12$, $t_{df,\alpha/2} = 2.179$  
Since the t statistic satisfies $|t| \leq t_{n+m-2, \alpha/2}$, we fail to reject the null hypothesis $H_0$: the data do not provide significant evidence that the two means differ.