**inferential statistics**

In [2]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.stats.proportion import proportions_ztest

In [4]:
# Load the questions table from CSV.
# Not loaded here (kept for reference):
# attorneyMerged = pd.read_csv("./clean_data/attorneyMerged.csv")
# clientMerged = pd.read_csv("./clean_data/clientMerged.csv")
# clientQuestion = pd.read_csv("./clean_data/clientQuestion.csv")
questions_path = "./data/questions.csv"
questions = pd.read_csv(questions_path)

# Number of questions per category, most frequent first.
category_counts = questions["Category"].value_counts()
print(category_counts)

Family and Children                  88753
Other                                37053
Housing and Homelessness             34755
Consumer Financial Questions         17273
Work, Employment and Unemployment    10187
Individual Rights                     8312
Health and Disability                 2707
Income Maintenance                    2688
Education                              794
Juvenile                               357
Name: Category, dtype: int64


In [6]:
# Two-proportion z-test: share of "Health and Disability" questions asked during the
# Covid-19 period versus all other times.
#
# H0: the proportion of health-related questions during the Covid-19 period is equal to
#     the proportion of health-related questions in other time periods.
# Ha: the proportion of health-related questions during the Covid-19 period is larger than
#     the proportion of health-related questions in other time periods.

# Window treated as the Covid-19 period (both bounds inclusive).
# NOTE(review): AskedOnUtc is compared against these string bounds as loaded from the CSV --
# presumably ISO-formatted text, so string order matches chronological order; confirm.
covid19_start = '2020-01-20 00:00:00'
covid19_end = '2022-01-25 07:43:43'
asked_on = questions['AskedOnUtc']

covid19Periodquestions = questions[asked_on.between(covid19_start, covid19_end)]
otherQuestions = questions[(asked_on < covid19_start) | (asked_on > covid19_end)]

# Sample sizes: number of questions in each period (100749 and 102130 in this run).
n_questions_covid19Period = len(covid19Periodquestions)
n_questions_other = len(otherQuestions)

# Successes: questions in the health category in each period (1415 and 1292 in this run).
health_category = "Health and Disability"
x_question_health_covid_19 = int((covid19Periodquestions['Category'] == health_category).sum())
x_question_health_other = int((otherQuestions['Category'] == health_category).sum())

# Echo the four inputs of the test, one per line.
for value in (n_questions_covid19Period, n_questions_other,
              x_question_health_covid_19, x_question_health_other):
    print(value)

# Assemble the inputs; the Covid-19 sample goes first so that alternative='larger'
# tests "Covid-19 proportion > other proportion".
sample_success_covid19 = x_question_health_covid_19
sample_size_covid19 = n_questions_covid19Period
sample_success_other = x_question_health_other
sample_size_other = n_questions_other
successes = np.array([sample_success_covid19, sample_success_other])
samples = np.array([sample_size_covid19, sample_size_other])

# One-sided two-sample z-test for proportions.
stat, p_value = proportions_ztest(count=successes, nobs=samples, alternative='larger')
print(p_value)

# Conclusion: the p-value (about 0.003) is smaller than the significance level 0.05, so we
# reject the null hypothesis and conclude that the proportion of health-related questions
# during the Covid-19 period is larger than the proportion in other time periods.

100749
102130
1415
1292
0.0031037347765141407


In [8]:
# Two-proportion z-test: share of "Work, Employment and Unemployment" questions asked during
# the Covid-19 period versus all other times. Reuses covid19Periodquestions, otherQuestions
# and the two sample sizes defined in an earlier cell.
#
# H0: the proportion of work-related questions during the Covid-19 period is equal to
#     the proportion of work-related questions in other time periods.
# Ha: the proportion of work-related questions during the Covid-19 period is larger than
#     the proportion of work-related questions in other time periods.

# Successes: questions in the work category in each period (6141 and 4046 in this run).
work_category = "Work, Employment and Unemployment"
x_question_unemployment_covid_19 = int((covid19Periodquestions['Category'] == work_category).sum())
x_question_unemployment_other = int((otherQuestions['Category'] == work_category).sum())

for value in (x_question_unemployment_covid_19, x_question_unemployment_other):
    print(value)

# Assemble the inputs; the Covid-19 sample goes first so that alternative='larger'
# tests "Covid-19 proportion > other proportion".
sample_success_unemployment = x_question_unemployment_covid_19
sample_size_unemployment = n_questions_covid19Period
sample_success_other = x_question_unemployment_other
sample_size_other = n_questions_other
successes = np.array([sample_success_unemployment, sample_success_other])
samples = np.array([sample_size_unemployment, sample_size_other])

# One-sided two-sample z-test for proportions.
stat, p_value = proportions_ztest(count=successes, nobs=samples, alternative='larger')
print(p_value)

# Conclusion: the p-value (about 1.3e-107) is smaller than the significance level 0.05, so we
# reject the null hypothesis and conclude that the proportion of work-related questions
# during the Covid-19 period is larger than in other time periods.
# The p-value alone does not measure how much larger; for the size of the difference compare
# the sample proportions (6141/100749, about 6.1%, versus 4046/102130, about 4.0%).

6141
4046
1.3211870506632954e-107
