## Education and Performance

In [232]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [40]:
# Read the cleaned panel-study CSV into a DataFrame and show it as the cell output.
data = pd.read_csv(
    'data/CitieSHealth_BCN_DATA_PanelStudy_20220414_Clean.csv'
)
data

In [233]:
# Histogram of 'performance', with the bars coloured by 'education'.
sns.histplot(data=data, x='performance', hue='education')

# Save the figure that was just drawn into the outputs folder.
plt.savefig('outputs/histogram_for_performance')

In [234]:
# Histogram of 'occurrence_mental', with the bars coloured by 'education'.
sns.histplot(data=data, x='occurrence_mental', hue='education')

# Save the figure that was just drawn into the outputs folder.
plt.savefig('outputs/histogram_for_occurrence_mental')

In [235]:
# Histogram of 'stress', with the bars coloured by 'education'.
sns.histplot(data=data, x='stress', hue='education')

# Save the figure that was just drawn into the outputs folder.
plt.savefig('outputs/histogram_for_stress')

In [236]:
# Bar plot of 'stress' (y) for each 'education' category (x).
sns.barplot(data=data, x='education', y='stress')

# Save the figure that was just drawn into the outputs folder.
plt.savefig('outputs/barplot_for_stressVsEducation')

In [237]:
# Bar plot of 'occurrence_mental' (y) for each 'education' category (x).
sns.barplot(data=data, x='education', y='occurrence_mental')

# Save the figure that was just drawn into the outputs folder.
plt.savefig('outputs/barplot_for_occurrenceVsEducation')

In [238]:
# Bar plot of 'performance' (y) for each 'education' category (x).
sns.barplot(data=data, x='education', y='performance')

# Save the figure that was just drawn into the outputs folder.
plt.savefig('outputs/barplot_for_performanceVsEducation')

In [239]:
# Scatter plot of 'performance' against 'stress', points coloured by 'education'.
sns.scatterplot(data=data, x='stress', y='performance', hue='education')

# Save the figure that was just drawn into the outputs folder.
plt.savefig('outputs/scatterplot_for_stressVsperformance')

In [240]:
# Scatter plot of 'performance' against 'occurrence_mental', points coloured by 'education'.
sns.scatterplot(data=data, x='occurrence_mental', y='performance', hue='education')

# Save the figure that was just drawn into the outputs folder.
plt.savefig('outputs/scatterplot_for_occurrenceVsperformance')

In [220]:
# Mean of 'occurrence_mental' within each of the three education groups.
# These names are reused by the hypothesis-test cells further down.
average_mental_health_issue_occurence_univ = np.mean(
    data.loc[data['education'] == 'university', 'occurrence_mental']
)
average_mental_health_issue_occurence_bac = np.mean(
    data.loc[data['education'] == 'baccalaureate', 'occurrence_mental']
)
average_mental_health_issue_occurence_pri_less = np.mean(
    data.loc[data['education'] == 'primary or less', 'occurrence_mental']
)

# Report each group mean to three decimal places.
print(
    'Average Mental Health Issue Occurence for people with University education is {:0.3f}'
    .format(average_mental_health_issue_occurence_univ)
)
print(
    'Average Mental Health Issue Occurence for people with Baccalaureate education is {:0.3f}'
    .format(average_mental_health_issue_occurence_bac)
)
print(
    'Average Mental Health Issue Occurence for people with Primary or less education is {:0.3f}'
    .format(average_mental_health_issue_occurence_pri_less)
)

In [221]:
# Mean of 'stress' within each of the three education groups.
# These names are reused by the hypothesis-test cells further down.
average_stress_univ = np.mean(
    data.loc[data['education'] == 'university', 'stress']
)
average_stress_bac = np.mean(
    data.loc[data['education'] == 'baccalaureate', 'stress']
)
average_stress_pri_less = np.mean(
    data.loc[data['education'] == 'primary or less', 'stress']
)

# Report each group mean to three decimal places.
print(
    'Average Stress Level for people with University education is {:0.3f}'
    .format(average_stress_univ)
)
print(
    'Average Stress Level for people with Baccalaureate education is {:0.3f}'
    .format(average_stress_bac)
)
print(
    'Average Stress Level for people with Primary or less education is {:0.3f}'
    .format(average_stress_pri_less)
)

In [222]:
# Mean of 'performance' within each of the three education groups.
# These names are reused by the hypothesis-test cells further down.
average_performance_univ = np.mean(
    data.loc[data['education'] == 'university', 'performance']
)
average_performance_bac = np.mean(
    data.loc[data['education'] == 'baccalaureate', 'performance']
)
average_performance_pri_less = np.mean(
    data.loc[data['education'] == 'primary or less', 'performance']
)

# Report each group mean to three decimal places.
print(
    'Average Stroop Test score for people with University education is {:0.3f}'
    .format(average_performance_univ)
)
print(
    'Average Stroop Test score for people with Baccalaureate education is {:0.3f}'
    .format(average_performance_bac)
)
print(
    'Average Stroop Test score for people with Primary or less education is {:0.3f}'
    .format(average_performance_pri_less)
)

In [210]:
def hypothesis_testing(educ1, educ2, column, data, observed_diff, cutoff,
                       n_permutations=20000, seed=None):
    """Permutation test for a difference in group means between two education levels.

    The values of ``data['education']`` are shuffled ``n_permutations`` times.
    After each shuffle, the absolute difference between the mean of ``column``
    over rows labelled ``educ1`` and over rows labelled ``educ2`` is recorded.
    The p-value is the fraction of shuffles whose difference is greater than
    or equal to ``observed_diff``. The p-value and the verdict are printed.

    Parameters
    ----------
    educ1, educ2 : str
        The two values of the ``education`` column to compare.
    column : str
        Name of the numeric column whose group means are compared.
    data : pandas.DataFrame
        Frame containing ``education`` and ``column``. It is only read; no
        temporary column is added to it.
    observed_diff : float
        Absolute difference in means measured on the unshuffled data.
    cutoff : float
        Significance level; the null hypothesis is rejected when the
        p-value is less than or equal to this value.
    n_permutations : int, optional
        Number of label shuffles (default 20000).
    seed : int or None, optional
        Seed passed to ``numpy.random.default_rng``. ``None`` (the default)
        gives a different shuffle sequence on every call.

    Returns
    -------
    None
        The result is reported via ``print`` only.

    Raises
    ------
    ValueError
        If ``n_permutations`` is smaller than 1.
    """
    if n_permutations < 1:
        raise ValueError('n_permutations must be at least 1')

    rng = np.random.default_rng(seed)

    # Pull the two columns out once. Shuffling a separate label array keeps the
    # caller's DataFrame untouched and avoids copying the frame every iteration.
    labels = data['education'].to_numpy()
    values = data[column]

    diffs = np.empty(n_permutations, dtype=float)
    for i in range(n_permutations):
        shuffled = rng.permutation(labels)
        # Series.mean() skips NaN and yields NaN for an empty group.
        mean1 = values[shuffled == educ1].mean()
        mean2 = values[shuffled == educ2].mean()
        diffs[i] = abs(mean1 - mean2)

    # An ndarray comparison works for Python floats as well as NumPy scalars;
    # NaN differences compare False and therefore never count as extreme.
    pvalue = float(np.mean(diffs >= observed_diff))

    if pvalue <= cutoff:
        print('P value is {:0.3f}. Thus, Reject the Null Hypothesis'.format(pvalue))
    else:
        print('P value is {:0.3f}. Thus, Fail to Reject the Null Hypothesis'.format(pvalue))

In [211]:
# Observed absolute gap in mean 'occurrence_mental': university vs baccalaureate.
observed_diff = np.abs(
    average_mental_health_issue_occurence_univ
    - average_mental_health_issue_occurence_bac
)
hypothesis_testing(
    educ1='university',
    educ2='baccalaureate',
    column='occurrence_mental',
    data=data,
    observed_diff=observed_diff,
    cutoff=0.05,
)

In [212]:
# Observed absolute gap in mean 'occurrence_mental': university vs primary or less.
observed_diff = np.abs(
    average_mental_health_issue_occurence_univ
    - average_mental_health_issue_occurence_pri_less
)
hypothesis_testing(
    educ1='university',
    educ2='primary or less',
    column='occurrence_mental',
    data=data,
    observed_diff=observed_diff,
    cutoff=0.05,
)

In [213]:
# Observed absolute gap in mean 'occurrence_mental': baccalaureate vs primary or less.
observed_diff = np.abs(
    average_mental_health_issue_occurence_bac
    - average_mental_health_issue_occurence_pri_less
)
hypothesis_testing(
    educ1='baccalaureate',
    educ2='primary or less',
    column='occurrence_mental',
    data=data,
    observed_diff=observed_diff,
    cutoff=0.05,
)

In [214]:
# Observed absolute gap in mean 'stress': university vs baccalaureate.
observed_diff = np.abs(
    average_stress_univ
    - average_stress_bac
)
hypothesis_testing(
    educ1='university',
    educ2='baccalaureate',
    column='stress',
    data=data,
    observed_diff=observed_diff,
    cutoff=0.05,
)

In [215]:
# Observed absolute gap in mean 'stress': university vs primary or less.
observed_diff = np.abs(
    average_stress_univ
    - average_stress_pri_less
)
hypothesis_testing(
    educ1='university',
    educ2='primary or less',
    column='stress',
    data=data,
    observed_diff=observed_diff,
    cutoff=0.05,
)

In [216]:
# Observed absolute gap in mean 'stress': primary or less vs baccalaureate.
observed_diff = np.abs(
    average_stress_pri_less
    - average_stress_bac
)
hypothesis_testing(
    educ1='primary or less',
    educ2='baccalaureate',
    column='stress',
    data=data,
    observed_diff=observed_diff,
    cutoff=0.05,
)

In [217]:
# Observed absolute gap in mean 'performance': university vs baccalaureate.
observed_diff = np.abs(
    average_performance_univ
    - average_performance_bac
)
hypothesis_testing(
    educ1='university',
    educ2='baccalaureate',
    column='performance',
    data=data,
    observed_diff=observed_diff,
    cutoff=0.05,
)

In [218]:
# Observed absolute gap in mean 'performance': university vs primary or less.
observed_diff = np.abs(
    average_performance_univ
    - average_performance_pri_less
)
hypothesis_testing(
    educ1='university',
    educ2='primary or less',
    column='performance',
    data=data,
    observed_diff=observed_diff,
    cutoff=0.05,
)

In [219]:
# Observed absolute gap in mean 'performance': baccalaureate vs primary or less.
observed_diff = np.abs(
    average_performance_bac
    - average_performance_pri_less
)
hypothesis_testing(
    educ1='baccalaureate',
    educ2='primary or less',
    column='performance',
    data=data,
    observed_diff=observed_diff,
    cutoff=0.05,
)