In [34]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from scipy import stats

In [35]:
# Load the raw input tables from the working directory.
# The four per-water-system tables are read in one pass; each is later joined
# on its 'PWSID' column.
county, access, afford, tmf = (
    pd.read_csv(csv_path)
    for csv_path in ('county.csv', 'accessability.csv', 'affordability.csv', 'TMF.csv')
)

# The demographic table uses its first CSV column as the row index.
data = pd.read_csv('demographic.csv', index_col=0)

# Data processing

In [36]:
def _county_mean(system_table, required_col=None):
    """Aggregate a per-water-system table to one row of column means per county.

    `system_table` is left-joined onto the notebook-level `county` table by
    'PWSID', so every row of `county` is kept even when `system_table` has no
    match for it. The join key and the two suffixed 'Water System Name'
    columns are then dropped and the remaining columns are averaged per
    'County'.

    Parameters
    ----------
    system_table : pandas.DataFrame
        Table with a 'PWSID' column and a 'Water System Name' column.
    required_col : str, optional
        When given, rows in which this column is missing are discarded before
        averaging (this affects the means of every other column as well).

    Returns
    -------
    pandas.DataFrame
        Indexed by 'County', holding the per-county mean of each numeric column.
    """
    merged = pd.merge(left=county, right=system_table, how='left', on='PWSID')
    merged = merged.drop(['PWSID', 'Water System Name_x', 'Water System Name_y'], axis=1)
    if required_col is not None:
        merged = merged[merged[required_col].notna()]
    # pandas >= 2.0 raises on non-numeric columns instead of silently dropping
    # them; asking for the numeric-only mean gives the same result on both.
    return merged.groupby('County').mean(numeric_only=True)


# process accessibility data for each county, use 'mean' to aggregate.
county_access = _county_mean(access)

# process affordability data for each county, use 'mean' to aggregate.
# Systems without a reported 'Drinking Water Charges' value are excluded first.
county_afford = _county_mean(afford, required_col='Drinking Water Charges')

# process tmf data for each county, use 'mean' to aggregate.
county_tmf = _county_mean(tmf)

In [37]:
# Column 0 of the demographic table is the regression target; every remaining
# column is a protected (demographic) feature.
y, X = data.iloc[:, 0], data.iloc[:, 1:]

# Put the protected features side by side with the non-sensitive county-level
# features; pd.concat(axis=1) lines the tables up on their row index.
# NOTE(review): this rebinds `data`, so the cell is not idempotent. Re-running
# it without reloading the CSVs takes `y` from a different column.
data = pd.concat(
    [X, county_access, county_afford, county_tmf],
    axis=1,
)

# Linear Regression

In [38]:
# scikit-learn fit of the target on all predictors; the fitted estimator is
# kept as `reg`.
reg = LinearRegression()
reg.fit(data, y)

# statsmodels fit of the same model, used for the printed summary table.
# add_constant supplies the constant column that OLS uses as the intercept.
X2 = sm.add_constant(data)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                                     OLS Regression Results                                     
Dep. Variable:     AVERAGE of Total Weighted Risk Score   R-squared:                       0.463
Model:                                              OLS   Adj. R-squared:                  0.335
Method:                                   Least Squares   F-statistic:                     3.613
Date:                                  Tue, 03 May 2022   Prob (F-statistic):            0.00101
Time:                                          11:45:14   Log-Likelihood:                 31.910
No. Observations:                                    58   AIC:                            -39.82
Df Residuals:                                        46   BIC:                            -15.09
Df Model:                                            11                                         
Covariance Type:                              nonrobust                                         
                              

# T-test

In [39]:
def t_test(col, frame=None, target='AVERAGE of Total Weighted Risk Score'):
    """Compare `target` between rows at/above and rows below the mean of `col`.

    Rows of `frame` are split by whether their `col` value is >= or < the
    column mean; rows whose `col` value is missing fall in neither group.
    The mean of `target` in each group and the p-value of scipy's
    unequal-variance (Welch) t-test between the groups are printed and
    returned.

    Parameters
    ----------
    col : str
        Column used to split the rows.
    frame : pandas.DataFrame, optional
        Table holding both `col` and `target`. Defaults to the notebook-level
        `data`, which keeps the original one-argument call working.
    target : str, optional
        Column whose group means are compared.

    Returns
    -------
    tuple
        ``(mean_at_or_above, mean_below, p_value)``. A mean is NaN when its
        group is empty; the p-value is NaN when either group has fewer than
        two observations, since the test is undefined there.
    """
    if frame is None:
        frame = data

    # Compute the split point once and reuse it for both groups.
    threshold = frame[col].mean()

    # Missing target values are dropped so that a single gap does not turn
    # every printed number into NaN.
    at_or_above = frame.loc[frame[col] >= threshold, target].dropna()
    below = frame.loc[frame[col] < threshold, target].dropna()

    # Series.mean() returns NaN for an empty group instead of raising
    # ZeroDivisionError (e.g. when `col` is constant, nothing is below its mean).
    mean_at_or_above = at_or_above.mean()
    mean_below = below.mean()
    print('mean values are:', mean_at_or_above, mean_below)

    if len(at_or_above) < 2 or len(below) < 2:
        p_value = float('nan')
    else:
        p_value = stats.ttest_ind(below, at_or_above, equal_var=False)[1]
    print('p-value is', p_value)

    return mean_at_or_above, mean_below, p_value

# t_test reads the target column from `data`, so attach `y` to it.
# The guard keeps the cell idempotent: without it, every re-run appends another
# copy of the target column, after which selecting that column yields a
# DataFrame rather than a Series and t_test fails.
if y.name not in data.columns:
    data = pd.concat([data, y], axis=1)
columns = list(data.columns)

# Only the first seven columns are tested (the printed output shows these are
# the sex / race share columns, 'Male' through 'Some other race').
for col in columns[:7]:
    print('For ' + col)
    t_test(col)
    print()

For Male
mean values are: 0.7371428571428572 0.7967567567567567
p-value is 0.24397108701545372

For White
mean values are: 0.7255882352941176 0.8454166666666666
p-value is 0.03301880295705186

For Black or African American
mean values are: 0.7933333333333334 0.7688372093023257
p-value is 0.7005702704383296

For American Indian and Alaska Native
mean values are: 0.7361111111111112 0.7927500000000002
p-value is 0.23288957307985603

For Asian
mean values are: 0.7794444444444445 0.7732500000000002
p-value is 0.9170218315609615

For Native Hawaiian and Other Pacific Islander
mean values are: 0.7985 0.7628947368421054
p-value is 0.4915122637485214

For Some other race
mean values are: 0.8204 0.7409090909090909
p-value is 0.14311641759906482

