In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import t, ttest_ind_from_stats
from numpy.random import seed
medical = pd.read_csv('insurance2.csv')
print(medical.head())
print(medical.shape)

   age  sex     bmi  children  smoker  region      charges  insuranceclaim
0   19    0  27.900         0       1       3  16884.92400               1
1   18    1  33.770         1       0       2   1725.55230               1
2   28    1  33.000         3       0       2   4449.46200               0
3   33    1  22.705         0       0       1  21984.47061               0
4   32    1  28.880         0       0       1   3866.85520               1
(1338, 8)


In [5]:
#1 Plot the histogram of charges. Calculate mean and std
charg_mean = medical['charges'].mean()
charg_std = medical['charges'].std()
print('mean - ', charg_mean)
print('std - ', charg_std)
print(medical['charges'].describe())
print('Mean seems to be less appropriate to analyze costs than the median. Half of the data is less than $9382 while the mean is $13270 due to several large charges(need to investigate that they are not outliers). The wide range caused by the large charges is indicated by the large standard deviation of the charges. This large standard deviation needs to be understood.')

mean -  13270.422265141257
std -  12110.011236693994
count     1338.000000
mean     13270.422265
std      12110.011237
min       1121.873900
25%       4740.287150
50%       9382.033000
75%      16639.912515
max      63770.428010
Name: charges, dtype: float64
Mean seems to be less appropriate to analyze costs than the median. Half of the data is less than $9382 while the mean is $13270 due to several large charges(need to investigate that they are not outliers). The wide range caused by the large charges is indicated by the large standard deviation of the charges. This large standard deviation needs to be understood.


In [45]:
#2 They are concerns the actual average charge has fallen below $12000
print('We can use this sample to establish a 95% confidence interval for our population mean. With that confidence interval we can see if the actual mean has a chance of being under $12000.')
print('The decent sample size (1338 people) allows for the distribution to more closely approach a normal distribution under the Central Limit Theorem.')
print('We will utilize a t-score as we do not now the population standard deviation.')

We can use this sample to establish a 95% confidence interval for our population mean. With that confidence interval we can see if the actual mean has a chance of being under $12000.
The decent sample size (1338 people) allows for the distribution to more closely approach a normal distribution under the Central Limit Theorem.
We will utilize a t-score as we do not now the population standard deviation.


In [6]:
#3 Given the concern, what is the most appropriate interval? One-sided or two-sided?
print('We will go with one-sided due to the hospital being a business that at least needs to meet a certain mean for its operational model.')
marg_err = t.ppf(.975, 1337) * np.std(medical['charges'], ddof=1)/np.sqrt(1338)
lower_end = charg_mean - marg_err
print('Lower end - ', lower_end)
print('Concerns are unwarranted as the lower end of our confidence interval is greater than the minimum mean the hospital requires.')

We will go with one-sided due to the hospital being a business that at least needs to meet a certain mean for its operational model.
Lower end -  12620.954034192644
Concerns are unwarranted as the lower end of our confidence interval is greater than the minimum mean the hospital requires.


In [29]:
#4 Administrator now wants to know if those with insurance are actually charged a different amount to those without.
# State the null and alternative hypothesis here
print('Null Hypothesis - People with or without insurance are charged the same amount')
print('Alternative Hypothesis - Those with insurance are a different amount to those without')
insurance = medical.loc[medical['insuranceclaim'] == 1]
no_insurance = medical.loc[medical['insuranceclaim'] == 0]
ins_mean = insurance['charges'].mean()
ins_std = insurance['charges'].std()
ins_count = insurance['charges'].count()
no_ins_mean = no_insurance['charges'].mean()
no_ins_std = no_insurance['charges'].std()
no_ins_count = no_insurance['charges'].count()
print('Insurance: ', 'Mean-', ins_mean, 'STD-', ins_std, 'Count-', ins_count)
print('No Insurance: ', 'Mean-', no_ins_mean, 'STD-', no_ins_std, 'Count-', no_ins_count)
print('An assumption is being made in our formula that the variances are equal.')
print('I feel an assumption is being made that all patients went in with same condition and were treated the exact same way. Different conditions/treatments can increase or decrease the charge independently of the patience having insurance or not.')
print('Up front with these means, it appears those with insurance are being charged more. What needs to be taken into account is the sample size, which will be done in the pooled standard deviation of those with and without insurance.')

Null Hypothesis - Those with insurance are a different amount to those without
Alternative Hypothesis - People with or without insurance are charged the same amount
Insurance:  Mean- 16423.928276537663 STD- 14045.928418802127 Count- 783
No Insurance:  Mean- 8821.421892306294 STD- 6446.510126811736 Count- 555
An assumption is being made in our formula that the variances are equal.
I feel an assumption is being made that all patients went in with same condition and were treated the exact same way. Different conditions/treatments can increase or decrease the charge independently of the patience having insurance or not.
Up front with these means, it appears those with insurance are being charged more. What needs to be taken into account is the sample size, which will be done in the pooled standard deviation of those with and without insurance.


In [34]:
#5 Perform the hypothesis test manually and also using scipy.stats
print('manually:')
STDp1 = np.sqrt((((ins_count - 1)*ins_std**2) + ((no_ins_count - 1)*no_ins_std**2))/(ins_count + no_ins_count - 2))
t1 = (ins_mean - no_ins_mean)/(STDp1*np.sqrt((1/ins_count) + (1/no_ins_count)))
print(t1)
print('P-value < 0.001')
print('scipy.stats approach')
t2 = ttest_ind_from_stats(mean1=ins_mean, std1=ins_std, nobs1=ins_count, mean2=no_ins_mean, std2=no_ins_std, nobs2=no_ins_count)
print(t2)
print('P-value very low, so we reject the null hypothesis that people with or without insurance are charged the same.')

manually:
11.89329903087671
P-value < 0.001
scipy.stats approach
Ttest_indResult(statistic=11.893299030876712, pvalue=4.461230231620717e-31)
P-value very low, so we reject the null hypothesis that people with or without insurance are charged the same.


In [37]:
#6 Lets assume the variances were not equal. Perform the test. 
new_t = ttest_ind_from_stats(mean1=ins_mean, std1=ins_std, nobs1=ins_count, mean2=no_ins_mean, std2=no_ins_std, nobs2=no_ins_count, equal_var=False)
print(new_t)
print('Produces a worse pvalue when not assuming equal variances.')

Ttest_indResult(statistic=13.298031957975647, pvalue=1.1105103216309438e-37)
Produces a worse pvalue when not assuming equal variances.


In [44]:
#7 Why is it that you can't find a z-test from a sample in scipy.stats? 
print('Scipy.stats contains several statistical tests, but not one for z-test. This is due to z-scores assumption that you contain knowledge about the populations standard deviation and mean. We cant really test hypothesis about the populations mean from samples, as we have assumed we already have knowledge on the population under z-score.') 

Scipy.stats contains several statistical tests, but not one for z-test. This is due to z-scores assumption that you contain knowledge about the populations standard deviation and mean. We cant really test hypothesis about the populations mean from samples, as we have assumed we already have knowledge on the population under z-score.
