## Lib imports


In [0]:
import numpy as np
import pandas as pd

from sklearn import datasets, linear_model, metrics
from sklearn.model_selection import cross_validate, train_test_split, cross_val_score

from statsmodels.stats.weightstats import _tconfint_generic

In [0]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Data import

In [0]:
water_data = pd.read_csv('https://raw.githubusercontent.com/OzmundSedler/100-Days-Of-ML-Code/master/week_10/datasets/water.txt',
                       sep='\t', skipinitialspace=True, low_memory=False)

In [0]:
water_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 4 columns):
location     61 non-null object
town         61 non-null object
mortality    61 non-null int64
hardness     61 non-null int64
dtypes: int64(2), object(2)
memory usage: 2.0+ KB


In [0]:
water_data.describe()

Unnamed: 0,mortality,hardness
count,61.0,61.0
mean,1524.147541,47.180328
std,187.668754,38.093966
min,1096.0,5.0
25%,1379.0,14.0
50%,1555.0,39.0
75%,1668.0,75.0
max,1987.0,138.0


In [0]:
water_data.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


## Task 1


Calculate 95% confidence interval for the mean mortality.

In [0]:
mortality_mean = water_data['mortality'].mean()

In [0]:
print(mortality_mean)

1524.1475409836066


In [0]:
mortality_mean_std = water_data['mortality'].std() / np.sqrt(water_data['mortality'].shape[0])

In [0]:
mortality_interval = _tconfint_generic(
                          mortality_mean,
                          mortality_mean_std,
                          water_data['mortality'].shape[0] - 1,
                          0.05,
                          'two-sided',
                      )

In [0]:
print(f'Mortality 95%% interval: {mortality_interval}')

Mortality 95%% interval: (1476.0833413552848, 1572.2117406119285)


## Task 2


Calculate 95% confidence interval for the South cities mean mortality.

In [0]:
south_water_data = water_data[water_data.location == 'South']

In [0]:
south_mortality_mean = south_water_data['mortality'].mean()

In [0]:
print(south_mortality_mean)

1376.8076923076924


In [0]:
south_mortality_mean_std = south_water_data['mortality'].std() / np.sqrt(south_water_data['mortality'].shape[0])

In [0]:
south_mortality_interval = _tconfint_generic(
                               south_mortality_mean,
                               south_mortality_mean_std,
                               south_water_data['mortality'].shape[0] - 1,
                               0.05,
                               'two-sided',
                           )

In [0]:
print(f'South mortality 95%% interval: {south_mortality_interval}')

South mortality 95%% interval: (1320.1517462936238, 1433.463638321761)


## Task 3

Now for the North. Do they intersect ? Make the conclusion.

In [0]:
north_water_data = water_data[water_data.location == 'North']

In [0]:
north_mortality_mean = north_water_data['mortality'].mean()

In [0]:
print(north_mortality_mean)

1633.6


In [0]:
north_mortality_mean_std = south_water_data['mortality'].std() / np.sqrt(south_water_data['mortality'].shape[0])

In [0]:
north_mortality_interval = _tconfint_generic(
                               north_mortality_mean,
                               north_mortality_mean_std,
                               north_water_data['mortality'].shape[0] - 1,
                               0.05,
                               'two-sided',
                           )

In [0]:
print(f'North mortality 95%% interval: {north_mortality_interval}')

North mortality 95%% interval: (1577.6948871516054, 1689.5051128483944)


## Task 4

Do the confidence intervals for water hardness (South/North) intersects?

In [0]:
south_mean_hardness = south_water_data['hardness'].mean()
north_mean_hardness = north_water_data['hardness'].mean()

In [0]:
print('South mean hardness: %f' % south_mean_hardness)
print('North mean hardness: %f' % north_mean_hardness)

South mean hardness: 69.769231
North mean hardness: 30.400000


In [0]:
south_mean_hardness_std = south_water_data['hardness'].std() / np.sqrt(south_water_data['hardness'].shape[0])
north_mean_hardness_std = north_water_data['hardness'].std() / np.sqrt(north_water_data['hardness'].shape[0])

In [0]:
print('Hardness north 95%% interval: %s' %  str(_tconfint_generic(north_mean_hardness, north_mean_hardness_std,
                                                                   north_water_data['hardness'].shape[0] - 1,
                                                                   0.05, 'two-sided')))
print('Hardness south 95%% interval: %s' %  str(_tconfint_generic(south_mean_hardness, south_mean_hardness_std,
                                                                   south_water_data['hardness'].shape[0] - 1,
                                                                   0.05, 'two-sided')))

Hardness north 95% interval: (21.42248728572426, 39.37751271427574)
Hardness south 95% interval: (53.467198692036106, 86.07126284642544)
