## Доверительные интервалы (тест)

In [43]:
import pandas as pd
import numpy as np
from statsmodels.stats.weightstats import _zconfint_generic, _tconfint_generic
from sklearn import cross_validation, datasets, linear_model, metrics
from scipy import stats

In [9]:
# Load the tab-separated file 'water.txt'; its first row holds the column names.
data = pd.read_csv(
    'water.txt',
    sep='\t',
    header=0,
)

In [10]:
# Preview the first three rows of the loaded frame.
data.head(n=3)

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5


#### Найдем доверительный интервал для mortality

In [11]:
# Point estimate of mortality over every row of the dataset.
mean = data.loc[:, "mortality"].mean()
# Sample standard deviation (ddof=1 divides by n - 1).
std = data.loc[:, "mortality"].std(ddof=1)
# Standard error of the mean: sample std scaled by sqrt of the number of rows.
std_mean = std / np.sqrt(data.shape[0])

In [12]:
# Two-sided Student-t confidence interval for the mean mortality:
# alpha = 0.05 (95% confidence) with n - 1 degrees of freedom.
_tconfint_generic(
    mean,
    std_mean,
    len(data) - 1,
    0.05,
    'two-sided',
)

(1476.0833413552848, 1572.2117406119285)

#### Найдем доверительный интервал для mortality только для южных городов

In [21]:
# Keep only the rows whose location is labelled "South".
data_south = data.loc[data["location"] == "South"]

In [22]:
# Preview the first three rows of the southern subset.
data_south.head(n=3)

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
2,South,Birmingham,1466,5
7,South,Bournemouth,1299,78


In [23]:
# Mortality statistics restricted to the southern subset.
mean_south = data_south.loc[:, "mortality"].mean()
# Sample standard deviation (ddof=1 divides by n - 1).
std_south = data_south.loc[:, "mortality"].std(ddof=1)
# Standard error of the southern mean.
std_mean_south = std_south / np.sqrt(data_south.shape[0])

In [24]:
# Two-sided Student-t confidence interval for the southern mean mortality:
# alpha = 0.05 (95% confidence) with n - 1 degrees of freedom.
_tconfint_generic(
    mean_south,
    std_mean_south,
    len(data_south) - 1,
    0.05,
    'two-sided',
)

(1320.1517462936238, 1433.463638321761)

#### Найдем доверительный интервал для mortality только для северных городов

In [25]:
# Keep only the rows whose location is labelled "North".
data_north = data.loc[data["location"] == "North"]

In [26]:
# Preview the first three rows of the northern subset.
data_north.head(n=3)

Unnamed: 0,location,town,mortality,hardness
1,North,Birkenhead,1668,17
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [27]:
# Mortality statistics restricted to the northern subset.
mean_north = data_north.loc[:, "mortality"].mean()
# Sample standard deviation (ddof=1 divides by n - 1).
std_north = data_north.loc[:, "mortality"].std(ddof=1)
# Standard error of the northern mean.
std_mean_north = std_north / np.sqrt(data_north.shape[0])

In [28]:
# Two-sided Student-t confidence interval for the northern mean mortality:
# alpha = 0.05 (95% confidence) with n - 1 degrees of freedom.
_tconfint_generic(
    mean_north,
    std_mean_north,
    len(data_north) - 1,
    0.05,
    'two-sided',
)

(1586.5605251961385, 1680.6394748038613)

In [30]:
# Show the southern and northern mean mortality side by side.
# A single pre-formatted argument is used so that the line prints the same
# text under the Python 2 print statement and the Python 3 print function
# (a multi-argument print(...) would print a tuple on Python 2).
print("на Юге {} на Севере {}".format(mean_south, mean_north))

на Юге 1376.80769231 на Севере 1633.6


#### Найдем доверительные интервалы для hardness отдельно для Юга и Севера

In [33]:
# NOTE: this cell rebinds mean_south / std_south / std_mean_south and the
# *_north counterparts, which earlier cells assigned from the "mortality"
# column. After it runs, those names describe "hardness" instead.

# Water hardness statistics for the southern subset.
mean_south = data_south.loc[:, "hardness"].mean()
std_south = data_south.loc[:, "hardness"].std(ddof=1)  # divides by n - 1
std_mean_south = std_south / np.sqrt(data_south.shape[0])  # standard error

# Water hardness statistics for the northern subset.
mean_north = data_north.loc[:, "hardness"].mean()
std_north = data_north.loc[:, "hardness"].std(ddof=1)  # divides by n - 1
std_mean_north = std_north / np.sqrt(data_north.shape[0])  # standard error

In [34]:
# Two-sided Student-t confidence interval built from the current
# mean_south / std_mean_south values: alpha = 0.05, n - 1 degrees of freedom.
_tconfint_generic(
    mean_south,
    std_mean_south,
    len(data_south) - 1,
    0.05,
    'two-sided',
)

(53.467198692036106, 86.071262846425441)

In [35]:
# Two-sided Student-t confidence interval built from the current
# mean_north / std_mean_north values: alpha = 0.05, n - 1 degrees of freedom.
_tconfint_generic(
    mean_north,
    std_mean_north,
    len(data_north) - 1,
    0.05,
    'two-sided',
)

(21.422487285724259, 39.377512714275738)

#### Определение размера выборки для стандартного отклонения 1, уровня доверия 95% и погрешности ±0.1

In [42]:
def required_sample_size(sigma=1.0, margin=0.1, alpha=0.05):
    """Return the sample size needed to estimate a normal mean.

    Uses the z-interval formula ``n = ceil((z * sigma / margin) ** 2)``,
    where ``z`` is the ``1 - alpha / 2`` quantile of the standard normal
    distribution.

    Parameters
    ----------
    sigma : float
        Known standard deviation of the population.
    margin : float
        Desired half-width of the confidence interval; must be positive.
    alpha : float
        Significance level (``1 - alpha`` is the confidence level);
        must lie strictly between 0 and 1.

    Returns
    -------
    float
        The required sample size, rounded up with ``np.ceil``.

    Raises
    ------
    ValueError
        If ``margin`` is not positive or ``alpha`` is outside (0, 1).
    """
    if margin <= 0:
        raise ValueError("margin must be positive")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    z_quantile = stats.norm.ppf(1 - alpha / 2)
    return np.ceil((z_quantile * sigma / margin) ** 2)


# Standard deviation 1, 95% confidence level, margin of error +-0.1.
required_sample_size(sigma=1.0, margin=0.1, alpha=0.05)

385.0