In [1]:
import numpy as np
import pandas as pd

import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint

In [3]:
def proportions_diff_z_stat_ind(n1, n2, s1,s2): #вычисление значения z-статистики
    p1 = float(s1) / n1
    p2 = float(s2) / n2 
    P = float(p1*n1 + p2*n2) / (n1 + n2)
    
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [5]:
z = proportions_diff_z_stat_ind(34, 16, 10,4)
z

0.32410186177608225

In [7]:
import scipy.stats as sts
norm_rv = sts.norm(0,1)
2*(1-norm_rv.cdf(abs(z)))

0.7458609174504707

In [8]:
df = pd.read_csv('banknotes.csv', sep='\t')
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [10]:
from sklearn.model_selection import train_test_split
y = df['real']
x = df.drop(['real'], axis=1)

In [11]:
X_train, X_test, y_train, y_test =train_test_split(x, y, test_size=0.25, random_state=42)

In [13]:
X1_train = X_train.drop(['X4','X5','X6'],axis=1)
X2_train = X_train.drop(['X1','X2','X3'],axis=1)

In [14]:
X1_test = X_test.drop(['X4','X5','X6'],axis=1)
X2_test = X_test.drop(['X1','X2','X3'],axis=1)

In [18]:
from sklearn.linear_model import LogisticRegression
model_1 = LogisticRegression(random_state=1) #penalty - вид регуляризации, class_weight - балансировка классов
model_1.fit(X1_train,y_train)
ypred_1 = model_1.predict(X1_test)

In [19]:
from sklearn.linear_model import LogisticRegression
model_2 = LogisticRegression(random_state=1) #penalty - вид регуляризации, class_weight - балансировка классов
model_2.fit(X2_train,y_train)
ypred_2 = model_2.predict(X2_test)

In [21]:
from sklearn.metrics import accuracy_score #точность
score_1 = accuracy_score(y_test,ypred_1)
score_2 = accuracy_score(y_test,ypred_2)
print score_1
print score_2

0.8
0.96


In [22]:
samples_1 = list()
samples_2 = list()
for i in range(0,len(X1_test)):
    if ypred_1[i]!=y_test[i]:
        samples_1.append(1)
    else:
        samples_1.append(0)
        
    if ypred_2[i]!=y_test[i]:
        samples_2.append(1)
    else:
        samples_2.append(0)

KeyError: 0L

In [34]:
#выборка ошибок классификаторов
sample_1 = (ypred_1 != y_test).values.astype(int)
sample_2 = (ypred_2 != y_test).values.astype(int)

In [35]:
def proportions_diff_z_stat_ind(sample1, sample2): #вычисление значения z-статистики
    n1 = len(sample1)
    n2 = len(sample2)
    
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2 
    P = float(p1*n1 + p2*n2) / (n1 + n2)
    
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [38]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'): # вычисление уровня значимости
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")
    
    if alternative == 'two-sided':
        return 2 * (1 - scipy.stats.norm.cdf(np.abs(z_stat)))
    
    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    if alternative == 'greater':
        return 1 - scipy.stats.norm.cdf(z_stat)

In [36]:
proportions_diff_z_stat_ind(sample_1, sample_2)

2.461829819586655

In [37]:
norm_rv = sts.norm(0,1)
2*(1-norm_rv.cdf(abs(z)))

0.7458609174504707

In [39]:
proportions_diff_z_test(z, alternative = 'two-sided')

0.7458609174504707

In [42]:
print float(sum(sample_1)) / len(sample_1)
print float(sum(sample_2)) / len(sample_2)

0.2
0.04


In [43]:
def proportions_confint_diff_ind(sample1, sample2, alpha = 0.05):    
    z = scipy.stats.norm.ppf(1 - alpha / 2.)   
    p1 = float(sum(sample1)) / len(sample1)
    p2 = float(sum(sample2)) / len(sample2)
    
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    
    return (left_boundary, right_boundary)
print "confidence interval: [%f, %f]" % proportions_confint_diff_ind(sample_1, sample_2)

confidence interval: [0.036538, 0.283462]


In [53]:
z = (541.4-525)/10
z

1.6399999999999977

In [54]:
norm_rv = sts.norm(0,1)
1-norm_rv.cdf(abs(z))

0.05050258347410397

In [55]:
z = (541.5-525)/10
z

1.65

In [56]:
norm_rv = sts.norm(0,1)
1-norm_rv.cdf(abs(z))

0.0494714680336481