In [1]:
import numpy as np
import pandas as pd

import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint

In [2]:
import scipy
import statsmodels

In [3]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for p1 - p2 of two independent samples.

    Each proportion is computed as sum(sample) / len(sample), so the samples
    are expected to be 0/1 indicator sequences (assumed, not checked).

    Returns a (left_boundary, right_boundary) tuple for the level 1 - alpha.
    """
    size1, size2 = len(sample1), len(sample2)
    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2

    # Two-sided critical value of the standard normal distribution.
    z_crit = scipy.stats.norm.ppf(1 - alpha / 2.)
    half_width = z_crit * np.sqrt(share1 * (1 - share1) / size1 + share2 * (1 - share2) / size2)
    delta = share1 - share2

    return (delta - half_width, delta + half_width)

In [4]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Z statistic for the difference of proportions of two independent samples.

    The standard error is built from the pooled proportion of both samples.
    Proportions are computed as sum(sample) / len(sample).
    """
    size1, size2 = len(sample1), len(sample2)
    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2

    # Pooled proportion: size-weighted average of the two sample proportions.
    pooled = float(share1 * size1 + share2 * size2) / (size1 + size2)
    std_err = np.sqrt(pooled * (1 - pooled) * (1. / size1 + 1. / size2))

    return (share1 - share2) / std_err

In [5]:
# Two independent 0/1 samples: 10 ones out of 34 and 4 ones out of 16.
sample1_3 = np.zeros(34)
sample1_3[:10] = 1
sample2_3 = np.zeros(16)
sample2_3[:4] = 1

In [6]:
# Z statistic for the difference between the proportions of the two independent samples.
z3 = proportions_diff_z_stat_ind(sample1_3, sample2_3)

In [7]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Convert a z statistic into a p-value under the standard normal distribution.

    Parameters
    ----------
    z_stat : float
        Value of the z statistic.
    alternative : {'two-sided', 'less', 'greater'}
        'two-sided' returns twice the upper tail beyond |z_stat|,
        'less' the lower tail up to z_stat, 'greater' the upper tail beyond z_stat.

    Returns
    -------
    float
        The p-value.

    Raises
    ------
    ValueError
        If `alternative` is not one of the three recognised values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper tails use the survival function instead of 1 - cdf: for large z
    # the cdf rounds to exactly 1.0 and the subtraction would collapse to 0.
    if alternative == 'two-sided':
        return 2 * scipy.stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    # Only 'greater' is left after the validation above.
    return scipy.stats.norm.sf(z_stat)

In [9]:
# Two-sided p-value for the independent-samples statistic z3.
ans3 = proportions_diff_z_test(z3, 'two-sided')

In [10]:
# Show the two-sided p-value.
print(ans3)

0.7458609174504707


In [11]:
# One-sided ('greater') p-value for the same statistic z3.
ans3_alt = proportions_diff_z_test(z3, 'greater')

In [12]:
# Show the one-sided p-value.
print(ans3_alt)

0.37293045872523534


In [15]:
# Load the tab-separated table 'banknotes.txt' from the working directory.
data_bank = pd.read_csv('banknotes.txt', sep = '\t')
# NOTE(review): the results of describe() and head() are neither assigned nor
# printed in this cell, so only the report printed by info() appears as output.
data_bank.describe()
data_bank.head()
data_bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
X1      200 non-null float64
X2      200 non-null float64
X3      200 non-null float64
X4      200 non-null float64
X5      200 non-null float64
X6      200 non-null float64
real    200 non-null int64
dtypes: float64(6), int64(1)
memory usage: 11.0 KB


In [16]:
# Inspect the column labels of the loaded table.
data_bank.columns

Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'real'], dtype='object')

In [18]:
# Two feature subsets of the same table: each drops three of the X columns
# and keeps the remaining ones (the 'real' column stays in both).
data_b1 = data_bank.drop(columns=['X4', 'X5', 'X6'])
data_b2 = data_bank.drop(columns=['X1', 'X2', 'X3'])

In [19]:
# Check which columns remain in the first subset.
data_b1.columns

Index(['X1', 'X2', 'X3', 'real'], dtype='object')

In [20]:
# Separate each subset into its feature columns and the 'real' target column.
X1, y1 = data_b1.drop(columns=['real']), data_b1['real']
X2, y2 = data_b2.drop(columns=['real']), data_b2['real']

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 50 rows of each feature subset as a test set, using the same seed for both splits.
# NOTE(review): presumably the identical random_state and input length give both
# splits the same test rows, which a paired comparison of the two models needs — confirm.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state = 1, test_size = 50)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state = 1, test_size = 50)

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
# Fit one logistic regression with default constructor arguments per feature subset.
clf_1 = LogisticRegression()
clf_1.fit(X1_train, y1_train)
clf_2 = LogisticRegression()
# fit() is the last expression of the cell, so its return value is what the cell displays.
clf_2.fit(X2_train, y2_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [41]:
# Predictions of each classifier on its own held-out test features.
ans_1 = clf_1.predict(X1_test)
ans_2 = clf_2.predict(X2_test)

In [58]:
# Per-observation correctness indicators: 1.0 where the prediction equals the
# true label, 0.0 otherwise. The element-wise comparison follows the length of
# the test set instead of relying on a hard-coded 50.
score_1 = (ans_1 == y1_test.values).astype(float)
score_2 = (ans_2 == y2_test.values).astype(float)

In [59]:
# Show the correctness indicators of both classifiers.
print(score_1)
print(score_2)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0.
 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1.]


In [29]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for the proportion difference of paired samples.

    The samples are zipped element by element and only discordant pairs are
    counted: (1, 0) pairs on one side, (0, 1) pairs on the other. The interval
    is centred on (count of (1, 0) - count of (0, 1)) / number of pairs.

    Returns a (left_boundary, right_boundary) tuple for the level 1 - alpha.
    """
    pairs = list(zip(sample1, sample2))
    total = len(pairs)

    only_first = sum(1 for a, b in pairs if a == 1 and b == 0)
    only_second = sum(1 for a, b in pairs if a == 0 and b == 1)

    # Two-sided critical value of the standard normal distribution.
    z_crit = scipy.stats.norm.ppf(1 - alpha / 2.)
    centre = float(only_first - only_second) / total
    half_width = z_crit * np.sqrt(float((only_first + only_second)) / total**2
                                  - float((only_first - only_second)**2) / total**3)

    return (centre - half_width, centre + half_width)

In [30]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z statistic for the proportion difference of paired samples.

    The samples are zipped element by element; the statistic depends only on
    the counts of discordant pairs, (1, 0) and (0, 1), and on the number of pairs.
    """
    pairs = list(zip(sample1, sample2))
    total = len(pairs)

    only_first = sum(1 for a, b in pairs if a == 1 and b == 0)
    only_second = sum(1 for a, b in pairs if a == 0 and b == 1)

    gap = only_first - only_second
    return float(gap) / np.sqrt(only_first + only_second - float(gap**2) / total)

In [60]:
# Paired-samples z statistic comparing the two classifiers' correctness indicators.
z4 = proportions_diff_z_stat_rel(score_1, score_2)

In [61]:
# Two-sided p-value for the paired statistic z4.
ans4 = proportions_diff_z_test(z4, 'two-sided')

In [62]:
# Show the two-sided p-value of the paired test.
print(ans4)

0.0032969384555543435


In [64]:
# Confidence interval for the paired difference; note the argument order is
# (score_2, score_1), the reverse of the order used for z4.
proportions_diff_confint_rel(score_2, score_1)

(0.059945206279614305, 0.3000547937203857)

In [67]:
# Inputs for the z tests of a sample mean against a reference mean.
std_dev6_0 = 100  # standard deviation passed to z_test
mean6_0 = 525  # reference mean the sample means are compared with
# NOTE(review): n6_0 is not used by any cell visible here — presumably a population size; confirm.
n6_0 = 200000 
n6_1 = 100  # sample size passed to z_test
mean6_1 = 541.4  # first sample mean under test
# NOTE(review): alpha is not referenced by the visible cells — presumably the significance level; confirm.
alpha = 0.05

In [66]:
# Second sample mean, tested against the same reference mean as mean6_1.
mean6_2 = 541.5

In [72]:
def z_test(mean_val, exp_val, st_dev, num):
    """Z statistic of a sample mean: (mean_val - exp_val) / (st_dev / sqrt(num))."""
    deviation = mean_val - exp_val
    return deviation / (st_dev / np.sqrt(num))

In [74]:
# Z statistics of the two sample means against the reference mean, with the same
# standard deviation and sample size.
z6_1 = z_test(mean6_1, mean6_0, std_dev6_0, n6_1)
z6_2 = z_test(mean6_2, mean6_0, std_dev6_0, n6_1)

In [75]:
# Upper-tail (one-sided) standard normal p-values of the two z statistics.
p6_1 = 1 - scipy.stats.norm.cdf(z6_1)
p6_2 = 1 - scipy.stats.norm.cdf(z6_2)

In [76]:
# Display the p-value for the first sample mean (mean6_1).
p6_1

0.05050258347410397

In [77]:
# Display the p-value for the second sample mean (mean6_2).
p6_2

0.0494714680336481