In [1]:
path = '/Volumes/Seagate/ML/Machine-Learning-Implementations/ML'
import sys
sys.path.insert(1, path)
path = '/Volumes/Seagate/ML/Machine-Learning-Implementations'
import sys
sys.path.insert(1, path)

import pandas as pd
import numpy as np
import ML

In this notebook, we will test the hypothesis test functionality of the main code base, which includes:

1. Pearson correlation
2. Mutual Information
3. Kendall Information
4. chi-square test 
5. t-tests
6. kolmogorov-Smirnov test

Where appropriate we can use downloaded real datasets, otherwise opting for generated data.

In [2]:
# Pearson correlation

n_tests = 100
res = []
for _ in range(0,n_tests):
    x = np.random.uniform(0,1,(100))
    y = np.random.uniform(0,1,(100))
    z = np.random.uniform(0,1,(100))

    X = np.array([x,y,z])
    is_true = np.isclose(ML.pearson_correlation(X), np.corrcoef(X)).all()
    res.append(is_true)
np.array(res).all()

True

In [3]:
# Mutual information

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data'
source = pd.read_csv(url)
source.columns = ['target','f1','f2','f3','f4']
data = source.copy()

mapper = {x:i for i,x in enumerate(data['target'].unique())}
data['target'] = data['target'].map(lambda x:mapper[x])

X = data[['f1','f2','f3','f4']].values
Y = data['target'].values

my_result = ML.mutual_information().compute(X,Y).values

from sklearn.metrics import mutual_info_score
sklearn_result = np.array([mutual_info_score(Y, X[:,i]) for i in range(4)])

np.isclose(my_result, sklearn_result).all()

True

In [4]:
# Kendall information
from scipy.stats import kendalltau

def get(n):
    if n in [1,2]:
        x = np.random.choice([1,2,3],p=[2/3,1/6,1/6])
    elif n in [3,4]:
        x = np.random.choice([1,2,3],p=[1/6,2/3,1/6])
    else:
        x = np.random.choice([1,2,3],p=[1/6,1/6,2/3])
    return x

x = np.random.randint(1,7,100)
y = np.array([get(n) for n in iter(x)])


my_result = ML.kendall_correlation(x,y)
scipy_result = kendalltau(x,y).correlation

np.isclose(my_result , scipy_result)

True

In [5]:
# chi2 test

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data'
source = pd.read_csv(url)
source.columns = ['target','f1','f2','f3','f4']
data = source.copy()

mapper = {x:i for i,x in enumerate(data['target'].unique())}
data['target'] = data['target'].map(lambda x:mapper[x])

X = data[['f1','f2','f3','f4']].values
Y = data['target'].values

ML.chi2_test().compute(X,Y)


Unnamed: 0,f0,f1,f2,f3
chi_2 statistic,109.715,109.715,109.715,109.715
dof,8,8,8,8
p-value,0,0,0,0
Reject H_0?,True,True,True,True


In [6]:
# t-tests for the mean
from scipy import stats

rvs1 = stats.norm.rvs(loc=5,scale=10,size=300)
rvs2 = stats.norm.rvs(loc=5,scale=10,size=500)

stat, p = ML.TTest().one_sample(rvs1, 50)
res = stats.ttest_1samp(rvs1,50)
print(stat == res.statistic and p == res.pvalue)

stat, p = ML.TTest().one_sample(rvs1, 5)
res = stats.ttest_1samp(rvs1,5)
print(stat == res.statistic and p == res.pvalue)

stat, p = ML.TTest().two_sample(rvs1, rvs2, True)
res = stats.ttest_ind(rvs1,rvs2, equal_var = True)
print(np.isclose(stat,res.statistic) and np.isclose(p,res.pvalue))

stat, p = ML.TTest().two_sample(rvs1, rvs2, False)
res = stats.ttest_ind(rvs1,rvs2, equal_var = False)
print(np.isclose(stat,res.statistic) and np.isclose(p,res.pvalue))

True
True
True
True


In [7]:
# Kolmogorov-Smirnov test

# KS smaple test

rvs = np.random.normal(0,1,100)
KS_stat, reject = ML.KolmogorobSmirnov().one_sample(rvs, 'norm')

print(KS_stat)
print(reject)

rvs = np.random.uniform(0,1,100)
KS_stat, reject = ML.KolmogorobSmirnov().one_sample(rvs, 'norm')

print(KS_stat)
print(reject)

rvs1 = np.random.normal(0,1,100)
rvs2 = np.random.normal(0,1,100)
KS_stat, reject = ML.KolmogorobSmirnov().two_sample(rvs1, rvs2)

print(KS_stat)
print(reject)

0.5983321412415565
False
5.0132710800104245
True
0.08000000000000002
False
