In [1]:
from scipy.stats import uniform, norm, chi2, poisson, binom
import math

In [2]:
def get_intervals(xi_i, N, name, frequencies, alphas=(0.9, 0.95, 0.975)):
    """Print normal-approximation intervals for a count and test frequencies against them.

    The count ``xi_i`` out of ``N`` is modelled as a normal variable with mean
    ``xi_i`` and standard deviation ``sqrt(xi_i * (1 - xi_i / N))``, i.e.
    ``sqrt(N * p * (1 - p))`` with ``p = xi_i / N``. For every level in
    ``alphas`` the central interval is printed on the count scale, then
    divided by ``N``, followed by one line per entry of ``frequencies``
    stating whether it lies strictly inside the normalized interval.

    Args:
        xi_i: Observed count (number of hits).
        N: Total number of observations; must be non-zero.
        name: Label used in the heading line.
        frequencies: Mapping ``label -> frequency`` of candidate values to test.
        alphas: Iterable of central probability masses handed to
            ``norm.interval``. Defaults to the three levels the notebook
            has always used.

    Returns:
        None. All results are printed.

    Raises:
        ZeroDivisionError: If ``N`` is zero.
        ValueError: If ``xi_i * (1 - xi_i / N)`` is negative (``math.sqrt``).
    """
    p_xi = xi_i / N
    # Loop-invariant, so computed once instead of once per level.
    scale = math.sqrt(xi_i*(1-p_xi))
    print(f'Analyzing the {name} variable')
    print()
    print(f'Frequency is {p_xi}')
    print()
    for alpha in alphas:
        print(f'For alpha={alpha}')
        # The level is passed positionally on purpose: newer SciPy releases
        # renamed this keyword from `alpha` to `confidence`.
        low, high = norm.interval(alpha, loc=xi_i, scale=scale)
        # SciPy hands back NumPy scalars. Under NumPy >= 2 their repr is
        # `np.float64(...)`, which would leak into the printed tuples, so
        # convert to built-in floats to keep the output stable.
        count_interval = (float(low), float(high))
        print(f'Interval is {count_interval}')
        freq_interval = (count_interval[0] / N, count_interval[1] / N)
        print(f'Normalized interval is {freq_interval}')
        for n, f in frequencies.items():
            inside = freq_interval[0] < f < freq_interval[1]
            print(f'{n} frequency is {"" if inside else "not "}in interval')
        print()
        
def get_frequencies(xi_i, N, eta_i, K):
    """Combine two observed frequencies in four ways, print them and return them.

    The two inputs are ``xi_i / N`` and ``eta_i / K``. The returned dict
    holds, in this order:

    * ``'average'``  - arithmetic mean of the two frequencies,
    * ``'weighted'`` - mean weighted by the sample sizes ``N`` and ``K``,
    * ``'minimum'``  - the smaller of the two frequencies,
    * ``'geom'``     - geometric mean of the two frequencies.

    Each value is also printed on its own line.
    """
    p_xi, p_eta = xi_i / N, eta_i / K
    pooled_size = K + N

    mean_freq = (p_eta + p_xi) / 2
    weighted_freq = p_eta * K / pooled_size + p_xi * N / pooled_size
    lowest_freq = min(p_xi, p_eta)
    geometric_freq = math.sqrt(p_xi * p_eta)

    frequencies = {
        'average': mean_freq,
        'weighted': weighted_freq,
        'minimum': lowest_freq,
        'geom': geometric_freq,
    }

    print(f'Average frequency: {frequencies["average"]}')
    print(f'Weighted frequency: {frequencies["weighted"]}')
    print(f'Minimum frequency: {frequencies["minimum"]}')
    print(f'Geom frequency: {frequencies["geom"]}')
    return frequencies

In [3]:
# i2: merge of TCC with A.
# xi is the count xi_i out of N, where N is the sum of four counts (471 is
# one of them); eta is the count eta_i out of K.
xi_i, N = 471, sum((1544, 845, 471, 419))
eta_i, K = 3435, 24292

frequencies = get_frequencies(xi_i, N, eta_i, K)
# Run the interval check once per variable, each preceded by a separator line.
for obs_count, obs_total, var_name in ((xi_i, N, 'xi'), (eta_i, K, 'eta')):
    print('================================')
    get_intervals(obs_count, obs_total, var_name, frequencies)

Average frequency: 0.142522965855046
Weighted frequency: 0.14167059591599868
Minimum frequency: 0.1414045776387288
Geom frequency: 0.14251857775011723
Analyzing the xi variable

Frequency is 0.14364135407136322

For alpha=0.9
Interval is (437.9656661322124, 504.0343338677876)
Normalized interval is (0.13356683932058933, 0.15371586882213711)
average frequency is in interval
weighted frequency is in interval
minimum frequency is in interval
geom frequency is in interval

For alpha=0.95
Interval is (431.63716334800307, 510.36283665199693)
Normalized interval is (0.13163682932235532, 0.15564587882037112)
average frequency is in interval
weighted frequency is in interval
minimum frequency is in interval
geom frequency is in interval

For alpha=0.975
Interval is (425.98490271556784, 516.0150972844322)
Normalized interval is (0.12991305358815733, 0.1573696545545691)
average frequency is in interval
weighted frequency is in interval
minimum frequency is in interval
geom frequency is in interva

In [4]:
# i3: merge of C (13.7%) with TCCA (14.14%).
# TCCA is a created hedge, so its total K is not observed directly; it is
# back-computed from the stored (frequency, count) pair as K = count / frequency.
# TCCA: frequency=14.14045776387288 (percent), weight=471.

# xi side: the frequency is given in percent, the count is derived from it.
N, p_xi = 1559, 13.742456172885515 / 100
xi_i = N * p_xi

# eta side: the count is given, the total is derived from the frequency.
eta_i, p_eta = 471, 14.14045776387288 / 100
K = eta_i / p_eta

frequencies = get_frequencies(xi_i, N, eta_i, K)
# Run the interval check once per variable, each preceded by a separator line.
for obs_count, obs_total, var_name in ((xi_i, N, 'xi'), (eta_i, K, 'eta')):
    print('================================')
    get_intervals(obs_count, obs_total, var_name, frequencies)

Average frequency: 0.13941456968379196
Weighted frequency: 0.14013565899954644
Minimum frequency: 0.13742456172885514
Geom frequency: 0.1394003662421881
Analyzing the xi variable

Frequency is 0.13742456172885514

For alpha=0.9
Interval is (191.88441604133104, 236.6053674292393)
Normalized interval is (0.12308172934017385, 0.15176739411753642)
average frequency is in interval
weighted frequency is in interval
minimum frequency is in interval
geom frequency is in interval

For alpha=0.95
Interval is (187.60074156943801, 240.88904190113232)
Normalized interval is (0.12033402281554716, 0.15451510064216312)
average frequency is in interval
weighted frequency is in interval
minimum frequency is in interval
geom frequency is in interval

For alpha=0.975
Interval is (183.7748058931414, 244.71497757742895)
Normalized interval is (0.11787992680765966, 0.15696919665005066)
average frequency is in interval
weighted frequency is in interval
minimum frequency is in interval
geom frequency is in int

In [5]:
# difference for 1% (20 of 2000) and 2.5% (250 of 10000)
# NOTE(review): this heading used to say 1.5%, but xi_i / N = 20 / 2000 = 1%.
# If 1.5% was the intended scenario, xi_i should be 30 instead.
xi_i = 20
N = 2000
eta_i = 250
K = 10000

frequencies = get_frequencies(xi_i, N, eta_i, K)
print('================================')
get_intervals(xi_i, N, 'xi', frequencies)
print('================================')
get_intervals(eta_i, K, 'eta', frequencies)

Average frequency: 0.0175
Weighted frequency: 0.0225
Minimum frequency: 0.01
Geom frequency: 0.015811388300841896
Analyzing the xi variable

Frequency is 0.01

For alpha=0.9
Interval is (12.680863412185204, 27.319136587814793)
Normalized interval is (0.006340431706092602, 0.013659568293907397)
average frequency is not in interval
weighted frequency is not in interval
minimum frequency is in interval
geom frequency is not in interval

For alpha=0.95
Interval is (11.278710837855238, 28.721289162144764)
Normalized interval is (0.005639355418927619, 0.014360644581072382)
average frequency is not in interval
weighted frequency is not in interval
minimum frequency is in interval
geom frequency is not in interval

For alpha=0.975
Interval is (10.026387489538468, 29.973612510461543)
Normalized interval is (0.005013193744769234, 0.01498680625523077)
average frequency is not in interval
weighted frequency is not in interval
minimum frequency is in interval
geom frequency is not in interval

Anal

In [6]:
# Scenario: 2% (200 of 10000) against 2.5% (250 of 10000), equal sample sizes.
xi_i, N = 200, 10000
eta_i, K = 250, 10000

frequencies = get_frequencies(xi_i, N, eta_i, K)
# Run the interval check once per variable, each preceded by a separator line.
for obs_count, obs_total, var_name in ((xi_i, N, 'xi'), (eta_i, K, 'eta')):
    print('================================')
    get_intervals(obs_count, obs_total, var_name, frequencies)

Average frequency: 0.0225
Weighted frequency: 0.0225
Minimum frequency: 0.02
Geom frequency: 0.022360679774997897
Analyzing the xi variable

Frequency is 0.02

For alpha=0.9
Interval is (176.97204922267937, 223.0279507773206)
Normalized interval is (0.017697204922267937, 0.02230279507773206)
average frequency is not in interval
weighted frequency is not in interval
minimum frequency is in interval
geom frequency is not in interval

For alpha=0.95
Interval is (172.56050421643926, 227.43949578356074)
Normalized interval is (0.017256050421643927, 0.022743949578356074)
average frequency is in interval
weighted frequency is in interval
minimum frequency is in interval
geom frequency is in interval

For alpha=0.975
Interval is (168.62036181353076, 231.37963818646926)
Normalized interval is (0.016862036181353075, 0.023137963818646926)
average frequency is in interval
weighted frequency is in interval
minimum frequency is in interval
geom frequency is in interval

Analyzing the eta variable

F

In [7]:
# Scenario: 80% (1600 of 2000) against 80.5% (8050 of 10000).
xi_i, N = 1600, 2000
eta_i, K = 8050, 10000

frequencies = get_frequencies(xi_i, N, eta_i, K)
# Run the interval check once per variable, each preceded by a separator line.
for obs_count, obs_total, var_name in ((xi_i, N, 'xi'), (eta_i, K, 'eta')):
    print('================================')
    get_intervals(obs_count, obs_total, var_name, frequencies)

Average frequency: 0.8025
Weighted frequency: 0.8041666666666667
Minimum frequency: 0.8
Geom frequency: 0.8024961059095552
Analyzing the xi variable

Frequency is 0.8

For alpha=0.9
Interval is (1570.5759638167954, 1629.4240361832046)
Normalized interval is (0.7852879819083977, 0.8147120180916023)
average frequency is in interval
weighted frequency is in interval
minimum frequency is in interval
geom frequency is in interval

For alpha=0.95
Interval is (1564.9390983769367, 1635.0609016230633)
Normalized interval is (0.7824695491884683, 0.8175304508115316)
average frequency is in interval
weighted frequency is in interval
minimum frequency is in interval
geom frequency is in interval

For alpha=0.975
Interval is (1559.9045690889752, 1640.0954309110248)
Normalized interval is (0.7799522845444876, 0.8200477154555124)
average frequency is in interval
weighted frequency is in interval
minimum frequency is in interval
geom frequency is in interval

Analyzing the eta variable

Frequency is 0.