In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import ultimate_sleuthbuilder as usb

In [7]:
# Simulation parameters, in order:
#   N           - sequence length
#   sample_size - sample size
#   trials      - number of trials
N, sample_size, trials = 32, 100, 20

In [None]:
# Generate the sampling distribution of mean p-values:
# for each trial, draw `sample_size` random binary sequences of length N,
# score each one with usb.analyze_sequence, and record the sample's mean p-value.

# NOTE(review): `expecations` (sic) is not referenced again in the visible cells.
# The assignment is kept unchanged in case later cells, or a side effect of the
# call itself, depend on it - confirm before removing or renaming.
expecations = usb.get_expected_counts(N)

# NOTE(review): no random seed is set, so the results differ between runs.
mean_p_values = []
for trial in range(trials):
    # Draw a sample of `sample_size` random sequences of length N
    sequences = [''.join(np.random.choice(['0', '1'], N)) for _ in range(sample_size)]

    # 'p_value' entry of the analysis result for every sequence in this sample
    p_values = [usb.analyze_sequence(seq)['p_value'] for seq in sequences]
    mean_p_values.append(np.mean(p_values))

mean_p_values = np.array(mean_p_values)
sample_df = pd.DataFrame(mean_p_values, columns=['Mean P-value'])

# Save to csv. Create the output directory first: to_csv raises OSError when
# the target directory does not exist (e.g. on a fresh checkout).
os.makedirs('data', exist_ok=True)
sample_df.to_csv('data/mean_p_values.csv', index=False)

In [None]:
# Visualization: histogram of the per-trial mean p-values in `sample_df`,
# using 40 bins spanning the range 0 to 1.
fig, ax = plt.subplots()
sns.histplot(sample_df, binrange=(0, 1), bins=40, ax=ax)
ax.set_title('Sampling Distribution of Mean P-values')
ax.set_xlabel('Mean P-value')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Calculate margin of error for various confidence levels
# z critical value for each (two-sided) confidence level
z_scores = {0.90 : 1.645, 0.95 : 1.960, 0.99 : 2.576, 0.999 : 3.291}
confidence_levels = z_scores.keys()

# The standard deviation does not depend on the confidence level, so the
# database is read once here rather than once per loop iteration.
db_path = usb.get_db_path()  # Get the path to the database

with pd.HDFStore(db_path, mode='r') as store:  # Open the store read-only
    key = usb.get_db_key('summary/p_value')
    summary_df = store[key]

    # Get standard deviation for sequences of length N
    std_dev = summary_df.loc[N, 'std_dev']

# margin of error = z * std_dev / sqrt(sample_size).
# Each level maps to a one-element list, matching the structure printed below.
margin_of_errors = {level : [] for level in confidence_levels}
for level in confidence_levels:
    margin_of_error = z_scores[level] * std_dev / np.sqrt(sample_size)
    margin_of_errors[level].append(margin_of_error)

print('Margin of Error for Various Confidence Levels')
for level in confidence_levels:
    print(f'{level} confidence level: {margin_of_errors[level]}')

In [None]:
# Generate sampling distribution of p-values (with parallel processing)