# Calculate the minimum sample size

In [5]:
import pandas as pd
import numpy as np
import math

## Parameters

In [33]:
## population size (number of individuals the sample is drawn from)
N = 1000

## confidence level (not the interval itself): the central area under the
## standard normal curve; converted to a left-tail area via (1 + ci) / 2
ci = 0.975

## margin of error, expressed as a proportion (0.05 = 5 %)
me = 0.05

## assumed population proportion, entering the formulas as p*(1-p)
## (it is not a standard deviation). It cannot be evaluated beforehand,
## so 0.5 (50%) is used as the worst case: p*(1-p) is largest at p = 0.5,
## which yields the largest, i.e. most conservative, sample size.
p = 0.5

In [34]:
## left-tail area (percentile) needed to read the z value from the z-score
## table: for a two-sided confidence level ci the upper bound of the central
## area sits at the (1 + ci)/2 quantile
# area_left = "{:.3f}".format((1 + ci)/2)
area_left = (1 + ci)/2
area_left

0.9875

## Evaluate z score

In [8]:
## read z_score table to assess the z_score
df = pd.read_excel("zScore.xlsx", sheet_name="zscore_left")

## unpivot z-score table
zScoreTable = pd.melt(df, id_vars="Z")
zScoreTable.sort_values(by=["Z", "variable"], inplace=True)
zScoreTable["Z"] = np.arange(0, 3.5, 0.01)
zScoreTable.set_index(np.arange(0, 350), inplace=True)
zScoreTable.drop(["variable"], axis=1, inplace=True)
zScoreTable

Unnamed: 0,Z,value
0,0.00,0.5000
1,0.01,0.5040
2,0.02,0.5080
3,0.03,0.5120
4,0.04,0.5160
...,...,...
345,3.45,0.9997
346,3.46,0.9997
347,3.47,0.9997
348,3.48,0.9997


In [9]:
def evaluateClosestValue(lst, a):
    """
    Find the value in a sequence that is closest to the target.

    If the target lies exactly between the closest value and the element
    that follows it (their distances are equal after rounding to 6
    decimals), the following element is returned.

    :param lst: sequence of numbers, the candidate values to search
    :param a: float, target value
    :return: the element of ``lst`` closest to ``a``
    :raises ValueError: if ``lst`` is empty
    """
    ## work on a plain list so any iterable of numbers is accepted
    candidates = list(lst)
    if not candidates:
        raise ValueError("lst must contain at least one value")

    ## first element with the smallest distance to the target (single scan)
    best_pos, best_val = min(enumerate(candidates),
                             key=lambda pair: abs(pair[1] - a))

    ## the closest value can be the last element; then there is no
    ## successor to compare against and the closest value wins outright
    next_pos = best_pos + 1
    if next_pos < len(candidates):
        successor = candidates[next_pos]
        ## in case target is exactly between two available values
        if np.round(abs(best_val - a), 6) == np.round(abs(successor - a), 6):
            return successor

    return best_val

In [46]:
def evaluateZScore(zScoreTable, areaVal):
    """
    Look up the z-score whose table value is closest to the desired value.

    The table is searched by row position, so it works with any index.
    If the desired value lies exactly between the closest table value and
    the value in the following row (distances equal after rounding to 6
    decimals), the following row's value is used. If the chosen value
    occurs in several rows, the last of them (highest position) is taken.

    When a row with a larger value exists, the first such row is printed
    ("zScore: ... ; value: ..."); note that this is the row *after* the one
    whose z-score is returned.

    :param zScoreTable: DataFrame with numeric columns "Z" and "value"
    :param areaVal: float, value to look up in the "value" column
    :return: z-score rounded to 2 decimals, or None if already the first
             row holds a value larger than the chosen one
    :raises ValueError: if the table has no rows
    """
    z_values = zScoreTable["Z"].to_numpy(dtype=float)
    table_values = zScoreTable["value"].to_numpy(dtype=float)
    if table_values.size == 0:
        raise ValueError("zScoreTable must contain at least one row")

    ## in case value is not available in zScoreTable, check for the closest
    ## one(s); argmin returns the first row with the smallest distance
    distances = np.abs(table_values - areaVal)
    closest_pos = int(np.argmin(distances))
    closest_val = table_values[closest_pos]

    ## tie-break towards the following row; the closest row may be the last
    ## one, in which case there is nothing to compare against
    next_pos = closest_pos + 1
    if next_pos < table_values.size and (
        np.round(distances[closest_pos], 6) == np.round(distances[next_pos], 6)
    ):
        closest_val = table_values[next_pos]

    ## in case a value is available in several cells, take the one with the
    ## higher index: the row just before the first row exceeding the value
    above = np.flatnonzero(table_values > closest_val)
    if above.size == 0:
        ## nothing exceeds the chosen value, so the last row is the match
        return np.round(z_values[-1], 2)

    stop = int(above[0])
    print("zScore:", np.round(z_values[stop], 2), "; value:", table_values[stop])
    if stop == 0:
        return None
    return np.round(z_values[stop - 1], 2)

In [None]:
## z-score whose table value is closest to the left-tail area area_left
z = evaluateZScore(zScoreTable, area_left)

## Sample Size Calculation
### Standard Formula

In [None]:
## finite-population sample size: n = n0 / (1 + n0 / N),
## with n0 = z^2 * p * (1 - p) / me^2
sample_size = ( ((z**2 * p*(1-p))/(me**2)) / 
               (1 + ((z**2 * p*(1-p))/(me**2 * N))) )
## show the size rounded up to a whole number next to the exact value
math.ceil(sample_size), sample_size

### Formula for Unknown or Huge Populations

In [None]:
## sample size without reference to the population size N:
## n0 = z^2 * p * (1 - p) / me^2
sample_size = (z**2 * p*(1-p)) / me**2
## show the size rounded up to a whole number next to the exact value
math.ceil(sample_size), sample_size

### Slovin's Formula
The least accurate of the formulas for assessing the sample size. Used when no knowledge about the population's behavior is available.

In [None]:
## Slovin's formula: n = N / (1 + N * me^2); uses neither z nor p
sample_size = N / (1 + (N * me**2))
## show the size rounded up to a whole number next to the exact value
math.ceil(sample_size), sample_size