# Calculate the minimum sample size

In [4]:
import pandas as pd
import math

## Parameters

In [23]:
## population size
N = 1000

## confidence level: it is used below as a probability, (1 + ci)/2,
## to look up the z-score
ci = 0.99

## margin of error (accepted half-width of the interval); it is chosen
## independently of the confidence level and is NOT 1 - ci
me = 0.05

## assumed population proportion; 0.5 is the conservative default when
## nothing is known beforehand, because it maximizes p*(1-p) and therefore
## the sample size produced by the formulas below
p = 0.5

## Evaluate z score

In [24]:
## cumulative probability left of the upper critical value of a two-sided
## interval: the confidence level plus half of the remaining tail area
## (this is the number that is searched for in the z table)
area_left = (1 + ci)/2

## read the left-tail z table: column "Z" holds z up to the first decimal,
## the numeric column headers (0 ... 0.09) hold the second decimal
df = pd.read_excel("zScore.xlsx", sheet_name="zscore_left")
## display the table
df

Unnamed: 0,Z,0,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09
0,0.0,0.5,0.504,0.508,0.512,0.516,0.5199,0.5239,0.5279,0.5319,0.5359
1,0.1,0.5398,0.5438,0.5478,0.5517,0.5557,0.5596,0.5636,0.5675,0.5714,0.5753
2,0.2,0.5793,0.5832,0.5871,0.591,0.5948,0.5987,0.6026,0.6064,0.6103,0.6141
3,0.3,0.6179,0.6217,0.6255,0.6293,0.6331,0.6368,0.6406,0.6443,0.648,0.6517
4,0.4,0.6554,0.6591,0.6628,0.6664,0.67,0.6736,0.6772,0.6808,0.6844,0.6879
5,0.5,0.6915,0.695,0.6985,0.7019,0.7054,0.7088,0.7123,0.7157,0.719,0.7224
6,0.6,0.7257,0.7291,0.7324,0.7357,0.7389,0.7422,0.7454,0.7486,0.7517,0.7549
7,0.7,0.758,0.7611,0.7642,0.7673,0.7704,0.7734,0.7764,0.7794,0.7823,0.7852
8,0.8,0.7881,0.791,0.7939,0.7967,0.7995,0.8023,0.8051,0.8078,0.8106,0.8133
9,0.9,0.8159,0.8186,0.8212,0.8238,0.8264,0.8289,0.8315,0.834,0.8365,0.8389


In [25]:
## Locate the table row that brackets area_left.
## Results:
##   row_index, z_row  -> last row whose first probability is <= area_left
##   lastValInLine     -> that row's entry in the 0.09 column
##   firstValNextLine  -> first probability of the following row
##                        (None when z_row is the last row of the table)
## Everything is reset first so that values left over from an earlier run
## of this cell can never be picked up silently.
row_index = None
z_row = None
firstValNextLine = None

for idx, row in df.iterrows():
    ## .loc makes the label-based lookup of column 0 explicit.
    ## Strict comparison: an exact hit on a row's first entry belongs to
    ## that row (column 0), not to the 0.09 column of the previous row.
    if row.loc[0] > area_left:
        firstValNextLine = row.loc[0]
        break
    ## assign row index and row
    row_index = idx
    z_row = row

if z_row is None:
    raise ValueError(
        f"area_left={area_left} is below the smallest value in the z table"
    )

## assign last value in row
lastValInLine = z_row.loc[0.09]

if firstValNextLine is None and area_left > lastValInLine:
    raise ValueError(
        f"area_left={area_left} is above the largest value in the z table"
    )


def closest(lst, K):
    """
    Find the value closest to the target value.

    If several values are equally close, the one that comes first wins.

    :param lst: iterable of floats, numbers to be checked for the closest number
    :param K: float, target value
    :return: float, the closest number to the target value
    :raises ValueError: if lst is empty
    """
    return min(lst, key=lambda value: abs(value - K))


## Translate the located row into a z-score.
## Only the probability columns (numeric headers 0 ... 0.09) may take part
## in the nearest-value search. The row also carries the "Z" label and the
## spreadsheet index column; a hit on one of those (e.g. area_left = 0.8 in
## the Z = 0.8 row) would return a text header instead of a column value.
prob_cols = [col for col in z_row.index if not isinstance(col, str)]
z_probs = z_row.loc[prob_cols]

## in case area_left is between the last value of the line and
## the first value of the next line
## (chained comparison: firstValNextLine is only looked at when the first
## half holds)
if lastValInLine < area_left < firstValNextLine:

    ## check which of these values is closer to the target value
    x = closest([lastValInLine, firstValNextLine], area_left)

    ## assign row and column values (headers) for later calculation
    if x == lastValInLine:
        z_rowVal = df.iloc[row_index]["Z"]
        z_colVal = 0.09
    else:
        z_rowVal = df.iloc[row_index+1]["Z"]
        z_colVal = 0

else:
    ## assign row and column values (headers) for later calculation
    z_rowVal = df.iloc[row_index]["Z"]
    z_col = closest(list(z_probs), area_left)
    z_colVal = z_probs[z_probs == z_col].index[0]

## sum up row and column values to result in z-score; the table resolves
## z to two decimals, so rounding only strips floating-point noise
z = round(z_rowVal + z_colVal, 2)
z

2.57

## Sample Size Calculation
### Standard Formula

In [26]:
## sample size with population-size correction:
##   n0 = z^2 * p * (1 - p) / me^2
##   n  = n0 / (1 + n0 / N)
spread = z**2 * p * (1 - p)
uncorrected = spread / me**2
correction = 1 + spread / (me**2 * N)
sample_size = uncorrected / correction
## show the rounded-up size next to the exact value
math.ceil(sample_size), sample_size

(398, 397.7681286849062)

### Formula for Unknown or Huge Populations

In [27]:
## sample size without population-size correction:
##   n = z^2 * p * (1 - p) / me^2
spread = z**2 * p * (1 - p)
sample_size = spread / me**2
## show the rounded-up size next to the exact value
math.ceil(sample_size), sample_size

(661, 660.4899999999998)

### Slovin's Formula
The least accurate formula for assessing the sample size. Used when no knowledge about the population's behavior is available.

In [28]:
## Slovin: n = N / (1 + N * me^2); uses only population size and margin of error
slovin_divisor = 1 + N * me**2
sample_size = N / slovin_divisor
## show the rounded-up size next to the exact value
math.ceil(sample_size), sample_size

(286, 285.71428571428567)