In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


In [None]:
def check_correlation1(sample_size=10000,
                       p_malnourished=0.1,
                       mm_bw_effect=250,
                       mm_cgf_effect=0.5,
                       seed=None):
    """Simulate `cgf` and `bw` that are linked only through one shared flag.

    Each simulated row gets a boolean `mom_malnourished` flag. `cgf` and `bw`
    are then drawn from *independent* normal distributions whose means are
    shifted by `mm_cgf_effect` / `mm_bw_effect` when the flag is set, so any
    correlation between the two columns comes from the shared flag alone.
    (`cgf` / `bw` presumably stand for child growth failure and birth
    weight -- inferred from the names, confirm with the author.)

    The function prints the pooled Pearson correlation of `cgf` and `bw` and
    shows a seaborn joint plot of the two columns.

    Parameters
    ----------
    sample_size : int
        Number of simulated rows.
    p_malnourished : float
        Probability that the flag is True for a row.
    mm_bw_effect : float
        Amount added to the mean of `bw` (base mean 3500, sd 500) when the
        flag is True.
    mm_cgf_effect : float
        Amount added to the mean of `cgf` (base mean 7, sd 2) when the flag
        is True.
    seed : int or None
        If given, draws come from a private `np.random.RandomState(seed)`, so
        the result is reproducible and the global NumPy state is untouched.
        If None (default), the global `np.random` state is used, exactly as
        before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns `cgf`, `bw` and `mom_malnourished`, one row per sample.
    """
    # `np.random` (the module) and a RandomState expose the same `choice` /
    # `normal` methods, so either can serve as the generator below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    mom_malnourished = rng.choice([1, 0],
                                  size=sample_size,
                                  p=[p_malnourished, 1 - p_malnourished]).astype(bool)

    cgf_mean = 7
    cgf_sd = 2
    # A full-size sample is drawn for both the unshifted and the shifted
    # distribution; the flag then picks one of them per row.
    cgf_no_effect = rng.normal(loc=cgf_mean, scale=cgf_sd, size=sample_size)
    cgf_effect = rng.normal(loc=cgf_mean + mm_cgf_effect, scale=cgf_sd, size=sample_size)
    cgf = np.where(mom_malnourished, cgf_effect, cgf_no_effect)

    bw_mean = 3500
    bw_sd = 500
    bw_no_effect = rng.normal(loc=bw_mean, scale=bw_sd, size=sample_size)
    bw_effect = rng.normal(loc=bw_mean + mm_bw_effect, scale=bw_sd, size=sample_size)
    bw = np.where(mom_malnourished, bw_effect, bw_no_effect)

    df = pd.DataFrame({'cgf': cgf, 'bw': bw, 'mom_malnourished': mom_malnourished})

    print('Correlation: ', np.corrcoef(cgf, bw)[0][1])

    sns.jointplot(x='cgf', y='bw', alpha=0.05, data=df)

    plt.show()

    # Hand the simulated data back so later cells can inspect it.
    return df

In [None]:
# Run the independent-draw simulation with all default parameters.
check_correlation1()

In [None]:
def check_correlation2(sample_size=10000,
                       p_malnourished=0.1,
                       mm_bw_effect=250,
                       correlation=0.5,
                       seed=None):
    """Simulate `bw` and `cgf` that are correlated within each group.

    Each simulated row gets a boolean flag (`mm`). `bw` and `cgf` are drawn
    jointly from a bivariate normal with the given within-group
    `correlation`; when the flag is True the mean of `bw` is shifted by
    `mm_bw_effect`, while the mean of `cgf` is the same in both groups.
    (`bw` / `cgf` / `mm` presumably stand for birth weight, child growth
    failure and maternal malnourishment -- inferred from the names, confirm
    with the author.)

    The function prints the pooled Pearson correlation of `cgf` and `bw` and
    shows a seaborn joint plot of the two columns.

    Parameters
    ----------
    sample_size : int
        Number of simulated rows.
    p_malnourished : float
        Probability that the flag is True for a row.
    mm_bw_effect : float
        Amount added to the mean of `bw` (base mean 3500, sd 500) when the
        flag is True.
    correlation : float
        Correlation between `bw` and `cgf` inside each group; must lie in
        [-1, 1]. Defaults to 0.5, the value that used to be hard-coded.
    seed : int or None
        If given, draws come from a private `np.random.RandomState(seed)`, so
        the result is reproducible and the global NumPy state is untouched.
        If None (default), the global `np.random` state is used, exactly as
        before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns `bw`, `cgf` and `mm`, one row per sample.

    Raises
    ------
    ValueError
        If `correlation` is outside [-1, 1].
    """
    if not -1 <= correlation <= 1:
        # Outside this range the matrix below is not a valid covariance;
        # NumPy would only warn and still return samples.
        raise ValueError('correlation must be between -1 and 1, got %r' % (correlation,))

    # `np.random` (the module) and a RandomState expose the same `choice` /
    # `multivariate_normal` methods, so either can serve as the generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    mom_malnourished = rng.choice([1, 0],
                                  size=sample_size,
                                  p=[p_malnourished, 1 - p_malnourished]).astype(bool)

    bw_mean = 3500
    bw_mean_with_effect = bw_mean + mm_bw_effect
    bw_var = 500**2
    cgf_mean = 10
    cgf_var = 2**2
    covariance = correlation * np.sqrt(bw_var) * np.sqrt(cgf_var)

    # Component order is (bw, cgf) everywhere below, matching the column names.
    mu_no_effect = [bw_mean, cgf_mean]
    mu_effect = [bw_mean_with_effect, cgf_mean]
    sigma = [[bw_var, covariance],
             [covariance, cgf_var]]

    # A full-size sample is drawn for each group; the flag then picks one of
    # the two candidate rows per sample.
    no_effect = rng.multivariate_normal(mu_no_effect, sigma, size=sample_size)
    effect = rng.multivariate_normal(mu_effect, sigma, size=sample_size)

    # (n, 1) mask broadcasts across both columns of the (n, 2) samples.
    data = np.where(mom_malnourished[:, None], effect, no_effect)
    data = pd.DataFrame(data, columns=['bw', 'cgf'])
    data['mm'] = mom_malnourished

    print('Correlation: ', np.corrcoef(data['cgf'], data['bw'])[0][1])
    sns.jointplot(x='cgf', y='bw', alpha=0.05, data=data)
    plt.show()
    return data

In [None]:
# Keep the simulated frame: the next cell summarises it as `df`, and no other
# cell defines that name at notebook level.
df = check_correlation2()

In [None]:
# Per-group summary statistics: describe() is computed separately for each
# value of 'mm', then stack() moves the statistic names (count, mean, ...)
# from the columns into the row index.
# `df` must hold a frame with an 'mm' column, e.g. the one returned by
# check_correlation2().
df.groupby('mm').describe().stack()