<b>Imports and Reading Dataset</b>

In [1]:
# copula_insurance_nogroup.py

import pandas as pd
from scipy.stats import norm
import numpy as np

# Dataset source: https://www.kaggle.com/datasets/teertha/ushealthinsurancedataset
#
# Expected fields: age, sex, bmi, children, smoker, region, charges.
# The file is read as-is, so make sure no field contains a comma and that
# there are no missing or wrong values.
filename = 'insurance.csv'
data = pd.read_csv(filename)

# Preview the first ten records, followed by the usual blank separator lines.
print(data.head(10), end="\n\n\n")

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
5   31  female  25.740         0     no  southeast   3756.62160
6   46  female  33.440         1     no  southeast   8240.58960
7   37  female  27.740         3     no  northwest   7281.50560
8   37    male  29.830         2     no  northeast   6406.41070
9   60  female  25.840         0     no  northwest  28923.13692




From here on, only the numerical fields are used: age, bmi, children, charges.

In [2]:
# Keep only the numerical columns; each name below is bound to one column of `data`.
# Exercise: synthesize the categorical fields too, using dummy variables
# for sex, smoker and region.
age, bmi, children, charges = (
    data.loc[:, column] for column in ("age", "bmi", "children", "charges")
)

<b>Step 1: Compute correlation matrix on real data</b>

In [3]:
# The Gaussian copula is driven by the correlation matrix of the real data.
# Features are stacked as rows (axis 0), which is the layout np.corrcoef
# expects by default: one row per variable, one column per observation.
real_features = (age, bmi, children, charges)
r_data = np.stack(real_features, axis=0)
r_corr = np.corrcoef(r_data)
print(r_corr, end="\n\n\n")

[[1.         0.10927188 0.042469   0.29900819]
 [0.10927188 1.         0.0127589  0.19834097]
 [0.042469   0.0127589  1.         0.06799823]
 [0.29900819 0.19834097 0.06799823 1.        ]]




We also compute the mean of each feature. The means are not needed by the copula itself, but they give a quick sanity check on the real data and a baseline for comparing against the synthetic data later.

In [4]:
# Per-feature means of the real data, in the order age, bmi, children, charges.
r_mu = [np.mean(feature) for feature in (age, bmi, children, charges)]

# Generate as many synthetic observations as there are real ones.
nobs_synth = len(age)

print("Mean: %5.2f %5.2f %5.2f %6.0f" % tuple(r_mu))
print("Nobs: ",nobs_synth)

# Zero mean vector (one entry per feature) for the multivariate Gaussian.
zero = [0] * 4

Mean: 39.21 30.66  1.09  13270
Nobs:  1338


<b>Step 2: Multivariate Gaussian</b>
<p>
Generate a multivariate Gaussian sample with zero mean and covariance matrix equal to the correlation matrix computed on the real data. We generate <code>nobs_synth</code> observations: in this case, the same number as in the real data.

In [5]:
# Draw the latent Gaussian layer of the copula: nobs_synth vectors with zero
# mean and covariance equal to the real-data correlation matrix. Because that
# matrix has a unit diagonal, every marginal is standard normal, and the draws
# carry the correlation structure to be reconstructed in the synthetic data.
#
# The generator is seeded explicitly so that Restart & Run All reproduces the
# same synthetic data set; change `seed` to experiment with other draws.
seed = 453
rng = np.random.default_rng(seed)

gfg = rng.multivariate_normal(zero, r_corr, nobs_synth)  # one row per observation

# One latent Gaussian column per feature, in the order age, bmi, children, charges.
g_age = gfg[:,0]
g_bmi = gfg[:,1]
g_children = gfg[:,2]
g_charges = gfg[:,3]

<b>Step 3 and 4: From Gaussian to Uniform to Target Distrib</b>
<p>
From the correlated Gaussian with the target correlation structure, extract the marginals (one per feature) and turn them into uniforms on [0, 1] by applying the standard normal CDF. Then transform the uniforms into the correct target distribution: the empirical distribution observed in the real data, for each feature. Since both transformations are monotone, the dependence structure carries over and the correlations are approximately preserved.

In [6]:
# Transform each latent Gaussian column into the empirical distribution of the
# matching real feature, while keeping the dependence between columns:
#   1. norm.cdf maps a standard normal to a uniform on [0, 1];
#   2. np.quantile of the real feature, evaluated at that uniform, maps it to
#      a value from the real feature's empirical distribution.
#
# Both functions accept arrays, so each feature is converted in a single call.
# The earlier row-by-row loop produced the same values but re-processed the
# full real column once per synthetic observation and per feature.
#
# NOTE(review): np.quantile interpolates linearly by default, so integer-valued
# features (age, children) can occasionally come out fractional when a
# quantile falls between two distinct observed values.

latent_and_real = (
    (g_age, age),
    (g_bmi, bmi),
    (g_children, children),
    (g_charges, charges),
)

s_columns = []
for g_feature, real_feature in latent_and_real:
    u_feature = norm.cdf(g_feature[:nobs_synth])              # uniforms on [0, 1]
    s_columns.append(np.quantile(real_feature, u_feature))    # synthesized feature

# one row per synthetic observation; columns: age, bmi, children, charges
s_data = np.column_stack(s_columns)

<b>Assessing Quality</b>

In [7]:
# Column-wise means of the synthetic data, printed above the real-data means
# (same order: age, bmi, children, charges) for a side-by-side comparison.
s_mu = np.mean(s_data, axis=0)

print("\n")
print("Mean Synth: %5.2f %5.2f %5.2f %6.0f" % tuple(s_mu[:4]))
print("Mean Real : %5.2f %5.2f %5.2f %6.0f" % tuple(r_mu[:4]))
print("\n")



Mean Synth: 39.29 30.82  1.04  12920
Mean Real : 39.21 30.66  1.09  13270




In [8]:
# Correlation matrix of the synthetic data. s_data holds one observation per
# row, so the features are its columns (hence rowvar=False).
s_corr = np.corrcoef(s_data, rowvar=False)

# Real-data matrix first, synthetic matrix second.
for corr_matrix in (r_corr, s_corr):
    print(corr_matrix)
    print("\n")

# exercise: are pairwise scatterplots on real data similar to those on synth?
# exercise: try with other seeds

[[1.         0.10927188 0.042469   0.29900819]
 [0.10927188 1.         0.0127589  0.19834097]
 [0.042469   0.0127589  1.         0.06799823]
 [0.29900819 0.19834097 0.06799823 1.        ]]


[[1.         0.12396177 0.04096097 0.24419305]
 [0.12396177 1.         0.06499402 0.18377733]
 [0.04096097 0.06499402 1.         0.08027502]
 [0.24419305 0.18377733 0.08027502 1.        ]]




<b>Displaying the Synthesized Data</b>

In [9]:
# Wrap the synthetic array in a pandas DataFrame for display.
# Columns A-D hold age, bmi, children and charges, in that order.
synth_columns = ['Column_A','Column_B','Column_C','Column_D']
s_data = pd.DataFrame(data=s_data, columns=synth_columns)
print(s_data.head(10), end="\n\n\n")

   Column_A   Column_B  Column_C      Column_D
0      28.0  26.698331       1.0  33473.814939
1      18.0  33.660000       0.0   3990.190229
2      36.0  25.175000       2.0  15166.673302
3      18.0  39.400746       3.0   7730.054046
4      62.0  30.020000       0.0  11835.946011
5      32.0  23.968111       2.0  36170.586267
6      26.0  41.684222       2.0   3750.076991
7      29.0  39.820000       0.0  13964.145682
8      36.0  35.792630       1.0  11569.847404
9      45.0  30.249154       4.0   6916.795299




And now the real data, in the same layout, for comparison:

In [10]:
# Real data in the same layout as the synthetic table: one observation per
# row, columns A-D holding age, bmi, children and charges.
#
# The table is rebuilt from the source columns instead of transposing r_data
# in place: the in-place version consumed its own output, so running the cell
# a second time transposed the already-built DataFrame and no longer showed
# the real data. Built this way, the cell can be re-run safely.
r_data = pd.DataFrame(
    np.stack((age, bmi, children, charges), axis=1),
    columns = ['Column_A','Column_B','Column_C','Column_D'],
)
print(r_data.head(10))

   Column_A  Column_B  Column_C     Column_D
0      19.0    27.900       0.0  16884.92400
1      18.0    33.770       1.0   1725.55230
2      28.0    33.000       3.0   4449.46200
3      33.0    22.705       0.0  21984.47061
4      32.0    28.880       0.0   3866.85520
5      31.0    25.740       0.0   3756.62160
6      46.0    33.440       1.0   8240.58960
7      37.0    27.740       3.0   7281.50560
8      37.0    29.830       2.0   6406.41070
9      60.0    25.840       0.0  28923.13692
