# PUMA-to-PUMA experiment

- NP: Number of people in the household
- HHT: Household or family type
- HINCP: Household income 
- HUPAC: Household presence and age of children 
- WIF: Workers in family during the last 12 months
- AGEP: Age of the person
- SEX: Gender of the person
- ESR: Employment status of the person 
- RAC1P: Recoded detailed race

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from copula_tools import *
from utils import *

In [None]:
# Read the two treated PUMA samples (Anne Arundel as source, Baltimore City
# as target). The PUMA identifier column is dropped right after loading.
source = pd.read_csv(
    'data/in/counties/Anne Arundel/PUMA_1201_Sample_Treated.csv'
).drop(columns=["PUMA"])
target = pd.read_csv(
    'data/in/counties/Baltimore City/PUMA_804_Sample_Treated.csv'
).drop(columns=["PUMA"])

# Column index of the source frame; later cells reuse it.
columns = source.columns
list(columns)

In [None]:
# One scaler per observed sample; their `ecdf_map` attributes are read by the
# ECDF plot in the results section.
source_scaler, target_scaler = CopulaScaler(), CopulaScaler()
source_scaler.fit(source)
target_scaler.fit(target)

In [None]:
# Sample from the source data with `sample_bn`, asking for as many rows as the
# target sample has.
# NOTE(review): presumably a Bayesian-network sampler, going by the "BN"
# label used for this data in the results — confirm against its definition.
bn_data = sample_bn(source, len(target))
# Separate scaler fitted on the BN sample; its `ecdf_map` is plotted later.
bn_scaler = CopulaScaler()
bn_scaler.fit(bn_data)

In [None]:
# `sample_copula` receives the source data, the target data and the
# `sample_bn` function itself (passed as a callable, not called here).
# NOTE(review): presumably it combines the sampler with a copula-based
# adjustment towards the target — confirm against its definition.
bn_copula_data = sample_copula(source, target, sample_bn)
# Separate scaler fitted on this sample; plotted as "BN + Copula" later.
copula_scaler = CopulaScaler()
copula_scaler.fit(bn_copula_data)

In [None]:
# `ipf_in` is given the target data, the source data, the column index and a
# directory path; `sample_ipf` is then pointed at the same path.
# NOTE(review): presumably `ipf_in` writes the IPF inputs under that path and
# `sample_ipf` reads them back — confirm; note the (target, source) argument
# order differs from the other samplers in this notebook.
ipf_in(target, source, columns, "data/in/ipf/puma")
ipf_data = sample_ipf("data/in/ipf/puma")
# Separate scaler fitted on the IPF sample; its `ecdf_map` is plotted later.
ipf_scaler = CopulaScaler()
ipf_scaler.fit(ipf_data)

In [None]:
# Sample produced by `sample_independent` from the source and target data;
# it is reported under the "Independent" label in the results.
ind_data = sample_independent(source, target)

In [None]:
# Synthetic samples keyed by the label passed on to `result_table`.
synthetic_data = {
    "Independent": ind_data,
    "IPF": ipf_data,
    "BN": bn_data,
    "BN Copula": bn_copula_data,
}
# Starts empty and is not written to in the cells shown here.
# NOTE(review): confirm whether anything still uses it.
srmse_dict = {}

### Results

In [None]:
# Overlay the empirical CDFs of the observed and synthetic samples, one panel
# per column. SEX is left out of the plot.
fig, axs = plt.subplots(2, 3, figsize=(16, 8))
columns_toplot = list(columns.drop("SEX"))

# Line styles: one for the two observed samples, one for the synthetic ones.
observed_style = dict(lw=2.5, alpha=0.75)
synthetic_style = dict(linestyle='dotted', markevery=2, lw=3)

# (scaler, legend label, style) in drawing order. The order also decides which
# colour of the default cycle each series receives, so keep it stable.
ecdf_series = [
    (source_scaler, 'Training', observed_style),
    (target_scaler, 'Reference', observed_style),
    (ipf_scaler, 'IPF', synthetic_style),
    (bn_scaler, 'BN', synthetic_style),
    (copula_scaler, 'BN + Copula', synthetic_style),
]

# `axs.flat` walks the 2x3 grid row by row. `zip` stops at the shorter input:
# only the first six columns are drawn, and fewer than six columns simply
# leaves the remaining panels empty instead of raising an IndexError.
for panel, col_name in zip(axs.flat, columns_toplot):
    for series_scaler, series_label, series_style in ecdf_series:
        # Convert once per series; column 0 is used as x and column 1 as y.
        ecdf_points = np.array(series_scaler.ecdf_map[col_name])
        panel.step(
            ecdf_points[:, 0],
            ecdf_points[:, 1],
            label=series_label,
            **series_style,
        )
    panel.set_title(col_name)

# A single legend, placed in the bottom-right panel, serves the whole figure.
axs[-1, -1].legend(loc='lower right', frameon=True)
plt.tight_layout()
plt.savefig("data/out/puma/ecdf.png")
plt.show()


In [None]:
# Compare every synthetic sample against the source and target data. The
# table is saved under data/out/puma (save=True) and displayed rounded to two
# decimals.
result_table(
    source,
    target,
    synthetic_data,
    columns,
    max_projection=5,
    save=True,
    path="data/out/puma",
).round(2)