## Imports

In [1]:
# Miscellaneous
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import os
# CTGAN and TVAE
from sdv.tabular import CTGAN, TVAE
# BN
from pomegranate import *
# Copula
from utils import Copula_scaler, pseudo_inverse, project_samples
# Utils
from sklearn.model_selection import train_test_split
# Evaluation
from utils import srmse, DWP, sampling_zeros
from sdmetrics import single_column
from sdv.metrics.tabular import CSTest, SVCDetection


# Traditional population synthesis

## Experiment 1: target at state level

## Data

In [2]:
# Gather every per-county CSV under data_dir into a single Maryland table.
# Each file's rows are tagged with the name of the folder that contains it.
data_dir = "../Data/data"
dfs = []
for subdir, dirs, files in os.walk(data_dir):
    for file in files:
        # Skip macOS Finder metadata wherever it shows up in the tree, not
        # only in the top-level folder.
        if file == ".DS_Store":
            continue
        county = os.path.basename(subdir)
        df = pd.read_csv(os.path.join(subdir, file), encoding='unicode_escape')
        df["COUNTY"] = county
        dfs.append(df)
# ignore_index gives every row a unique label; without it each file keeps its
# own 0..n-1 labels and any label-based operation hits rows of every county.
df = pd.concat(dfs, ignore_index=True)
# Files sitting directly in data_dir get COUNTY == "data"; filter them with a
# boolean mask. Dropping by index label here used to remove the same-labelled
# rows of all the real counties as well.
df = df.loc[df["COUNTY"] != "data"].drop(columns=["Unnamed: 0"])
df.to_csv("../Data/maryland.csv", index=False)


In [3]:
# Split the Maryland table into a small "source" sample and the remaining
# "target" population, then persist both for the experiment.
target = pd.read_csv("../Data/maryland.csv")
# Integer code for each county name, in sorted order of the names.
mapping = {county: code for code, county in enumerate(np.unique(target["COUNTY"]))}
# mapping covers every value present, so .map performs the same recoding as
# .replace without the element-wise search and dtype-downcast step.
target["COUNTY"] = target["COUNTY"].map(mapping)
target = target.drop(columns=["PUMA"])
# frac=0.01 draws 1% of the rows.
# NOTE(review): the previous comment said "0.1% PUMS"; confirm the intended size.
# The seed is fixed so the split written to disk can be regenerated exactly.
SPLIT_SEED = 0
source = target.sample(frac=0.01, random_state=SPLIT_SEED)
target = target.drop(index=source.index)  # Remove source from target
target.to_csv("../Data/exp1/target.csv", index=False)
source.to_csv("../Data/exp1/source.csv", index=False)


In [4]:
# Read the persisted source/target split back from disk.
source = pd.read_csv("../Data/exp1/source.csv")
target = pd.read_csv("../Data/exp1/target.csv")
columns = target.columns
# Generated populations, keyed by model name.
synthetic = dict()
models = [
    "CTGAN",
    "CTGANCopula",
    "TVAE",
    "TVAECopula",
    "BN",
    "BNCopula",
    "Ind",
]

## CTGAN

In [5]:
# Column typing handed to the SDV models: everything is categorical except
# AGEP, which is an integer. NP and WIF could also be modelled as numerical.
field_types = {
    name: {"type": "categorical"}
    for name in ("HINCP", "NP", "AGEP", "RAC1P", "ESR", "SEX", "WIF", "HUPAC", "HHT")
}
field_types["AGEP"] = {"type": "numerical", "subtype": "integer"}

In [6]:
# CTGAN hyper-parameters, kept in a dict so they can be inspected next to the model.
ctgan_args = dict(
    # field_names=list(columns),
    field_types=field_types,
    embedding_dim=128,
    generator_dim=(256, 256),
    discriminator_dim=(256, 256),
    generator_lr=2e-4,
    generator_decay=1e-6,
    discriminator_lr=2e-4,
    discriminator_decay=1e-6,
    batch_size=500,
    discriminator_steps=1,
    epochs=300,
    cuda=False,
)
ctgan = CTGAN(**ctgan_args)

In [7]:
# Training: fit CTGAN on the source sample with exact duplicate rows removed.
ctgan.fit(source.drop_duplicates())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()


In [8]:
# Draw one synthetic row per target row and register the result under "CTGAN".
synthetic["CTGAN"] = ctgan_data = ctgan.sample(len(target))

In [9]:
# Persist the CTGAN model object to disk.
ctgan.save("../Saved_Models/ctgan.pkl")
# Counterpart call for reading the file back (left disabled):
# CTGAN.load("../Saved_Models/ctgan.pkl")

## CTGAN+Copula

In [10]:
# Column typing for the copula variants; identical to the specification used
# for plain CTGAN. AGEP is the only numerical field (NP and WIF could be too).
field_types = {
    name: (
        {"type": "numerical", "subtype": "integer"}
        if name == "AGEP"
        else {"type": "categorical"}
    )
    for name in ("HINCP", "NP", "AGEP", "RAC1P", "ESR", "SEX", "WIF", "HUPAC", "HHT")
}

In [11]:
# Hyper-parameters of the CTGAN instance trained on copula-encoded data.
ctgan_copula_args = dict(
    # field_names=list(columns),
    field_types=field_types,
    embedding_dim=128,
    generator_dim=(256, 256),
    discriminator_dim=(256, 256),
    generator_lr=2e-4,
    generator_decay=1e-6,
    discriminator_lr=2e-4,
    discriminator_decay=1e-6,
    batch_size=500,
    discriminator_steps=1,
    epochs=300,
    cuda=False,
)
ctgan_copula = CTGAN(**ctgan_copula_args)

In [12]:
# One scaler per population: the source scaler encodes the training data,
# the target scaler decodes the generated rows.
scaler_source, scaler_target = Copula_scaler(source), Copula_scaler(target)
# Copula uniform
source_enc = scaler_source.encode(source)
# Train on the de-duplicated encoded source, then sample one row per target row
# and map the samples back through the target scaler (pseudo inverse).
# No resampling trick is applied in this variant.
ctgan_copula.fit(source_enc.drop_duplicates())
ctgan_copula_data = scaler_target.decode(ctgan_copula.sample(len(target)))
synthetic["CTGANCopula"] = ctgan_copula_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()


In [None]:
# NOTE(review): this cell repeats the CTGAN+Copula cell directly above and has
# no execution count. Under Restart & Run All it would fit the model a second
# time and overwrite the first result, so it only runs when no CTGANCopula
# sample has been produced yet. Consider deleting the cell.
if "CTGANCopula" not in synthetic:
    scaler_source = Copula_scaler(source)
    scaler_target = Copula_scaler(target)
    # Copula uniform
    source_enc = scaler_source.encode(source)
    # Training and Sampling
    ctgan_copula.fit(source_enc.drop_duplicates())
    ctgan_copula_data = ctgan_copula.sample(target.shape[0])
    # No resampling trick is applied in this variant.
    # Pseudo inverse
    ctgan_copula_data = scaler_target.decode(ctgan_copula_data)
    synthetic["CTGANCopula"] = ctgan_copula_data



In [13]:
# Persist the copula-variant CTGAN model object to disk.
ctgan_copula.save("../Saved_Models/ctgan+copula.pkl")
# Counterpart call for reading the file back (left disabled):
# CTGAN.load("../Saved_Models/ctgan+copula.pkl")

## TVAE

In [14]:
# TVAE hyper-parameters, kept in a dict so they can be inspected next to the model.
tvae_args = dict(
    # field_names=list(columns),
    field_types=field_types,
    embedding_dim=128,
    compress_dims=(128, 128),
    decompress_dims=(128, 128),
    l2scale=1e-5,
    batch_size=500,
    epochs=300,
    cuda=False,
)
tvae = TVAE(**tvae_args)

In [15]:
# Training: fit TVAE on the source sample with exact duplicate rows removed.
tvae.fit(source.drop_duplicates())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()


In [16]:
# Draw one synthetic row per target row and register the result under "TVAE".
synthetic["TVAE"] = tvae_data = tvae.sample(len(target))

In [17]:
# Persist the TVAE model object to disk.
tvae.save("../Saved_Models/tvae.pkl")
# Counterpart call for reading the file back (left disabled):
#TVAE.load("../Saved_Models/tvae.pkl")

## TVAE+Copula

In [18]:
# Hyper-parameters of the TVAE instance trained on copula-encoded data.
tvae_copula_args = dict(
    # field_names=list(columns),
    field_types=field_types,
    embedding_dim=128,
    compress_dims=(128, 128),
    decompress_dims=(128, 128),
    l2scale=1e-5,
    batch_size=500,
    epochs=300,
    cuda=False,
)
tvae_copula = TVAE(**tvae_copula_args)

In [19]:
# One scaler per population: the source scaler encodes the training data,
# the target scaler decodes the generated rows.
scaler_source, scaler_target = Copula_scaler(source), Copula_scaler(target)
# Copula uniform
source_enc = scaler_source.encode(source)
# Train on the de-duplicated encoded source, then sample one row per target row
# and map the samples back through the target scaler (pseudo inverse).
# No resampling trick is applied in this variant.
tvae_copula.fit(source_enc.drop_duplicates())
tvae_copula_data = scaler_target.decode(tvae_copula.sample(len(target)))
synthetic["TVAECopula"] = tvae_copula_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()


In [20]:
# Persist the copula-variant TVAE model object to disk.
tvae_copula.save("../Saved_Models/tvae+copula.pkl")
# Counterpart call for reading the file back (left disabled):
#TVAE.load("../Saved_Models/tvae+copula.pkl")

## BN

In [21]:
# Learn a Bayesian network from the source sample (greedy structure search),
# then draw one row per target row by rejection sampling.
bn = BayesianNetwork.from_samples(source, algorithm="greedy")
bn_data = pd.DataFrame(
    bn.sample(n=len(target), algorithm="rejection"),
    columns=columns,
)
synthetic["BN"] = bn_data

## BN + Copula

In [22]:
# One scaler per population: the source scaler encodes the training data,
# the target scaler decodes the generated rows.
scaler_source, scaler_target = Copula_scaler(source), Copula_scaler(target)
# Copula uniform
source_enc = scaler_source.encode(source)
# Learn the network on the encoded source and draw one row per target row.
bn_copula = BayesianNetwork.from_samples(source_enc, algorithm="greedy")
bn_copula_data = pd.DataFrame(
    bn_copula.sample(n=len(target), algorithm="rejection"),
    columns=columns,
)
# Resampling trick with the source scaler, then pseudo inverse with the target scaler.
bn_copula_data = scaler_target.decode(scaler_source.resampling_trick(bn_copula_data))
synthetic["BNCopula"] = bn_copula_data


## IPF

In [23]:
# Data for IPF
#target_ipf = target.copy()
#source_ipf = source.copy()
#target_ipf["WIF"] += 1
#source_ipf["WIF"] += 1
#source_ipf.to_csv(f"../Data/exp1/ipfData/source.csv", index=False)
#for col in columns:
    #unique, counts = np.unique(target_ipf[col], return_counts=True)
    # Remove values from target that are not in source
    #unique_source = np.unique(source_ipf[col])
    #unique_cleaned = list(unique)
    #counts_cleaned = list(counts)
    #for value in unique:
        #if value not in unique_source:
            #idx = unique_cleaned.index(value)
            #unique_cleaned.pop(idx)
            #counts_cleaned.pop(idx)
    #df = pd.DataFrame(dict(zip(unique_cleaned, counts_cleaned)), index=[0])
    # Put 0 where data in source not in target
    #for value in unique_source:
        #if value not in unique:
            #df[value] = 0
    #df.to_csv(f"../Data/exp1/ipfData/{col}.csv", index=False)

In [24]:
#ipf_w = pd.read_csv("../Data/exp1/ipfData/weights.csv")
#ipf_w["weight"] = (ipf_w["weight"]).astype(int)
#ipf_w = ipf_w.loc[ipf_w.index.repeat(ipf_w["weight"])].reset_index(drop=True)
#ipf_data = ipf_w.drop(["weight", "id", "geo_all", "avg_weight" ,"weight_factor"], axis=1)
#ipf_data["WIF"] -= 1
#synthetic["IPF"] = ipf_data

### Independent baseline

In [25]:
# Independent baseline: every attribute is drawn on its own from the source
# sample, so the source marginals are reproduced in expectation while any
# dependency between columns is discarded.
n_rows = target.shape[0]
ind_data = pd.DataFrame(
    {
        # One vectorised draw with replacement per column. This replaces the
        # per-element Series.sample(1) call, which ran rows x columns times
        # and stored a length-1 Series into a scalar array slot.
        # dtype=float keeps the float64 columns the zero-initialised array
        # used to produce.
        col: source[col].sample(n=n_rows, replace=True).to_numpy(dtype=float)
        for col in columns
    },
    columns=columns,
)
synthetic["Ind"] = ind_data

## Evaluation

In [27]:
# Write every synthetic population to its own CSV, named after the model.
for model, df in synthetic.items():
    df.to_csv(f"../Data/exp1/synthesis/{model}.csv", index=False)

In [28]:
# Read the saved synthetic populations back, one CSV per entry of `models`.
synthetic.update(
    (model, pd.read_csv(f"../Data/exp1/synthesis/{model}.csv"))
    for model in models
)

In [29]:
# Metric objects shared by the evaluation loop.
csTest = CSTest(single_column.statistical.CSTest)
svcDetection = SVCDetection()

# Per-model scores are collected here.
results = {}
# For the metrics, every column is declared categorical.
metadata = {"fields": {col: {"type": "categorical"} for col in columns}}

In [None]:
# Score every synthetic population against the target.
# Number of real and synthetic rows handed to the detection metric.
n = 10000
# Fixed seed so the detection score can be reproduced from the saved CSVs.
EVAL_SEED = 0
real_sample = target.sample(n=n, random_state=EVAL_SEED)
for model in synthetic:
    results[model] = {}
    df = synthetic[model]
    # Chi-squared
    cs = csTest.compute(target, df, metadata=metadata)
    results[model]["CS"] = cs
    # Detection
    detection = svcDetection.compute(
        real_sample, df.sample(n=n, random_state=EVAL_SEED), metadata=metadata
    )
    results[model]["detection"] = detection
    # SRMSE, averaged over every combination of i columns, for i = 1..n_columns
    for i in range(1, target.shape[1] + 1):
        combos = list(itertools.combinations(columns, i))  # No repeated elements
        SRMSE = 0
        # The loop variable must not be called `tuple`: that rebinds the
        # builtin for the rest of the kernel session.
        for combo in combos:
            # Select the combination directly, so both frames present the
            # same columns in the same order.
            cols = list(combo)
            SRMSE += srmse(target[cols], df[cols])
        SRMSE /= len(combos)
        results[model]["SRMSE" + str(i)] = SRMSE
    results[model]["Sampling Zeros"] = sampling_zeros(source, target, df)



In [None]:
# One row per model (any "Target" entry is left out), one column per metric.
results_df = pd.concat(
    [
        pd.DataFrame(dict(results[model]), index=[model])
        for model in results
        if model != "Target"
    ]
)
results_df.to_csv("../results/exp1/metrics.csv")
results_df

In [None]:
# County codes and their names, taken from the name-to-code mapping.
values = list(mapping.values())
keys = list(mapping)

# Grouped histogram of the COUNTY marginal: the target first, then each model.
plt.figure(figsize=(15, 7))
plt.hist(
    [target["COUNTY"], *(synthetic[model]["COUNTY"] for model in synthetic)],
    label=["Target", *(f'{model}' for model in synthetic)],
    # One bin per county code; the extra edge closes the last bin.
    bins=[*values, values[-1] + 1],
)
plt.xticks(values, keys, rotation=45)
plt.xlabel("County")
plt.ylabel("Counts")
plt.legend()
plt.savefig("../results/figures/countymarginals.png", bbox_inches='tight')
plt.show()