Everything here should end up being in its own function, split by cell

In [None]:
# CONFIG FILE PARAMETERS
# Directory holding the input files read by the cells below.
data_path = "../data/"
# Placeholder (Ellipsis): no seed value has been chosen yet.
random_seed = ...
# Intended output directory.
# NOTE(review): the cells in this notebook that write CSV files use
# `data_path`, not this variable - confirm which one is intended.
output_filepath = "../data/"
# Placeholder for further configuration parameters.
...

# Utility functions
Functions to be used by any file

In [None]:
# Plot a given set of data (try and abstract this to handle all plot calls for
# a given type of plot.)
...

In [None]:
import os

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [None]:
from collections.abc import Callable
from copy import deepcopy

In [None]:
def inplacify(func):
    """Decorate a function with an ``inplace`` keyword switch.

    The wrapper consumes the keyword argument ``inplace`` (default ``True``);
    it is never forwarded to ``func``:

    * ``inplace=True``: ``func`` is called on the original arguments, so every
      mutation it performs is visible to the caller.  The return value of
      ``func`` is discarded and the wrapper returns ``None``.
    * ``inplace=False``: the first positional argument is deep-copied,
      ``func`` is called on that copy (plus the remaining arguments) and its
      return value is returned.  The caller's object is left unchanged.

    Parameters
    ----------
    func : Callable
        The function to add the inplace functionality.  Its first positional
        argument is assumed to be the mutable it modifies.

    Returns
    -------
    Callable
        Wrapped function with the inplace functionality.

    Raises
    ------
    IndexError
        Raised by the wrapper when called with ``inplace=False`` and no
        positional argument (there is nothing to copy).
    """
    # local import: functools is not among the notebook-level imports
    from functools import wraps

    @wraps(func)  # keep the name, docstring and signature of func
    def wrapper(*args, **kwargs):
        # get (and remove) the inplace keyword argument value
        inplace = kwargs.pop("inplace", True)
        if inplace:
            # inplace modifies all mutables that are changed inside func
            func(*args, **kwargs)
            return None
        if not args:
            raise IndexError(
                "inplace=False requires the mutable to be passed as the "
                "first positional argument"
            )
        # copy the first positional argument, i.e. the mutable to keep unchanged
        target_copy = deepcopy(args[0])
        # apply the function to the copy and the rest of the arguments
        return func(target_copy, *args[1:], **kwargs)
    return wrapper


In [None]:
# Output function(s) which support the needed types of output (csv, json, ...)
...

# Loading the data

**At this point we are not considering the sampling weights, which are not identical for all individuals in the PUMF; the estimates calculated using this data are therefore not representative of the (survey) population.**

In [None]:
# system imports
import json
import pickle

import ydata_profiling

import numpy as np
import pandas as pd

from ydata_profiling import ProfileReport
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer

---
Tested with ydata_profiling v4.5.1, pickle v4.0

---

In [None]:
ydata_profiling.__version__, pickle.format_version

In [None]:
def load_cchs_smk(data_path):
    """Load CCHS SMOKE data dictionary pickled file from data_path

    Parameters
    ----------
    data_path : str or os.PathLike
        The directory containing the pickle file CCHS_SMK.pkl.  A trailing
        path separator is optional.

    Returns
    -------
    dict
        A dictionary with CCHS PUMF SMOKE dataframe, with datatypes and categorical features.
        For instance, data_dict['dtypes']['GEO_PRV'] contains datatype = int64.
        data_dict['cat']['GEO_PRV'] the possible values as in the original data.
        data_dict['dataframe']['GEO_PRV'] the dataframe for this column with ADM_RNO as index.

    Raises
    ------
    OSError
        If the pickle file cannot be opened (e.g. it does not exist).
    """
    # os.path.join gives the same file whether or not data_path ends with "/"
    pickle_file = os.path.join(data_path, "CCHS_SMK.pkl")
    # NOTE: unpickling can execute arbitrary code; only load files from a
    # trusted source.
    with open(pickle_file, "rb") as f:
        return pickle.load(f)

In [None]:
data_dict = load_cchs_smk(data_path)

## Preliminary checks

### Provinces
Remap provinces to sequential codes and compare to PUMF documentation.

In [None]:
@inplacify
def remap_provinces(data_dict, inplace=True):
    """Remap provinces to sequential codes

    The sorted original codes found in data_dict['cat']['GEO_PRV'] are paired
    with the province names listed below, and the i-th pair gets the new code
    i (starting at 0).  Values of the GEO_PRV column that are not part of that
    pairing are not mapped (pandas ``Series.map`` yields NaN for them).

    Parameters
    ----------
    data_dict : dict
        A modified dictionary with CCHS PUMF SMOKE dataframe, with datatypes and categorical features.
        For instance, data_dict['dtypes']['GEO_PRV'] contains datatype = int64.
        data_dict['cat']['GEO_PRV'] the possible values as in the original data.
        data_dict['dataframe']['GEO_PRV'] the dataframe for this column with ADM_RNO as index.

    inplace : bool
        Change data_dict if inplace is True.  This keyword is consumed by the
        ``inplacify`` decorator and never reaches the function body.

    Returns
    -------
    dict or None
        If inplace is False returns a new data_dict modified.  If inplace is
        True the decorator discards the return value and None is returned.
    """
    # original order of codes in terms of provinces names
    provinces = ["NEWFOUNDLAND AND LABRADOR",
                 "PRINCE EDWARD ISLAND",
                 "NOVA SCOTIA",
                 "NEW BRUNSWICK",
                 "QUEBEC",
                 "ONTARIO",
                 "MANITOBA",
                 "SASKATCHEWAN",
                 "ALBERTA",
                 "BRITISH COLUMBIA",
                 "YUKON",
                 "NORTHWEST TERRITORIES",
                 "NUNAVUT"]
    # original code -> sequential code; zipping with the names limits the
    # mapping to as many codes as there are listed provinces
    original_codes = sorted(data_dict["cat"]["GEO_PRV"])
    provinces_map = {
        code: new_code
        for new_code, (code, _name) in enumerate(zip(original_codes, provinces))
    }

    # remap provinces codes
    data_dict["dataframe"]["GEO_PRV"] = data_dict["dataframe"]["GEO_PRV"].map(provinces_map)
    return data_dict

In [None]:
data_dict_test = remap_provinces(data_dict, inplace=False)

In [None]:
data_dict_test["dataframe"]["GEO_PRV"].value_counts()

**Let's compare with original PUMF documentation**

In [None]:
data_dict["dataframe"]["GEO_PRV"].value_counts().sort_index()

![Provinces](./img/provinces.png "Provinces")

This proves that the data based on provinces was loaded correctly.

### Health regions
Repeat process above with health regions

In [None]:
@inplacify
def map_regions(data_dict, inplace=True):
    """Remap health regions to sequential codes

    The health regions are read from the file health_regions.txt located in
    the directory given by the module-level variable ``data_path``.  The
    sorted original codes found in data_dict['cat']['GEODGHR4'] are paired, in
    order, with the row labels of that table; each original code is replaced
    by its paired row label.

    Parameters
    ----------
    data_dict : dict
        A modified dictionary with CCHS PUMF SMOKE dataframe, with datatypes and categorical features.
        For instance, data_dict['dtypes']['GEO_PRV'] contains datatype = int64.
        data_dict['cat']['GEO_PRV'] the possible values as in the original data.
        data_dict['dataframe']['GEO_PRV'] the dataframe for this column with ADM_RNO as index.

    inplace : bool
        Change data_dict if inplace is True.  This keyword is consumed by the
        ``inplacify`` decorator and never reaches the function body.

    Returns
    -------
    dict or None
        If inplace is False returns a new data_dict modified.  If inplace is
        True the decorator discards the return value and None is returned.
    """
    # read the health regions from file (same order as PUMF)
    # os.path.join works whether or not data_path ends with a separator
    regions_file = os.path.join(data_path, "health_regions.txt")
    regions_df = pd.read_csv(regions_file, sep="+", header=0, names=["Name"])
    # original code -> row label of the regions table
    # (presumably 0..n-1 - verify against the layout of health_regions.txt)
    regions_map = dict(zip(sorted(data_dict["cat"]["GEODGHR4"]), regions_df.index))
    # remap regions codes
    data_dict["dataframe"]["GEODGHR4"] = data_dict["dataframe"]["GEODGHR4"].map(regions_map)
    return data_dict

In [None]:
map_regions(data_dict_test, inplace=True)

In [None]:
data_dict["dataframe"]["GEODGHR4"].value_counts().sort_index().head(12)

![regions](./img/regions.png "regions")

## Selected column and mappings

In [None]:
# read regions from file: the first column becomes the index and every other
# column becomes a {index label: value} dict keyed by the column name
# (os.path.join works whether or not data_path ends with a separator)
regions_dict = pd.read_csv(
    os.path.join(data_path, "health_regions_orig.txt"),
    sep="+",
    header=0,
    index_col=0,
).to_dict()

In [None]:
# dictionary keyed by column name, with description and mappings Names to PUMF values
var_map = {
    "GEO_PRV": {
        "description": "Province",
        "var_map": {
            "NEWFOUNDLAND AND LABRADOR": 10,
            "PRINCE EDWARD ISLAND":      11,
            "NOVA SCOTIA":               12,
            "NEW BRUNSWICK":             13,
            "QUEBEC":                    24,
            "ONTARIO":                   35,
            "MANITOBA":                  46,
            "SASKATCHEWAN":              47,
            "ALBERTA":                   48,
            "BRITISH COLUMBIA":          59,
            "YUKON":                     60,
            "NORTHWEST TERRITORIES":     61,
            "NUNAVUT":                   62,
        },
    },
    # mapping taken from the 'Code' column of the health regions file
    "GEODGHR4": {
        "description": "Health region",
        "var_map": regions_dict['Code'],
    },
    "DHH_SEX": {
        "description": "Sex",
        "var_map": {
            "Male":   1,
            "Female": 2,
        },
    },
    "DHHGMS": {
        "description": "Marital Status",
        "var_map": {
            "Married":                    1,
            "Common-law":                 2,
            "Widowed/Divorced/Separated": 3,
            "Single":                     4,
            "Not stated":                 9,
        },
    },
    "DHHGAGE": {
        "description": "Age",
        "var_map": {
            "Age between 12 and 14": 1,
            "Age between 15 and 17": 2,
            "Age between 18 and 19": 3,
            "Age between 20 and 24": 4,
            "Age between 25 and 29": 5,
            "Age between 30 and 34": 6,
            "Age between 35 and 39": 7,
            "Age between 40 and 44": 8,
            "Age between 45 and 49": 9,
            "Age between 50 and 54": 10,
            "Age between 55 and 59": 11,
            "Age between 60 and 64": 12,
            "Age between 65 and 69": 13,
            "Age between 70 and 74": 14,
            "Age between 75 and 79": 15,
            "Age 80 and older":      16,
        },
    },
    "GEN_005": {
        "description": "Perceived health",
        "var_map": {
            "Excellent":  1,
            "Very good":  2,
            "Good":       3,
            "Fair":       4,
            "Poor":       5,
            "Don’t know": 7,
            "Refusal":    8,
        },
    },
    "GEN_015": {
        "description": "Perceived mental health",
        "var_map": {
            "Excellent":  1,
            "Very good":  2,
            "Good":       3,
            "Fair":       4,
            "Poor":       5,
            "Don’t know": 7,
            "Refusal":    8,
            "Not stated": 9,
        },
    },
    "GEN_020": {
        "description": "Perceived life stress",
        "var_map": {
            "Not at all stressful":  1,
            "Not very stressful":    2,
            "A bit stressful":       3,
            "Quite a bit stressful": 4,
            "Extremely stressful":   5,
            "Don’t know":            7,
            "Refusal":               8,
        },
    },
    "GEN_025": {
        "description": "Perceived stress at work",
        "var_map": {
            "Not at all stressful":  1,
            "Not very stressful":    2,
            "A bit stressful":       3,
            "Quite a bit stressful": 4,
            "Extremely stressful":   5,
            "Valid skip":            6,
            "Don’t know":            7,
            "Refusal":               8,
            "Not stated":            9,
        },
    },
    "SMK_005": {
        "description": "Type of smoker (daily / occasionally / not at all) - presently",
        "var_map": {
            "Daily":        1,
            "Occasionally": 2,
            "Not at all":   3,
            "Don’t know":   7,
            "Refusal":      8,
        },
    },
    "SMK_015": {
        "description": "During the past 30 days, did you smoke every day?",
        "var_map": {
            "Yes":        1,
            "No":         2,
            "Valid skip": 6,
            "Don’t know": 7,
            "Not stated": 9,
        },
    },
    # NOTE(review): these labels are identical to SMK_005 and do not read like
    # answers to a "more than 100 cigarettes" question - confirm the labels
    # and codes against the PUMF documentation.
    "SMK_020": {
        "description": "Smoked more than 100 cigarettes - lifetime",
        "var_map": {
            "Daily":        1,
            "Occasionally": 2,
            "Not at all":   3,
            "Don’t know":   7,
            "Refusal":      8,
        },
    },
    "SMK_030": {
        "description": "Smoked daily - lifetime (occasional / former smoker)",
        "var_map": {
            "Yes":        1,
            "No":         2,
            "Valid skip": 6,
            "Don’t know": 7,
            "Refusal":    8,
            "Not stated": 9,
        },
    },
}

In [None]:
# extract dataframe from data_dict
df = data_dict["dataframe"]

In [None]:
df.describe()

For the extracted chunk from PUMF we have 67 columns and 113290 rows

In [None]:
df.head()

In [None]:
def make_report(data_dict):
    """Profile the CCHS dataframe with ydata_profiling.

    Every column listed in data_dict['dtypes'] whose dtype name contains
    "int" is declared as "categorical" in the type schema passed to the
    profiler; every other column is declared as "numeric".

    Parameters
    ----------
    data_dict : dict
        A modified dictionary with CCHS PUMF SMOKE dataframe, with datatypes and categorical features.
        For instance, data_dict['dtypes']['GEO_PRV'] contains datatype = int64.
        data_dict['cat']['GEO_PRV'] the possible values as in the original data.
        data_dict['dataframe']['GEO_PRV'] the dataframe for this column with ADM_RNO as index.

    Returns
    -------
    tuple
        ``(report_dict, profile)`` where ``report_dict`` is the report parsed
        from its JSON export into a python dict and ``profile`` is the
        ProfileReport object itself.
    """
    def _schema_type(dtype):
        # schema label for one column, decided from the dtype name only
        if "int" in dtype.name:
            return "categorical"
        return "numeric"

    type_schema = {}
    for column, dtype in data_dict['dtypes'].items():
        type_schema[column] = _schema_type(dtype)

    profile = ProfileReport(data_dict["dataframe"], title="Report", type_schema=type_schema)
    # export to json: the html version of the report is too heavy and the
    # browser crashes
    report_dict = json.loads(profile.to_json())
    return report_dict, profile

In [None]:
%%time
report_dict, report = make_report(data_dict)

In [None]:
report_dict["table"]

In [None]:
report_dict["alerts"]

We have some alerts on correlation and imbalance:
- Height **HWTDGHTM** is correlated with weight **HWTDGWTK**, and both are also correlated with the Body Mass Index **HWTDGBMI**
- Daily smokers and non-smokers in **SMK_005** will be correlated to **SMK_010**: *In the past 30 days, did you smoke any cigarettes?* Other correlations have similar interpretations.
- Imbalance **SMK_005** is intuitive, small percentage of the population smokes cigarettes.

The `var_map` dict contains a subset of the 67 columns, chosen based on the observations above.

In [None]:
%matplotlib inline

In [None]:
#matplotlib.use("QtAgg")

## Preliminary dataset

In [None]:
# the categorical features are on the var_map dictionary
cat_cols = list(var_map.keys())

In [None]:
cat_cols

In [None]:
# the two numerical columns are height and weight
num_cols = ["HWTDGHTM", "HWTDGWTK"]

In [None]:
# invert dictionaries to do the mapping from codes to names:
# for every column, {name: code} becomes {code: name}
col_map_i = {
    column: {code: name for name, code in spec['var_map'].items()}
    for column, spec in var_map.items()
}

In [None]:
# slice columns of interest
pre_df = df[num_cols+cat_cols].copy()

In [None]:
# let's replace values with string mappings
pre_df.replace(col_map_i, inplace=True)

In [None]:
# drop ADM index
pre_df.reset_index(drop=True, inplace=True)

In [None]:
# change index name to ID
pre_df.index.name = "ID"

In [None]:
# let's create a column transformer, which will apply OrdinalEncoder to categorical features
ct = make_column_transformer((OrdinalEncoder(dtype=int), cat_cols), ('passthrough', num_cols), verbose_feature_names_out=False)

In [None]:
# we want pandas dataframe output after transformation
ct.set_output(transform="pandas")

In [None]:
fit_pre_df = ct.fit_transform(pre_df)

In [None]:
# Plot correlation matrix
fig, ax = plt.subplots(figsize=(8, 6))
ax.grid(False)
# compute the correlation matrix once and reuse it for the data and the mask
corr_matrix = fit_pre_df.corr()
# hide the upper triangle (diagonal included)
upper_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, square=True, linewidths=0.5, cmap="PiYG", mask=upper_mask, ax=ax)

In [None]:
# todo pass the right type schema to generate the report
profile = ProfileReport(fit_pre_df, title="Report")#, type_schema=type_schema)

In [None]:
# the report can be heavy let's dump it in a dict
report_json = profile.to_json()
report_dict = json.loads(report_json)

In [None]:
report_dict["table"]

In [None]:
report_dict["alerts"]

In [None]:
# let's save the first version of the cleaned preprocessed data
cleaned_data = fit_pre_df.copy()
# os.path.join avoids a doubled separator when data_path ends with "/"
cleaned_data.to_csv(os.path.join(data_path, "cleaned_data.csv"))

In [None]:
cleaned_data

### Split by provinces

In [None]:
# let's inverse the transformation for categorical columns
inverse_cleaned_data = pd.DataFrame(ct.transformers_[0][1].inverse_transform(cleaned_data[cat_cols]), columns=cat_cols)

In [None]:
province_cleaned_data = cleaned_data.copy()

In [None]:
# get provinces codes
provinces = province_cleaned_data["GEO_PRV"].unique()
provinces

In [None]:
# get provinces names
province_names = inverse_cleaned_data["GEO_PRV"].unique()

In [None]:
province_names

In [None]:
# dump csv files by province
# `provinces` (encoded) and `province_names` (decoded) both come from unique()
# on the GEO_PRV column, so they are paired element by element
for p, pname in zip(provinces, province_names):
    print(pname, p)
    province_rows = province_cleaned_data[province_cleaned_data["GEO_PRV"] == p]
    # os.path.join avoids a doubled separator when data_path ends with "/"
    province_rows.to_csv(os.path.join(data_path, f"{pname}.csv"))

### Split by health regions

In [None]:
# create the regions dir; exist_ok=True makes this a no-op when the directory
# is already there, without a separate (race-prone) existence check
os.makedirs(os.path.join(data_path, "regions"), exist_ok=True)

In [None]:
# get health regions codes
regions = province_cleaned_data["GEODGHR4"].unique()
regions

In [None]:
# get health region names
region_names = inverse_cleaned_data["GEODGHR4"].unique()
region_names

In [None]:
# dump csv files by health regions
# characters removed from a region name before it is used as a file name
# (the backslash is escaped explicitly: "\/" is an invalid escape sequence)
forbidden_chars = "\\/:*?<>=|.,()"
for r, rname in zip(regions, region_names):
    print(rname, r)
    cleaned_name = "".join(i for i in rname if i not in forbidden_chars).replace(" ", "_").replace("’","-")
    # select the rows of this region once and write them twice:
    # one file named after the region, one named after its code
    region_rows = province_cleaned_data[province_cleaned_data["GEODGHR4"] == r]
    region_rows.to_csv(os.path.join(data_path, "regions", f"{cleaned_name}.csv"))
    region_rows.to_csv(os.path.join(data_path, "regions", f"{r}.csv"))

In [None]:
#profile

In [None]:
# Data cleaning (not the selection of columns, but any other cleaning steps)
cleaned_data = ...

In [None]:
# Stratify into groups via column groupby
stratified_data = ...

In [None]:
# Stratify into groups via custom scheme
# Specifically, specify the approximate number of clients, and clients get some number
# of samples pulled from some distribution
stratified_data = ...

Need to determine whether we want to generate the stratified datasets in a static way, then change the sample size, or vice-versa.

In [None]:
# Update the number of samples to be used
stratified_data = ...

Note that data loading should not be done during each run, but rather be pre-generated and loaded appropriately.

In [None]:
# Select the column(s) to be used within the data
stratified_data = stratified_data[...]

In [None]:
# Save dataset(s)
...

In [None]:
# Print data information
# When called, provides information relevant to the provided data
...

### 

# Queries
Code related to the statistical queries being run. This excludes any evaluations being done, only the queries themselves.
Depending on the code structure for the DP libraries used, we may need to use different function calls for mean, sum, ... The mean, sum, and frequency functions should be abstracted such that the global DP file's functions can call them appropriately.

In [None]:
# Mean
...

In [None]:
# Sum
...

In [None]:
# Frequency
...

In [None]:
# Any others to add
...

# Evaluation
Code which compares the results from queries and/or the data itself.

In [None]:
# Explore the distribution of a provided dataset
...

In [None]:
# Compare the distributions of two datasets
...

In [None]:
# Compare the difference between two queries
...

# Local Differential Privacy
Code exclusive for applying LDP on the data.

In [None]:
# Function for generating each noise type, abstracting other parts (laplace, RR, ...)
...

In [None]:
# Function injecting the noise to the provided data

# Global Differential Privacy
Code exclusive for applying GDP on the data.

In [None]:
# Differentially private queries (see Queries file section)
...

# Shuffle Differential Privacy
Code exclusive for applying SDP on the data.

In [None]:
# Function to act as the shuffler
...

# Model Tests
Code exclusive for testing DP applied to different models (wish list)

In [None]:
# Regression models
...

In [None]:
# ML models
...