In [None]:
%%capture
# install DataSynthesizer (cannot be included in conda)
!pip install DataSynthesizer

In [None]:
import pandas as pd
import numpy as np
import DataSynthesizer
import os

# Read & preprocess data

We read the data from the `CMAPSS` folder. We drop the last two columns, which contain only N/A values, and then rename the remaining columns with the names defined in "readme.txt".
We then store the data as a comma-separated-values (CSV) file instead of a space-separated text file, as DataSynthesizer works with CSV files.

In [None]:
# Load the raw FD001 training set: no header row, fields split on single spaces.
data = pd.read_csv('CMAPSS/train_FD001.txt', sep=" ", header=None)

# The two trailing columns hold only N/A values, so discard them.
data = data.drop(columns=data.columns[-2:])

# Column names as given in readme.txt: unit number, time cycle,
# three operational settings and 21 sensor readings.
col_names = ["unit-nr", "timecycle"]
col_names += [f"ops-set{k}" for k in range(1, 4)]
col_names += [f"sens-{k}" for k in range(1, 22)]
data.columns = col_names

# Persist as CSV (without the index) for DataSynthesizer to read.
data.to_csv('CMAPSS/train_FD001_pre.csv', index=False)

# Row count of the real data; reused as the number of synthetic rows to generate.
data_length = data.shape[0]

# display data
data

## Create synthetic data

In [None]:
"""
Creating the synthetic data using the Git page from DataSynthesizer
https://github.com/DataResponsibly/DataSynthesizer/blob/master/notebooks/DataSynthesizer__correlated_attribute_mode.ipynb

NOTE: Create the description file first (e.g. with the terminal command `touch`)
before running this code; the code does not create the .json file itself.
"""

from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network

def create_data_adult(data_length,
                      input_data='CMAPSS/train_FD001_pre.csv',
                      description_file='./CMAPSS/Synthetic/description_FD001.json',
                      synthetic_data='./CMAPSS/Synthetic/synthetic_data_FD001.csv',
                      threshold_value=42,
                      epsilon=0,
                      degree_of_bayesian_network=2):
    """Describe a CSV dataset and generate a synthetic version of it.

    Uses DataSynthesizer's correlated attribute mode: a dataset description
    is built from ``input_data`` and saved to ``description_file``, then
    ``data_length`` rows are generated from that description and saved to
    ``synthetic_data``. Despite the function name, the defaults point at the
    preprocessed CMAPSS FD001 files.

    Parameters
    ----------
    data_length : int
        Number of tuples to generate in the synthetic dataset.
    input_data : str
        Path of the input CSV dataset.
    description_file : str
        Path where the dataset description (.json) is saved.
    synthetic_data : str
        Path where the synthetic dataset (.csv) is saved.
    threshold_value : int
        An attribute is categorical if its domain size is less than this
        threshold.
    epsilon : float
        Differential-privacy parameter. It roughly means that removing a row
        in the input dataset will not change the probability of getting the
        same output by more than a multiplicative difference of exp(epsilon).
        Increase it to reduce the injected noise; 0 turns differential
        privacy off.
    degree_of_bayesian_network : int
        Maximum number of parents (incoming edges) of a node in the Bayesian
        network.

    Returns
    -------
    None
        The results are the two files written to disk.
    """
    # Make sure the output folders exist so both files can be written even on
    # a fresh checkout where e.g. ./CMAPSS/Synthetic has not been created yet.
    for out_path in (description_file, synthetic_data):
        out_dir = os.path.dirname(out_path)
        if out_dir:  # a bare file name has no directory part to create
            os.makedirs(out_dir, exist_ok=True)

    # Build and store the dataset description.
    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data,
                                                            epsilon=epsilon,
                                                            k=degree_of_bayesian_network)
    describer.save_dataset_description_to_file(description_file)

    # Generate the synthetic dataset from the stored description.
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(data_length, description_file)
    generator.save_synthetic_data(synthetic_data)

# Generate as many synthetic rows as there are rows in the real dataset.
create_data_adult(data_length=data_length)

# Read & output synthetic data

In [None]:
# Load the generated synthetic dataset back from disk and show it.
syn_data = pd.read_csv(filepath_or_buffer='./CMAPSS/Synthetic/synthetic_data_FD001.csv')
syn_data

In [None]:
# Show the original (real) data again, for comparison with the synthetic data.
data