# DataSynthesizer

This notebook is used to create synthetic data with DataSynthesizer. It first reads and preprocesses the CMAPSS data, then generates synthetic data, which is subsequently analysed and compared to the original data. This notebook is used for the results section on DataSynthesizer.

In [3]:
%%capture
# install DataSynthesizer (cannot be included in conda)
!pip install DataSynthesizer

In [4]:
import pandas as pd
import numpy as np
import DataSynthesizer
import os

# Read & preprocess data

We read the data from the CMAPSS folder. We remove the last two columns, as these contain only N/A values, and then rename the columns with their respective names as defined in "readme.txt".
We then store the data as a comma-separated-value (csv) file instead of a space-separated text file, as DataSynthesizer works with csv files.

In [8]:
# Load the raw FD001 training set: a space-separated text file without a header row.
data = pd.read_csv('CMAPSS/train_FD001.txt', sep=" ", header=None)

# The final two columns hold only N/A values, so keep everything except those.
data = data.iloc[:, :-2]

# Column names according to readme.txt: unit number, time cycle, the three
# operational settings and then sensors 1 through 21.
col_names = (
    ["unit-nr", "timecycle", "ops-set1", "ops-set2", "ops-set3"]
    + [f"sens-{sensor_nr}" for sensor_nr in range(1, 22)]
)
data.columns = col_names

# Persist the preprocessed table as csv (without the index) for DataSynthesizer.
data.to_csv('CMAPSS/train_FD001_pre.csv', index=False)

# Number of rows; used later as the number of synthetic tuples to generate.
data_length = data.shape[0]

# display data
data

Unnamed: 0,unit-nr,timecycle,ops-set1,ops-set2,ops-set3,sens-1,sens-2,sens-3,sens-4,sens-5,...,sens-12,sens-13,sens-14,sens-15,sens-16,sens-17,sens-18,sens-19,sens-20,sens-21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,519.49,2388.26,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,519.68,2388.22,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640


## Create synthetic data

In [4]:
"""
Creating the synthetic data using the Git page from DataSynthesizer
https://github.com/DataResponsibly/DataSynthesizer/blob/master/notebooks/DataSynthesizer__correlated_attribute_mode.ipynb

NOTE: First create the description file (e.g. via terminal command touch)
before running the code, does not write the .json file itself.
"""

from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network

def create_data(data_length):
    """Describe the preprocessed FD001 data and generate a synthetic copy of it.

    DataSynthesizer is run in correlated attribute mode: a dataset description is
    built from 'CMAPSS/train_FD001_pre.csv' and saved as json, after which
    ``data_length`` synthetic tuples are generated from that description and saved
    as csv. Both output files are written to './CMAPSS/Synthetic'.

    Parameters
    ----------
    data_length : int
        Number of tuples to generate in the synthetic dataset.
    """
    # input dataset
    input_data = 'CMAPSS/train_FD001_pre.csv'

    # location of two output files
    output_dir = './CMAPSS/Synthetic'
    description_file = f'{output_dir}/description_FD001.json'
    synthetic_data = f'{output_dir}/DataSyn_FD001.csv'

    # Make sure the output folder exists so both files can be written on a
    # fresh checkout without any manual preparation.
    os.makedirs(output_dir, exist_ok=True)

    # An attribute is categorical if its domain size is less than this threshold.
    threshold_value = 42

    # A parameter in Differential Privacy. It roughly means that removing a row in the input dataset will not
    # change the probability of getting the same output more than a multiplicative difference of exp(epsilon).
    # Increase epsilon value to reduce the injected noises. Set epsilon=0 to turn off differential privacy.
    epsilon = 0

    # The maximum number of parents in Bayesian network, i.e., the maximum number of incoming edges.
    degree_of_bayesian_network = 2

    # Number of tuples generated in synthetic dataset.
    num_tuples_to_generate = data_length

    # Build the dataset description and store it as json.
    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data,
                                                            epsilon=epsilon,
                                                            k=degree_of_bayesian_network)
    describer.save_dataset_description_to_file(description_file)

    # Generate the synthetic data set from the stored description and save it as csv.
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file)
    generator.save_synthetic_data(synthetic_data)

create_data(data_length)

Adding ROOT sens-8
Adding attribute sens-13
Adding attribute sens-12
Adding attribute sens-11
Adding attribute sens-4
Adding attribute sens-7
Adding attribute sens-15
Adding attribute sens-21
Adding attribute sens-20
Adding attribute sens-2
Adding attribute sens-17
Adding attribute sens-3
Adding attribute timecycle
Adding attribute sens-14
Adding attribute sens-9
Adding attribute unit-nr
Adding attribute ops-set1
Adding attribute ops-set2
Adding attribute sens-6
Adding attribute ops-set3
Adding attribute sens-1
Adding attribute sens-5
Adding attribute sens-10
Adding attribute sens-16
Adding attribute sens-18
Adding attribute sens-19


  for parents_instance, stats_sub in stats.groupby(parents):


# Read & output synthetic data

In [5]:
# Load the generated synthetic dataset and preview its first ten rows.
syn_data = pd.read_csv('./CMAPSS/Synthetic/DataSyn_FD001.csv')
syn_data.head(10)

Unnamed: 0,unit-nr,timecycle,ops-set1,ops-set2,ops-set3,sens-1,sens-2,sens-3,sens-4,sens-5,...,sens-12,sens-13,sens-14,sens-15,sens-16,sens-17,sens-18,sens-19,sens-20,sens-21
0,60.0,197.0,0.004186,0.0003,100.0,518.67,643.09913,1588.139155,1411.97443,14.62,...,521.114645,2388.088113,8156.34324,8.48211,0.03,395,2388,100.0,38.791177,23.22971
1,99.0,74.0,0.000252,0.0005,100.0,518.67,643.411595,1588.761515,1410.289857,14.62,...,521.521348,2388.118976,8131.198048,8.428558,0.03,394,2388,100.0,39.10326,23.348387
2,48.0,202.0,-0.000377,0.0,100.0,518.67,642.525685,1586.537478,1407.430623,14.62,...,521.328054,2388.055948,8146.537609,8.414437,0.03,391,2388,100.0,38.894074,23.297967
3,57.0,34.0,-4.4e-05,0.0002,100.0,518.67,642.925005,1589.771001,1400.055742,14.62,...,521.649525,2388.102871,8136.752208,8.437895,0.03,393,2388,100.0,38.96741,23.255113
4,83.0,158.0,-8.9e-05,0.0006,100.0,518.67,642.844252,1587.759665,1405.832447,14.62,...,521.956721,2388.139336,8160.909896,8.422177,0.03,393,2388,100.0,38.887428,23.323196
5,3.0,84.0,-0.001209,0.0002,100.0,518.67,642.570253,1590.943306,1410.928967,14.62,...,521.646106,2388.093062,8138.460336,8.413403,0.03,394,2388,100.0,38.811338,23.241986
6,5.0,121.0,-0.003726,0.0005,100.0,518.67,642.608293,1584.472653,1410.157601,14.62,...,521.714837,2388.092668,8134.53929,8.460024,0.03,393,2388,100.0,38.910923,23.194549
7,32.0,184.0,0.001405,-0.0004,100.0,518.67,642.582289,1592.390703,1415.362561,14.62,...,520.480452,2388.17755,8135.166908,8.438584,0.03,392,2388,100.0,38.595596,23.141877
8,92.0,113.0,-0.002634,-0.0003,100.0,518.67,644.17599,1601.508029,1421.989821,14.62,...,520.009734,2388.307165,8122.924902,8.475136,0.03,396,2388,100.0,38.578765,23.078557
9,4.0,58.0,-0.001524,-0.0004,100.0,518.67,642.369954,1592.427032,1401.061698,14.62,...,521.913086,2388.024091,8137.944131,8.450982,0.03,394,2388,100.0,39.036018,23.329811


What we see directly is that the synthetic data has a different ordering for the unit numbers and time cycles. Whereas these are sequential in the original data, the synthetic data simply samples them from a distribution. For ordinary values this would be acceptable, as the mean and standard deviation will probably be similar. In this case, however, the unit numbers and time cycles are relevant and should be sequential as well.

In [7]:
# First ten rows of the original data, for comparison with the synthetic preview.
data.head(10)

Unnamed: 0,unit-nr,timecycle,ops-set1,ops-set2,ops-set3,sens-1,sens-2,sens-3,sens-4,sens-5,...,sens-12,sens-13,sens-14,sens-15,sens-16,sens-17,sens-18,sens-19,sens-20,sens-21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044
5,1,6,-0.0043,-0.0001,100.0,518.67,642.1,1584.47,1398.37,14.62,...,521.68,2388.03,8132.85,8.4108,0.03,391,2388,100.0,38.98,23.3669
6,1,7,0.001,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,...,522.32,2388.03,8132.32,8.3974,0.03,392,2388,100.0,39.1,23.3774
7,1,8,-0.0034,0.0003,100.0,518.67,642.56,1582.96,1400.97,14.62,...,522.47,2388.03,8131.07,8.4076,0.03,391,2388,100.0,38.97,23.3106
8,1,9,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.8,14.62,...,521.79,2388.05,8125.69,8.3728,0.03,392,2388,100.0,39.05,23.4066
9,1,10,-0.0033,0.0001,100.0,518.67,641.71,1591.24,1400.46,14.62,...,521.79,2388.06,8129.38,8.4286,0.03,393,2388,100.0,38.95,23.4694


# Compare statistics

In [25]:
def _rows_of_unit(frame, unit_nr):
    """Return the rows of ``frame`` whose 'unit-nr' column equals ``unit_nr``."""
    return frame[frame['unit-nr'] == unit_nr]


# Unit numbers 1 to 5 are compared in both passes below.
units_to_compare = range(1, 6)

# First pass: number of rows per unit in the original and the synthetic data.
for i in units_to_compare:
    df_syn = _rows_of_unit(syn_data, i)
    df_cmapps = _rows_of_unit(data, i)

    print(f"Number of data points for unit-nr {i} for CMAPPS: {len(df_cmapps)}. For synthetic data: {len(df_syn)}")

print("\nThe number of unique values per unit number:")

# Second pass: number of rows per unit after keeping only the first occurrence
# of every 'timecycle' value.
for i in units_to_compare:
    df_syn = _rows_of_unit(syn_data, i).drop_duplicates(subset='timecycle')
    df_cmapps = _rows_of_unit(data, i).drop_duplicates(subset='timecycle')

    print(f"Number of unique values in timestamp for unit-nr {i} for CMAPPS: {len(df_cmapps)}. For synthetic data: {len(df_syn)}")

# Show the de-duplicated synthetic rows of the last unit handled above (unit 5).
df_syn

Number of data points for unit-nr 1 for CMAPPS: 192. For synthetic data: 116
Number of data points for unit-nr 2 for CMAPPS: 287. For synthetic data: 238
Number of data points for unit-nr 3 for CMAPPS: 179. For synthetic data: 232
Number of data points for unit-nr 4 for CMAPPS: 189. For synthetic data: 241
Number of data points for unit-nr 5 for CMAPPS: 269. For synthetic data: 205

The number of unique values per unit number:
Number of unique values in timestamp for unit-nr 1 for CMAPPS: 192. For synthetic data: 91
Number of unique values in timestamp for unit-nr 2 for CMAPPS: 287. For synthetic data: 159
Number of unique values in timestamp for unit-nr 3 for CMAPPS: 179. For synthetic data: 157
Number of unique values in timestamp for unit-nr 4 for CMAPPS: 189. For synthetic data: 146
Number of unique values in timestamp for unit-nr 5 for CMAPPS: 269. For synthetic data: 145


Unnamed: 0,unit-nr,timecycle,ops-set1,ops-set2,ops-set3,sens-1,sens-2,sens-3,sens-4,sens-5,...,sens-12,sens-13,sens-14,sens-15,sens-16,sens-17,sens-18,sens-19,sens-20,sens-21
6,5.0,121.0,-0.003726,0.0005,100.0,518.67,642.608293,1584.472653,1410.157601,14.62,...,521.714837,2388.092668,8134.539290,8.460024,0.03,393,2388,100.0,38.910923,23.194549
60,5.0,51.0,0.000534,0.0004,100.0,518.67,642.804950,1584.517400,1398.882752,14.62,...,522.628072,2388.008953,8146.704608,8.441217,0.03,393,2388,100.0,38.843174,23.452462
103,5.0,190.0,-0.001541,-0.0003,100.0,518.67,643.094424,1595.773301,1416.269606,14.62,...,520.691903,2388.208674,8112.107801,8.483249,0.03,395,2388,100.0,38.593104,23.300577
259,5.0,214.0,-0.003747,0.0000,100.0,518.67,642.374442,1587.498796,1407.245164,14.62,...,521.641386,2388.074402,8148.206355,8.457052,0.03,391,2388,100.0,39.006195,23.343835
290,5.0,30.0,-0.000093,-0.0002,100.0,518.67,643.216141,1583.357374,1407.274110,14.62,...,522.251130,2388.043378,8121.370538,8.366364,0.03,391,2388,100.0,38.964154,23.348597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18800,5.0,254.0,-0.001541,0.0001,100.0,518.67,642.776412,1596.896066,1420.494502,14.62,...,520.624601,2388.118890,8150.416685,8.455008,0.03,394,2388,100.0,38.451399,23.199266
19113,5.0,14.0,-0.000049,0.0002,100.0,518.67,642.715261,1581.784733,1400.723537,14.62,...,521.854256,2388.097121,8156.488344,8.418950,0.03,392,2388,100.0,38.946986,23.443250
19183,5.0,73.0,0.001702,-0.0002,100.0,518.67,642.764527,1593.121119,1413.627676,14.62,...,521.687571,2388.057698,8151.910038,8.452127,0.03,394,2388,100.0,38.972032,23.291394
19476,5.0,201.0,0.000178,0.0001,100.0,518.67,643.035385,1590.033280,1409.134482,14.62,...,520.402033,2388.175195,8144.959776,8.465902,0.03,396,2388,100.0,38.691017,23.147499


# Export exploratory data analysis to csv files

In [23]:
# Descriptive statistics of the synthetic data, as computed by DataFrame.describe().
syn_description = syn_data.describe()

# Export the transposed table, with floats written using five decimals.
syn_description.T.to_csv("./CMAPSS/Synthetic/synthetic_data_FD001_description.csv", float_format="{:.5f}".format)

# Only the last expression of a cell is rendered, so the table is shown here
# instead of in the middle of the cell, where it had no effect.
syn_description

In [25]:
# Descriptive statistics of the original data, as computed by DataFrame.describe().
data_description = data.describe()

# Export the statistics of the original data (not syn_description), so that this
# file can be compared against the exported synthetic description. Floats are
# written using five decimals.
data_description.T.to_csv("./CMAPSS/CMAPPS_FD001_description.csv", float_format="{:.5f}".format)

# Render the table as the output of the cell.
data_description