In [44]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from Soroosh_utilities import *
import tensorflow_probability as tfp
from Soroosh_feature_importance import *
from sklearn.mixture import GaussianMixture


## Synthetic data generator

To synthetically generate the data, we propose the following algorithm.

    

1. Remove statistic features added by Andrey (mean, std, etc) **(Since the original features excluded I could not implement this step)**
2. Decompose the feature space into 1) a set of dependent features ${X_D}$(those features in which their correlation is larger than a threshold, say 0.9); 2) Independent features ${X}_{I}$.
3. Fit a Gaussian Mixture Model (GMM) to estimate the independent data set parameters. (This plays the role of the prior distribution or noise which will be added to newly generated data later)
4. Train a stochastic regressor or Conditional GAN (C-GAN) algorithm using the set of data points with dependent features as the input data and the Ic_norm as target values. 
5. Once the parameters of step (2) and (3) are determined: we can generate synthetic data $\tilde{X} =  \tilde{X_D} \sim CGAN + \tilde{X_I} \sim GMM$

### Load real-world data

In [2]:

project = "SuperOX"
data_type = "real-statistics"
run_id = "GMM & CGAN defualt parameters - real " + data_type

algorithm = "A combined method for generating synthetic data"

In [3]:
# wandb initilization        
run = init_a_wandb(name=run_id, 
                   project=project,
                   notes="Synthetic Data Generator for SuperOX " + data_type, 
                   group=algorithm
                  )

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msorooshi[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.27 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [4]:
pld_complete_range = pd.read_csv("/home/soroosh/Desktop/SearchOX/data/pld_complete_range.csv",
                                 index_col=False)

In [5]:
pld_complete_range


Unnamed: 0,median_Voltage_HSR_V_1025,median_Voltage_HSR_V_1027,median_Voltage_HSR_V_1030,median_Voltage_HSL_V_1025,median_Voltage_HSL_V_1027,median_Voltage_HSL_V_1030,median_Voltage_HF_V_1025,median_Voltage_HF_V_1027,median_Voltage_HF_V_1030,median_Voltage_HC_V_1025,...,std_Sigma_1030,pos,Speed,X FWHM,Y FWHM,R FWHM,Coolness,Coolness_neg,Ic,Ic_norm
0,0.129479,0.177414,0.005142,0.695620,-0.256257,-0.015406,-0.009215,-0.072807,-0.021102,0.057123,...,-0.390667,-0.501695,0.009841,0.039589,0.067470,0.015697,0.053006,-0.086369,496.2,1.767913
1,0.110059,0.340245,0.005142,0.695620,-0.321465,-0.015406,-0.009215,-0.173901,-0.021102,0.050178,...,-0.390667,-0.501652,-0.005266,0.039589,0.067470,0.015697,0.053006,-0.086369,494.7,1.762568
2,0.119769,0.340245,0.005142,0.695620,-0.321465,-0.015406,-0.007363,-0.173901,-0.021102,0.057123,...,-0.390667,-0.501646,-0.007197,0.039589,0.067470,0.015697,0.053006,-0.086369,494.2,1.760787
3,0.110059,0.348139,0.005142,0.695620,-0.322354,-0.015406,-0.009215,-0.218270,-0.021102,0.057123,...,-0.390667,-0.501635,-0.010946,0.039589,0.067470,0.015697,0.053006,-0.086369,495.5,1.765419
4,0.110059,0.264593,0.005142,0.695620,-0.305820,-0.015406,-0.009215,-0.129533,-0.021102,0.057123,...,-0.390667,-0.501619,-0.017147,0.042682,0.069442,0.017956,0.051965,-0.084836,497.9,1.773168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18556,0.478987,-0.380360,-0.011413,0.273382,0.196202,-0.004220,0.913007,0.385112,0.010373,0.029341,...,-0.390667,0.498111,0.050599,-0.140230,-0.049842,-0.146123,0.152811,-0.269053,457.5,1.734692
18557,0.478987,-0.380360,-0.011413,0.273382,0.196202,-0.004220,0.913007,0.385112,0.010373,0.029341,...,-0.390667,0.498122,0.048394,-0.141978,-0.052491,-0.147215,0.151327,-0.266555,456.2,1.728087
18558,0.478987,-0.373412,-0.010984,0.273382,0.193093,-0.005135,0.913007,0.399703,0.014313,0.029341,...,-0.390667,0.498160,0.051788,-0.151654,-0.056603,-0.147215,0.145896,-0.257753,453.1,1.710537
18559,0.508117,-0.372889,-0.009563,0.262269,0.191020,-0.005057,0.909308,0.392447,0.014700,0.064050,...,-0.390667,0.498192,0.054743,-0.160080,-0.060184,-0.147215,0.141168,-0.250088,450.9,1.697276


In [6]:
pos_idx = pld_complete_range.columns.get_loc("pos")
ic_idx = pld_complete_range.columns.get_loc("Ic")
ic_norm_idx = pld_complete_range.columns.get_loc("Ic_norm")

pos_idx, ic_idx, ic_norm_idx

(370, 377, 378)

#### step 2: decomposing  data into dependent and independent sets

In [7]:
correlation_matrix = pld_complete_range.corr()

In [8]:
thr = .6
correlation_resutls = {"With Ic": [], "With Ic_Norm":[], "With Pos": []}
correlation_features_name = {"With Ic": [], "With Ic_Norm":[], "With Pos": []}

In [9]:
for i in range(correlation_matrix.shape[0]):
    
    if np.abs(correlation_matrix.iloc[i, ic_idx]) >= thr and i != ic_idx:
        correlation_resutls["With Ic"].append((correlation_matrix.columns[i], 
                                               correlation_matrix.iloc[i, ic_idx])
                                             )
        correlation_features_name["With Ic"].append(correlation_matrix.columns[i])
        
    
    if np.abs(correlation_matrix.iloc[i, ic_norm_idx]) >= thr and i != ic_norm_idx:
            correlation_resutls["With Ic_Norm"].append((correlation_matrix.columns[i],
                                                        correlation_matrix.iloc[i, ic_norm_idx])
                                                      )
            correlation_features_name["With Ic_Norm"].append(correlation_matrix.columns[i])


            
    if np.abs(correlation_matrix.iloc[i, pos_idx]) >= thr and i != pos_idx:
        correlation_resutls["With Pos"].append((correlation_matrix.columns[i],
                                                correlation_matrix.iloc[i, pos_idx])
                                              )
        correlation_features_name["With Pos"].append(correlation_matrix.columns[i])



In [10]:
rows = ["With Ic", "With Ic_Norm", "With Pos"]
columns = []
corr_results = pd.DataFrame.from_dict(data=correlation_resutls, orient='index')

In [11]:
corr_results

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,103,104,105,106,107,108,109,110,111,112
With Ic,"(R FWHM, 0.6265661742488199)","(Ic_norm, 0.7246221938789573)",,,,,,,,,...,,,,,,,,,,
With Ic_Norm,"(median_Voltage_HSL_V_1027, -0.6257993710069113)","(median_Temp_HSL_C_1025, 0.6588738008082772)","(median_Temp_HF_C_1025, -0.6948493686250121)","(median_Power_HSL_W_1027, -0.624955218650732)","(median_Power_HC_W_1025, 0.6321551573870462)","(median_Current_HSR_A_1025, -0.6069783972212103)","(median_Right_Clatch_prc_1025, 0.6754791451497...","(mean_Voltage_HSL_V_1027, -0.6378317780422785)","(mean_Voltage_HC_V_1025, 0.6027429652269497)","(mean_Temp_HSL_C_1025, 0.659003919504415)",...,,,,,,,,,,
With Pos,"(median_Voltage_HSR_V_1027, -0.7413642044786511)","(median_Voltage_HSL_V_1027, 0.9033652226290593)","(median_Voltage_HF_V_1027, 0.7136514314378173)","(median_Voltage_HC_V_1025, -0.8030834544887947)","(median_Voltage_HC_V_1027, -0.7306137537557599)","(median_Voltage_HB_V_1025, -0.7075372731749584)","(median_Temp_HSL_C_1025, -0.9248021381688128)","(median_Temp_HSL_C_1027, -0.68747683116201)","(median_Temp_HF_C_1025, 0.9769295928246139)","(median_Temp_HC_C_1027, -0.6968243558598846)",...,"(std_Speed_m_h_1030, 0.7297876312145513)","(std_Right_Tension_n_1027, -0.8016200820518362)","(std_Right_Clatch_prc_1027, -0.6196557795436284)","(std_Left_Tension_n_1030, 0.6117060594721982)","(std_Left_Clatch_prc_1027, -0.630802066829034)","(std_HV_1025, 0.7421547533376962)","(X FWHM, -0.8672705352844962)","(Y FWHM, -0.8948050248176074)","(R FWHM, -0.8230285246416147)","(Ic_norm, -0.6923768236297578)"


In [12]:
features_name = pld_complete_range.columns.to_list()

In [13]:
# features_name

In [14]:
stats = ["median", "mad", "mean", "std", ]

- Note: Except for 'pos', 'Speed', 'X FWHM', 'Y FWHM', 'R FWHM', 'Coolness', 'Coolness_neg' Andrey Sapranov has replaced the original features with those statistics. Thus for now we can only generate these statistics.

In [15]:
dependent_features = correlation_features_name["With Ic_Norm"]
independent_features = list(set(pld_complete_range.columns) - set(dependent_features))


In [16]:
len(independent_features) + len(dependent_features) == pld_complete_range.shape[1]

True

In [17]:
independent_features.remove("Ic_norm")
len(independent_features)

358

In [18]:
x_i = pld_complete_range.loc[:, independent_features]
x_i.head()

Unnamed: 0,std_Voltage_HSL_V_1027,mean_Voltage_HB_V_1027,std_Temp_HC_C_1027,std_Voltage_HB_V_1025,std_Power_HF_W_1025,std_TubeTemp_1025,mean_Pressure_1025,std_Egy_1027,mad_Power_HB_W_1025,median_Power_HSR_W_1025,...,mean_O2_02_sccm_1025,mean_TubeTemp_1030,mean_Voltage_HSR_V_1030,median_Power_HSR_W_1030,std_Power_HF_W_1030,mad_Current_HC_A_1025,median_Temp_HB_C_1027,mad_Current_HSL_A_1030,std_Speed_m_h_1027,median_Left_Tension_n_1030
0,0.008875,-0.018468,-0.067998,-0.027973,-0.031396,-0.331198,0.025859,-0.000666,-0.176163,0.044424,...,0.029381,-0.2,0.104463,-0.004163,-0.22509,-0.05987,0.090533,-0.001955,0.146216,-0.018362
1,0.00394,-0.020781,-0.075852,-0.022261,-0.031209,-0.331198,0.033275,-0.000666,-0.059636,0.043648,...,0.007485,-0.2,0.104463,-0.004163,-0.22509,-0.109773,0.154923,-0.001955,0.15395,-0.018362
2,0.002389,-0.014089,-0.074856,-0.016988,-0.028958,-0.331198,0.024406,-0.000666,0.039925,0.033405,...,0.03703,-0.2,0.104463,-0.004163,-0.22509,-0.093207,0.154923,-0.001955,0.165554,-0.018362
3,0.009048,-0.011557,-0.074316,-0.021198,-0.02783,-0.331198,0.022837,-0.000666,0.056892,0.042873,...,0.02875,-0.2,0.104463,-0.004163,-0.22509,-0.126441,0.16152,-0.001955,0.176823,-0.018362
4,-0.003156,-0.01361,-0.075367,-0.02362,-0.030019,-0.331198,0.019568,-0.000666,0.056892,0.044424,...,0.044684,-0.2,0.104463,-0.004163,-0.22509,-0.093207,0.134989,-0.001955,0.155395,-0.018362


In [19]:
x_d = pld_complete_range.loc[:, dependent_features]
x_d.head()

Unnamed: 0,median_Voltage_HSL_V_1027,median_Temp_HSL_C_1025,median_Temp_HF_C_1025,median_Power_HSL_W_1027,median_Power_HC_W_1025,median_Current_HSR_A_1025,median_Right_Clatch_prc_1025,mean_Voltage_HSL_V_1027,mean_Voltage_HC_V_1025,mean_Temp_HSL_C_1025,mean_Temp_HF_C_1025,mean_Power_HSL_W_1027,mean_Power_HC_W_1025,mean_Current_HSR_A_1025,mean_Right_Clatch_prc_1025,mean_HV_1025,pos,X FWHM,Y FWHM,Ic
0,-0.256257,0.530118,-0.338464,-0.268036,0.308059,-0.047888,0.373487,-0.306729,0.055449,0.569428,-0.274883,-0.302971,0.33451,-0.041601,0.367443,-0.31256,-0.501695,0.039589,0.06747,496.2
1,-0.321465,0.530118,-0.338464,-0.317925,0.291699,-0.047888,0.373487,-0.325816,0.039791,0.566937,-0.279634,-0.321841,0.311391,-0.049355,0.369593,-0.31256,-0.501652,0.039589,0.06747,494.7
2,-0.321465,0.565838,-0.338464,-0.317925,0.30009,-0.047888,0.373487,-0.324621,0.052392,0.572111,-0.279874,-0.320622,0.326897,-0.054141,0.368418,-0.31256,-0.501646,0.039589,0.06747,494.2
3,-0.322354,0.530118,-0.338464,-0.321034,0.29212,-0.047888,0.373487,-0.327209,0.049092,0.572317,-0.281761,-0.323207,0.322001,-0.05345,0.369471,-0.31256,-0.501635,0.039589,0.06747,495.5
4,-0.30582,0.530118,-0.338464,-0.314816,0.308059,-0.012797,0.373487,-0.318481,0.056544,0.566939,-0.281372,-0.315437,0.332101,-0.044377,0.372196,-0.31256,-0.501619,0.042682,0.069442,497.9


In [20]:
y = pld_complete_range.loc[:, "Ic_norm"]
y.head()

0    1.767913
1    1.762568
2    1.760787
3    1.765419
4    1.773168
Name: Ic_norm, dtype: float64

### Splitting data to train and test for evaluation of generated results

In [21]:
np.random.seed(42)
n_samples = pld_complete_range.shape[0]
n_train_samples = int(.7*n_samples)
all_indices = np.arange(n_samples).tolist()
train_indices = np.random.choice(all_indices, n_train_samples, replace=False).tolist()
test_indices = list(set(all_indices) - set(train_indices))

In [22]:
len(set(train_indices)) == len(train_indices)

True

In [23]:
len(set(test_indices)) == len(test_indices)

True

In [24]:
x_d_train = x_d.iloc[train_indices]
x_i_train = x_i.iloc[train_indices]
y_train = pd.DataFrame(y.iloc[train_indices])
x_d_train.shape, x_i_train.shape, y_train.shape

((12992, 20), (12992, 358), (12992, 1))

In [25]:
x_d_test = x_d.iloc[test_indices]
x_i_test = x_i.iloc[test_indices]
y_test = pd.DataFrame(y.iloc[test_indices])

x_d_test.shape, x_i_test.shape, y_test.shape

((5569, 20), (5569, 358), (5569, 1))

In [26]:
print(x_i_test.shape[0] + x_i_train.shape[0] == x_i.shape[0], 
      x_d_test.shape[0] + x_d_train.shape[0] == x_d.shape[0])

True True


### step 3:  GMM and C-GAN

#### fitting GMM:

In [27]:
gmm = GaussianMixture(n_components=1, covariance_type='full')
gmm.fit(X=x_i_train, y=y_train)

GaussianMixture()

In [28]:
x_i_new = gmm.sample(int(1.*x_i_test.shape[0]))


In [36]:
x_i_new[0].shape, x_i_new[1].shape, 

((5569, 358), (5569,))

In [37]:
x_i_hat = pd.DataFrame(x_i_new[0])
y_i_hat = pd.DataFrame(x_i_new[1])

In [58]:
def correlation_metric(data1, data2):
    
    if data1.shape[1] != data2.shape[1]:
        print("Matrices should have equal number of features")
        return None

    n_features = data1.shape[1]
    features_correlation = np.zeros([n_features])
    
    for i in range(n_features):
        features_correlation[i] = data1.iloc[:, i].corr(data2.iloc[:, i])
    
    return features_correlation

In [59]:
x_i_hat_correlations = correlation_metric(x_i_test, x_i_hat)

In [87]:
pos_well_generated = []
neg_well_generated = []
pos_bad_generated = []
neg_bad_generated = []

thr = .7

for i in range(x_i_hat_correlations.shape[0]):
    cor_val = x_i_hat_correlations[i]
    if cor_val >= 0:
        if cor_val >= thr:
            
            pos_well_generated.append(independent_features[i])
            
        else:
            
            pos_bad_generated.append(independent_features[i])
    else:
        
        if cor_val <= thr:
            
            neg_well_generated.append(independent_features[i])
        
        else:
            
        
        
            
        
        
print(len(not_well_generated))

357


- As we can see, by using the default values of the GMM regressor, it properly learns the distribution of independent features.

#### training conditional GAN

https://github.com/Diyago/GAN-for-tabular-data

https://towardsdatascience.com/review-of-gans-for-tabular-data-a30a2199342

In [None]:
from tabgan.sampler import OriginalGenerator, GANGenerator



In [None]:
x_d_gan, y_gan = OriginalGenerator().generate_data_pipe(x_d_train, y_train, x_d_test, )


In [None]:
x_d_gan



In [None]:
x_d_cgan, y_cgan = GANGenerator().generate_data_pipe(x_d_train, y_train, x_d_test, )



In [None]:
x_d_cgan



In [None]:
x_d

In [None]:
tmp1 = correlation_metric(x_d, x_d)
tmp2 = x_d.corr().values

In [None]:
cmp = []
for i in range(tmp1.shape[0]):
    for j in range(tmp1.shape[1]):
        cmp.append("%6.f" % tmp1[i, j] == "%6.f" % tmp2[i, j])
        
set(cmp)