In [None]:
# Free-text label for this run; it is not referenced anywhere else in the visible notebook.
proj = 'Tratamento A'

# 01 - Imports

## 01.A - Installing Required Packages

## 01.B - Loading Packages

In [None]:
import warnings

warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA

from scipy.stats import ks_2samp,wasserstein_distance

import pickle
import numpy as np
import pandas as pd
import time
from functools import partial

import matplotlib.pyplot as plt
import seaborn as sns
from plotly import tools
import plotly.graph_objects as go

import tensorflow as tf
tf.compat.v1.disable_eager_execution()
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras import applications
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from tensorflow.keras import regularizers
from tensorflow.keras.losses import mse, binary_crossentropy
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Flatten, Conv1D

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

## 01.C - Custom Classes and Functions

In [None]:
# Custom Classes and Functions
from classes import xplor, compare_metric
from ctabgan import CTABGAN

In [None]:
def uni_plot_1d(df_, title):
    """Plot the univariate distribution of every column of ``df_`` on a 4-wide grid.

    Object / categorical columns are drawn as a bar chart of the percentage share
    of each category (rounded to one decimal). Every other column is drawn as a
    20-bin histogram.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Data whose columns are plotted, one panel per column, in column order.
    title : str
        Text used as the figure-level title.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    plot_c = 4
    n_cols = df_.shape[1]
    # At least one row so that an empty frame still yields a valid (blank) figure.
    plot_r = max(1, int(np.ceil(n_cols / plot_c)))

    # squeeze=False keeps the axes array 2-D even when there is a single row;
    # without it a frame with <= 4 columns returns a 1-D array and 2-D indexing fails.
    fig, ax = plt.subplots(plot_r, plot_c, figsize=(4 * plot_c, 3 * plot_r), squeeze=False)
    panels = ax.ravel()  # row-major order: left to right, then top to bottom

    for panel, column in zip(panels, df_.columns):
        if (df_[column].dtype == 'object') | pd.CategoricalDtype.is_dtype(df_[column]):
            # Percentage share of each category.
            share = df_.groupby(column).size()
            share = np.round((share / share.sum()) * 100, 1)
            share.to_frame('Real').plot(kind='bar', ax=panel, alpha=0.8)
        else:
            bins = np.histogram_bin_edges(df_[column], 20)
            # The label gives legend() an entry to show.
            sns.histplot(data=df_, x=column, bins=bins, ax=panel,
                         color='blue', alpha=0.5, label='Real')

        panel.set_title(column)
        panel.legend()

    # Hide the grid cells that have no column to show.
    for panel in panels[n_cols:]:
        panel.set_visible(False)

    plt.suptitle(title, fontsize=15)
    plt.tight_layout()
    plt.show()
    

# 02 - Data Reading

In [None]:
# NOTE(review): the commented-out download below points at a Kaggle credit-risk dataset,
# not at the Adult CSV that is actually read in this cell.
#od.download("https://www.kaggle.com/rameshmehta/credit-risk-analysis/version/1")
# The file is read without a header row, so columns get integer labels 0..n-1.
data = pd.read_csv("Real_Datasets/adult-Copy1.csv", header = None)

In [None]:
# Show (rows, columns) of the raw frame.
data.shape

In [None]:
# Drop the column labelled 4 (the fifth column of the header-less CSV).
# NOTE(review): this rebinds `data` from itself, so re-running the cell after the
# column is gone (or after the columns are renamed) raises a KeyError.
data = data.drop(4, axis = 1)

In [None]:
# Column names for the header-less Adult CSV after the column labelled 4 was dropped.
# Previously `colunas` was only defined in a later cell, so this cell raised a
# NameError on a fresh kernel.
# NOTE(review): the names match those used by the CTABGAN configuration further down;
# their ORDER assumes the standard UCI Adult layout minus `education-num` — confirm
# against the CSV.
colunas = [
    'age', 'workclass', 'fnlwgt', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'gender', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'income',
]
data.columns = colunas

In [None]:
# Keep an independent copy of the current column labels and display it.
colunas = data.columns.copy()
colunas

# 03 - Data Cleansing

In [None]:
# Build the helper used to clean the data in an automated way.
# Per the original author's notes, this class analyses the data and identifies
# low-quality variables: variables with many nulls, categorical variables with too
# many categories, date-type variables, variables with extremely high variance and
# variables with zero variance.
# NOTE(review): `xplor` comes from the project-local `classes` module; its behavior
# is not visible in this notebook.
xp = xplor(data)

In [None]:
# Check null values.
# Per the original notes, a Pareto chart of the % of nulls per variable is drawn.

# Per the original notes, variables whose null share exceeds `level` are selected
# for exclusion.
# NOTE(review): the original note described a 50% threshold, but the call below
# passes level = 0 — confirm which is intended and which scale `level` uses.

# NOTE(review): the original note reported "21 variables found"; that count looks
# stale for this dataset — re-verify after running.

xp.check_nulls(level = 0, select = True)
print(xp.nulls)

In [None]:
# Per the original notes, this method identifies categorical variables with a high
# number of unique values and selects them for exclusion.
# NOTE(review): the original note described a cut-off of 20 distinct categories, but
# the call below passes level_unique = 50 — confirm which is intended.

# NOTE(review): the original note reported "10 variables found"; that count looks
# stale for this dataset — re-verify after running.

# Per the original notes, a Pareto chart is shown to help visual identification.

xp.check_unique_objects(level_unique = 50,select = True)

In [None]:
# Per the original notes, this method identifies date-type variables;
# for this experiment those variables are to be excluded.

# NOTE(review): the original note reported "5 variables found"; that count looks
# stale for this dataset — re-verify after running.

xp.check_dates(select = True)

In [None]:
# Per the original notes, this last method looks for variables whose normalised
# variance is either very high or zero.
# The original author observed that it was useful for finding two ID-related columns
# (very high variance) and one column without variance.
# NOTE(review): that observation appears to refer to a different dataset — re-verify.

xp.check_var(select = True)

In [None]:
# To finish the process, `clean_data` is called. Per the original notes, it takes all
# variables flagged by the previous methods and removes them from the final data.
# NOTE(review): a later cell rebinds `new_df` to `data.copy()`, which discards this result.

new_df = xp.clean_data()

In [None]:
# Display the scikit-learn version available in the running kernel.
import sklearn
sklearn.__version__

In [None]:
pip install scikit-learn==0.24.2

In [None]:
# Continue with an untouched copy of `data`.
# NOTE(review): this overrides the `new_df` returned by `xp.clean_data()` in an earlier
# cell, so none of the flagged variables are actually removed — confirm this is intended.
new_df = data.copy()

In [None]:
# NOTE(review): the original note said 34 variables were excluded, leaving 39 in the
# final data. Those counts look stale: `new_df` is a plain copy of `data` at this point.
new_df.shape

# 04 - Data Treatment Pipeline (Pre-Processing)

In [None]:
# Persist the frame (without the index) to the CSV path that is later passed to CTABGAN.
new_df.to_csv("Real_Datasets/Adults_full.csv", index = False)

In [None]:
# Display the column labels of the frame that was written to disk.
new_df.columns

In [None]:
# Gather the synthesizer settings in one place, then build the CTABGAN instance.
ctabgan_settings = dict(
    raw_csv_path = "Real_Datasets/Adults_full.csv",
    test_ratio = 0.20,
    categorical_columns = [
        'workclass', 'education', 'marital-status', 'occupation', 'relationship',
        'race', 'gender', 'native-country', 'income',
    ],
    log_columns = [],
    mixed_columns = {'capital-loss': [0.0], 'capital-gain': [0.0]},
    integer_columns = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'],
    problem_type = {"Classification": 'income'},
    epochs = 150,
)

ctbgan = CTABGAN(**ctabgan_settings)

In [None]:
# Fit the synthesizer; presumably it trains on the CSV and settings given to the
# constructor, since `fit` takes no arguments here — TODO confirm.
ctbgan.fit()

In [None]:
# Draw synthetic samples from the fitted synthesizer.
# NOTE(review): the number of rows returned is decided inside CTABGAN — not visible here.
synth = ctbgan.generate_samples()

In [None]:
# Preview the first rows of the synthetic data.
synth.head()

## Fidelidade

In [None]:
# Compute the Wasserstein metric between the real data and the synthetic data.
# NOTE(review): the original note mentioned "three models", but only one synthetic
# frame (`synth`) is passed here.
W_df_tst = compare_metric(df_real = new_df
                      ,dfs_synth = [synth]
                      , metric = 'wasserstein')
W_df_tst

In [None]:
# Summarise the metric table: min / median / max rows plus an "avg" row holding the
# column means. `pd.concat` replaces `DataFrame.append` (removed in pandas 2.0), and
# the mean row is derived from the table's own columns instead of a hard-coded
# 'Data_0' label, so it also works when several synthetic frames are compared.
W_summary = pd.concat([
    W_df_tst.quantile(q = [0, 0.5, 1]),
    W_df_tst.mean().to_frame('avg').T,
])
W_summary

In [None]:
# Categorical columns that are cast to int and then stored as Python objects.
cat_vars = ['workclass','education','marital-status','occupation','relationship','race','gender','native-country','income']


def _with_int_object_categories(frame):
    """Return a copy of ``frame`` whose ``cat_vars`` columns are int-cast, object-typed."""
    converted = frame.copy()
    converted[cat_vars] = converted[cat_vars].astype(int).astype('object')
    return converted


new_df_2 = _with_int_object_categories(new_df)
synth_2 = _with_int_object_categories(synth)

In [None]:
#synth.to_csv("fake_adult_full.csv", index = False)

In [None]:
# Use the cached synthetic sample when it exists. On a fresh run (no cache file yet)
# store the freshly generated `synth` and read it back, so that downstream cells see
# the same CSV-round-tripped dtypes either way. Previously this cell failed with
# FileNotFoundError unless the CSV had been written by hand.
try:
    synth = pd.read_csv("fake_adult_full.csv")
except FileNotFoundError:
    synth.to_csv("fake_adult_full.csv", index = False)
    synth = pd.read_csv("fake_adult_full.csv")

In [None]:
# Plot every variable of the real data against the synthetic data so that the two
# distributions can be compared, and record a min-max-scaled Wasserstein distance
# per column in `W_List`.
df_synth = synth
df_ = new_df

# Grid geometry: 4 panels per row, at least one row.
plot_c = 4
n_cols = new_df.shape[1]
plot_r = max(1, int(np.ceil(n_cols / plot_c)))

W_List = {}
# squeeze=False keeps the axes array 2-D even when there is a single row.
fig, ax = plt.subplots(plot_r, plot_c, figsize = (5*plot_c, 4*plot_r), squeeze = False)
panels = ax.ravel()  # row-major order: left to right, then top to bottom

for panel, column in zip(panels, df_.columns):
    W_List[column] = []

    if (df_[column].dtype == 'object') | pd.CategoricalDtype.is_dtype(df_[column]):
        # Percentage share of each category in the real and synthetic data.
        gb = df_.groupby([column]).size()
        gb = np.round((gb/gb.sum())*100, 1)

        gb_s = df_synth.groupby([column]).size()
        gb_s = np.round((gb_s/gb_s.sum())*100, 1)

        compare = pd.concat([gb, gb_s], axis = 1).fillna(0)
        compare.columns = ['Real', 'Synthetic']
        compare['DIFF'] = np.round(abs(compare['Real'] - compare['Synthetic']), 2)

        compare[['Real', 'Synthetic']].plot(kind = 'bar', ax = panel, alpha = 0.8)
        # The title carries the summed absolute difference of the percentage shares.
        panel.set_title(column+":"+ str(compare['DIFF'].abs().sum()))

        synth_values = df_synth[column].dropna()

        # Fit the encoder on the categories of both frames so that a category that
        # only occurs in the synthetic data does not raise; when the synthetic data
        # has no new categories the codes are the same as fitting on the real data.
        le = LabelEncoder()
        le.fit(pd.concat([df_[column], synth_values]))

        # The scaling range is still defined by the real data only.
        mms = MinMaxScaler()
        mms.fit(le.transform(df_[column]).reshape(-1, 1))

        synth_scaled = mms.transform(le.transform(synth_values).reshape(-1, 1)).ravel()
        real_scaled = mms.transform(le.transform(df_[column]).reshape(-1, 1)).ravel()

    else:
        bins = np.histogram_bin_edges(df_[column], 20)
        sns.histplot(data=df_  , x=column, bins = bins, ax = panel, color = 'blue', alpha = 0.5, label = 'Real')
        sns.histplot(data=df_synth, x=column, bins = bins, ax = panel, color = 'red', alpha = 0.4, label = 'Synthetic')
        panel.set_title(column)

        # The scaling range is defined by the real data only.
        mms = MinMaxScaler()
        mms.fit(df_[[column]])

        # Missing synthetic values are dropped, as in the categorical branch.
        synth_scaled = mms.transform(df_synth[[column]].dropna()).ravel()
        real_scaled = mms.transform(df_[[column]]).ravel()

    panel.legend()
    W_List[column].append(wasserstein_distance(synth_scaled, real_scaled))

# Hide the grid cells that have no column to show.
for panel in panels[n_cols:]:
    panel.set_visible(False)

plt.tight_layout()

## Utilidade

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder

# Utility check: train one classifier on real data and one on synthetic data, then
# compare real->real (RR), synthetic->synthetic (SS) and synthetic->real (SR).

# One seed for the splits and both forests so the metrics are reproducible.
SEED = 42

base = ['Adults Full']

# Real data, with its columns renamed positionally to the synthetic frame's labels.
new_df2 = new_df.copy()
new_df2.columns = synth.columns.values

# Ordinal-encode the object columns; the encoder is fitted on the real data only.
enc = OrdinalEncoder()
object_cols = new_df2.select_dtypes(include = ['object']).columns
new_df2[object_cols] = enc.fit_transform(new_df2[object_cols])

RX_m = new_df2.drop("income", axis = 1).copy()
RY_m = new_df2[['income']].copy()

X_trainR, X_testR, Y_trainR, Y_testR = train_test_split(RX_m, RY_m, test_size=0.2, random_state=SEED)

# The targets stay as one-column frames for later concatenation; `fit` receives the
# flattened 1-D labels it expects.
RFC_R = RandomForestClassifier(random_state=SEED)
RFC_R.fit(X_trainR, Y_trainR.values.ravel())

# Synthetic data: drop incomplete rows and encode exactly the columns (and order)
# the encoder was fitted on.
synth2 = synth.dropna().copy()
synth2[object_cols] = enc.transform(synth2[object_cols])

SX_m = synth2.drop("income", axis = 1).copy()
SY_m = synth2[['income']].copy()

# The synthetic data gets its own independent split; reusing the real split's row
# indices had been tried here before and was left disabled.
X_trainS, X_testS, Y_trainS, Y_testS = train_test_split(SX_m, SY_m, test_size=0.2, random_state=SEED)

RFC_S = RandomForestClassifier(random_state=SEED)
RFC_S.fit(X_trainS, Y_trainS.values.ravel())


# Train on real, test on real
final_predict_RR = pd.DataFrame({'Y':Y_testR.income.values
                                 ,"RR_pred":RFC_R.predict(X_testR)
                                 ,"RR_prob":RFC_R.predict_proba(X_testR)[:,1]}, index = X_testR.index)


# Train on synthetic, test on synthetic
final_predict_SS = pd.DataFrame({'Y':Y_testS.income.values,
                                 "SS_pred":RFC_S.predict(X_testS),
                                 "SS_prob":RFC_S.predict_proba(X_testS)[:,1]}, index = X_testS.index)


# Train on synthetic, test on real
final_predict_SR = pd.DataFrame({'Y':Y_testR.income.values,
                                 "SR_pred":RFC_S.predict(X_testR),
                                 "SR_prob":RFC_S.predict_proba(X_testR)[:,1]}, index = X_testR.index)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score



In [None]:
# Store the listed label / prediction columns as integers.
final_predict_SR = final_predict_SR.astype({'SR_pred': int})
final_predict_SS = final_predict_SS.astype({'Y': int, 'SS_pred': int})

In [None]:
# Assemble one long table with the real and synthetic train/test rows, tagged by
# split ('model'), origin ('type'), dataset ('base') and synthesizer ('sint_model').
testR = pd.concat([X_testR, Y_testR, final_predict_RR, final_predict_SR[['SR_pred','SR_prob']]], axis = 1)
testR['model'] = 'test'
testS = pd.concat([X_testS, Y_testS, final_predict_SS], axis = 1)
testS['model'] = 'test'
trainR = pd.concat([X_trainR, Y_trainR], axis = 1)
trainR['model'] = 'train'
trainS = pd.concat([X_trainS, Y_trainS], axis = 1)
trainS['model'] = 'train'

# NOTE(review): 'VAE2' is kept as-is, but the synthesizer used in this notebook is
# CTABGAN — confirm whether this label is a leftover from another experiment.
R = pd.concat([trainR, testR], axis = 0)
R['type'] = 'Real'
R['base'] = base[0]
R['sint_model'] = 'VAE2'

S = pd.concat([trainS, testS], axis = 0)
S['type'] = 'Synt'
S['base'] = base[0]
S['sint_model'] = 'VAE2'

# A single concat replaces the chained `DataFrame.append` calls (removed in pandas 2.0).
base_olga = pd.concat([R, S], axis = 0)

In [None]:
# Score the three train/test scenarios (RR, SS, SR) with every metric.
# ROC AUC is computed from the predicted probabilities; all other metrics use the
# predicted labels. The rows are collected first and the frame is built once,
# instead of growing it with `DataFrame.append` (removed in pandas 2.0).
metricas = [accuracy_score, f1_score, precision_score, recall_score, roc_auc_score]

metric_rows = []
for m in metricas:
    score_kind = "prob" if m.__name__ == "roc_auc_score" else "pred"
    metric_rows.append(
        {
            "RR": m(final_predict_RR.Y, final_predict_RR["RR_" + score_kind]),
            "SS": m(final_predict_SS.Y, final_predict_SS["SS_" + score_kind]),
            "SR": m(final_predict_SR.Y, final_predict_SR["SR_" + score_kind]),
        }
    )

df_m = pd.DataFrame(
    metric_rows,
    index = [metric.__name__ for metric in metricas],
    columns = ["RR", "SS", "SR"],
)
        

In [None]:
# Display the metric table rounded to two decimals.
df_m.round(2)

## Privacidade

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale both frames with a scaler fitted on the encoded real data only.
# Both results are rebuilt from arrays, so they carry a fresh 0..n-1 row index.
enc2 = MinMaxScaler()
enc2.fit(new_df2)

new_df3 = pd.DataFrame(enc2.transform(new_df2), columns = new_df2.columns)
synth3 = pd.DataFrame(enc2.transform(synth2), columns = synth2.columns)

In [None]:
from sklearn.neighbors import NearestNeighbors

# For every synthetic row, find its two nearest real rows in the scaled space.
neigh = NearestNeighbors(n_neighbors=2, radius=0.6)
neigh.fit(new_df3)

# One batched query returns the distances and the neighbour positions for all rows
# at once; the previous version issued two single-row queries per synthetic row.
distances, ids = neigh.kneighbors(synth3, n_neighbors=2, return_distance=True)
distances_l = pd.DataFrame(distances, index = synth3.index)
ids_l = pd.DataFrame(ids, index = synth3.index)

# Side by side: two distance columns followed by two neighbour-position columns.
temp = pd.concat([distances_l, ids_l], axis = 1)
temp['base'] = base[0]

In [None]:
# Pool the distances to the 1st and 2nd nearest real rows, then show them as a
# histogram and a box plot. `pd.concat` replaces `Series.append` (removed in pandas 2.0).
all_distances = pd.concat([distances_l[0], distances_l[1]])

fig, ax = plt.subplots(1,2, figsize = (10,3))
all_distances.hist(ax = ax[0])
all_distances.plot(kind = 'box', ax = ax[1]);


In [None]:
# `temp` was built by concatenating the distance and neighbour-id frames plus a 'base'
# column, so its labels are positional duplicates; give each column a descriptive name.
temp.columns = ['distancia_1','distancia_2','id_proximo_1','id_proximo_2','base']
# Write the table to CSV with the row index turned into a regular column.
temp.reset_index().to_csv('base_distancia_CTABGAN_AF.csv', index = False)

In [None]:
# Quantiles of the pooled nearest-neighbour distances.
# `pd.concat` replaces `Series.append` (removed in pandas 2.0).
pd.DataFrame(pd.concat([distances_l[0], distances_l[1]]).quantile(q = [0,0.05,0.25,0.5,0.75,1]))

In [None]:
# Quantiles of the pooled nearest-neighbour distances.
# `pd.concat` replaces `Series.append` (removed in pandas 2.0).
pd.DataFrame(pd.concat([distances_l[0], distances_l[1]]).quantile(q = [0,0.05,0.25,0.5,0.75,1]))

In [None]:
from sklearn.neighbors import NearestNeighbors

# For every synthetic row, find its two nearest real rows in the scaled space.
neigh = NearestNeighbors(n_neighbors=2, radius=0.6)
neigh.fit(new_df3)

# One batched query returns the distances and the neighbour positions for all rows
# at once; the previous version issued two single-row queries per synthetic row.
distances, ids = neigh.kneighbors(synth3, n_neighbors=2, return_distance=True)
distances_l = pd.DataFrame(distances)
ids_l = pd.DataFrame(ids)

In [None]:
# Pool the distances to the 1st and 2nd nearest real rows, then show them as a
# histogram and a box plot. `pd.concat` replaces `Series.append` (removed in pandas 2.0).
all_distances = pd.concat([distances_l[0], distances_l[1]])

fig, ax = plt.subplots(1,2, figsize = (10,3))
all_distances.hist(ax = ax[0])
all_distances.plot(kind = 'box', ax = ax[1]);


In [None]:
# Quantiles of the pooled nearest-neighbour distances.
# `pd.concat` replaces `Series.append` (removed in pandas 2.0).
pd.DataFrame(pd.concat([distances_l[0], distances_l[1]]).quantile(q = [0,0.05,0.25,0.5,0.75,1]))