In [72]:
from pycaret.classification import * # Preprocessing, modelling, interpretation, deployment...
import pandas as pd # Basic data manipulation
from sklearn.model_selection import train_test_split # Data split
from sdv.tabular import CopulaGAN, GaussianCopula, CTGAN, TVAE # Synthetic data
from sdv.evaluation import evaluate # Evaluate synthetic data
import sdmetrics

In [73]:
import sklearn
print(sklearn.__version__)

0.23.2


In [74]:
original_data = pd.read_csv("HR Employee Attrition.csv")
original_data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [75]:
# Split real data into training + test set
train, test, target_train, target_test = train_test_split(original_data.drop("Attrition", axis = 1), original_data["Attrition"], test_size = 0.4, random_state = 42)
train["Attrition"] = target_train
test["Attrition"] = target_test

In [76]:
attrition_yes_size = train[train["Attrition"]=="Yes"].size
attrition_no_size = train[train["Attrition"]=="No"].size

print(attrition_yes_size)
print(attrition_no_size)
attrition_imbalance = attrition_no_size - attrition_yes_size
print(attrition_imbalance)

5460
25410
19950


In [77]:
# Target feature
target = "Attrition"

# Continuous/numeric features
cont_feats = ["DistanceFromHome", "HourlyRate",  "DailyRate", "MonthlyIncome",
              "MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike",
              "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
              "YearsWithCurrManager", "TrainingTimesLastYear", "YearsSinceLastPromotion"]

# Ordinal features
ord_feats = {"StockOptionLevel" : ["0", "1", "2", "3"],
             "EnvironmentSatisfaction" : ["1", "2", "3", "4"],
             "JobInvolvement" : ["1", "2", "3", "4"],
             "JobSatisfaction" : ["1", "2", "3", "4"],
             "Education" : ["1", "2", "3", "4", "5"],
             "PerformanceRating" : ["3", "4"],
             "RelationshipSatisfaction" : ["1", "2", "3", "4"],
             "WorkLifeBalance" : ["1", "2", "3", "4"]}

# Categorical geatures
cat_feats = ["BusinessTravel", "Department", "EducationField", 
             "JobRole", "Gender", "JobLevel", "JobRole", 
             "MaritalStatus", "OverTime", "WorkLifeBalance"]

# Features to ignore
ignore = ["EmployeeNumber", "StandardHours", "EmployeeCount", "Over18"]

generator_categorical_feats = cat_feats + ord_feats.keys()
generator_cont_feats

In [78]:
def classifier_setup(synth_data):
    data = train.copy().append(synth_data)
    setup(data.sample(frac=1), 
      target = target, 
      test_data = test,
      fold_strategy = "kfold",
      numeric_features = cont_feats,
      categorical_features = cat_feats,
      ordinal_features = ord_feats,
      ignore_features = ignore,
      normalize = True,
      normalize_method = "zscore",
      silent = True, verbose = False)
print("setup complete!")

setup complete!


In [79]:
models = [CopulaGAN, GaussianCopula, CTGAN, TVAE]
sample_sizes = [0, 500, 1000, 10_000, 19_950, 30_000, 40_000]
metric_classes = sdmetrics.single_table.SingleTableMetric.get_subclasses()
classifiers = ['lr','knn','nb','dt','svm','rbfsvm','gpc','mlp','ridge','rf','qda','ada','gbc','lda','et','xgboost','lightgbm','catboost']
performance = ["f1","accuracy", "recall", "precision"]


In [80]:
model_name = ["model_name"]
sample_size_name = ["sample_size"]
metric_class_names = [str(metric_name) for metric_name in metric_classes]
classifier_name = ["classifier_name"]
performance_names = ["f1","accuracy", "recall", "precision"]
columns = model_name + sample_size_name + metric_class_names + classifier_name + performance_names
results = pd.DataFrame(columns=columns)
results

Unnamed: 0,model_name,sample_size,BNLogLikelihood,LogisticDetection,SVCDetection,BinaryDecisionTreeClassifier,BinaryAdaBoostClassifier,BinaryLogisticRegression,BinaryMLPClassifier,MulticlassDecisionTreeClassifier,...,CSTest,KSTest,KSTestExtended,ContinuousKLDivergence,DiscreteKLDivergence,classifier_name,f1,accuracy,recall,precision


In [81]:
def make_score_column(score_aggregate):
    score_column = []
    for metric in metric_class_names:
        if metric in set(score_aggregate["metric"]):
            score = list(score_aggregate.loc[score_aggregate['metric'] == metric,"score"])[0]
            score_column.append(score)
        else:
            score_column.append(None)
    return score_column

In [82]:
def make_CopulaGAN(data):
    model = CopulaGAN()
    model.fit(data)
    return model
def make_GaussianCopula(data):
    model = GaussianCopula()
    model.fit(data)
    return model
def make_CTGAN(data):
    model = CTGAN()
    model.fit(data)
    return model
def make_TVAE(data):
    model = TVAE()
    model.fit(data)
    return model

In [83]:
copula_gan_model = make_CopulaGAN(train)

KeyboardInterrupt: 

In [None]:
copula_gan_model.save('CopulaGAN.pkl')

In [30]:
guassian_copula_model = make_GaussianCopula(train)

In [31]:
guassian_copula_model.save('GuassianCopulaModel.pkl')

In [32]:
ctgan_model = make_CTGAN(train)

In [33]:
ctgan_model.save("CTGAN-Model.pkl")

In [34]:
tvae_model = make_TVAE(train)

In [35]:
tvae_model.save("TVAE-Model.pkl")

In [12]:
copula_gan_model = CopulaGAN.load('CopulaGAN.pkl')
guassian_copula_model = GaussianCopula.load('GuassianCopulaModel.pkl')
ctgan_model = CTGAN.load("CTGAN-Model.pkl")
tvae_model = TVAE.load("TVAE-Model.pkl")

In [None]:
#Issues, fix ordinal features issue
#we examine ordinal feature distributions for synthetic data

In [89]:
synth = copula_gan_model.sample(500)
ordinal = list(ord_feats.keys())[0]
print(synth[ordinal].max())
print(train[ordinal].max())
print(synth[ordinal].min())
print(train[ordinal].min())
print(synth[ordinal].unique())
print(train[ordinal].unique())
print(ordinal)
copula_gan_model.get_distributions()[ordinal]

4
3
0
0
[2 1 0 3 4]
[0 1 2 3]
StockOptionLevel


'copulas.univariate.student_t.StudentTUnivariate'

In [13]:
models = [copula_gan_model, guassian_copula_model, ctgan_model, tvae_model]

In [43]:
count = 0
for model in models:
    model_column = [str(model)]
    for sample_size in sample_sizes:
        sample_size_column = [sample_size]
        synthetic_data = model.sample(sample_size)
        score_aggregate = score(aggregate=True
        score_column = make_score_column(score_aggregate)
        classifier_setup(synth_data)
        for classifier_name in classifier_names:
            classifier_column = [classifier_name]
            classifier = create_model(classifier_name) # Create the catboost classifier
            pred_holdout = predict_model(classifier)
            performance_column = list(pred_holdout["F1", "Accuracy", "Recall", "Prec."])
            results_row = model_column + sample_size_column + score_column + classifier_column + performance_column
            results.append(results_row)
            count+=1
            print(count)
            
        
        
    

NameError: name 'score' is not defined

In [49]:
#get model
model= models[0]
model_column = [str(model)]
#get sample size
sample_size = sample_sizes[1]
sample_size_column = [sample_size]
#get synethetic data
synthetic_data = model.sample(sample_size)

#score synthetic data
score_aggregate = evaluate(synthetic_data, train, aggregate=False)

None


In [51]:
score_column = make_score_column(score_aggregate)
print(score_column)

[-13.605347109454296, 0.13753052864846493, 0.0596058927142028, None, None, None, None, None, None, None, None, -4508.983407582633, 0.9625471296954762, 0.8354786324786325, 0.8672027210884353, 0.7421133825746776, 0.9114486339214659]


In [59]:
#get classification performance
classifier_setup(synthetic_data)
print("setup complete")

ValueError: Levels passed in ordinal_features param doesnt match with levels in data.

In [57]:
classifier_name = classifiers[0]
classifier_column = [classifier_name]
classifier = create_model(classifier_name) # Create the catboost classifier
pred_holdout = predict_model(classifier, verbose=False)
print('prediction done')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8539,0.8645,0.5,0.7692,0.6061,0.5213,0.5395
1,0.8764,0.761,0.3571,0.7143,0.4762,0.4148,0.447
2,0.8523,0.7439,0.4375,0.6364,0.5185,0.4348,0.4454
3,0.8295,0.8252,0.3529,0.6,0.4444,0.3517,0.369
4,0.8295,0.877,0.4286,0.75,0.5455,0.45,0.4767
5,0.8864,0.9119,0.6667,0.75,0.7059,0.6358,0.6375
6,0.875,0.9035,0.3571,0.7143,0.4762,0.414,0.4462
7,0.8864,0.8235,0.4167,0.625,0.5,0.4388,0.4503
8,0.8636,0.8123,0.3846,0.5556,0.4545,0.3796,0.3879
9,0.9205,0.7993,0.5455,0.75,0.6316,0.5882,0.5976


prediction done


In [None]:
print(pred_holdout)
performance_column = list(pred_holdout["F1", "Accuracy", "Recall", "Prec."])
results_row = model_column + sample_size_column + score_column + classifier_column + performance_column
results.append(results_row)
print(results_row)

['model_name', 'sample_size', 'BNLogLikelihood', 'LogisticDetection', 'SVCDetection', 'BinaryDecisionTreeClassifier', 'BinaryAdaBoostClassifier', 'BinaryLogisticRegression', 'BinaryMLPClassifier', 'MulticlassDecisionTreeClassifier', 'MulticlassMLPClassifier', 'LinearRegression', 'MLPRegressor', 'GMLogLikelihood', 'CSTest', 'KSTest', 'KSTestExtended', 'ContinuousKLDivergence', 'DiscreteKLDivergence', 'classifier_name', 'f1', 'accuracy', 'recall', 'precision']
