In [1]:
from pycaret.classification import * # Preprocessing, modelling, interpretation, deployment...
import pandas as pd # Basic data manipulation
from sklearn.model_selection import train_test_split # Data split
from sdv.tabular import CopulaGAN, GaussianCopula, CTGAN, TVAE # Synthetic data
from sdv.evaluation import evaluate # Evaluate synthetic data
import sdmetrics
import sklearn
import os

In [2]:
original_data = pd.read_csv("HR Employee Attrition.csv")
original_data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'HR Employee Attrition.csv'

In [3]:
import pandas as pd # Basic data manipulation

data = pd.read_csv("data.csv")
small_data = data[["Age", "Attrition", "DistanceFromHome"]]
small_data.to_csv("small_data.csv")

In [6]:
models()

NameError: name 'logger' is not defined

In [None]:
def split_data(dataset_path ="data.csv", output_directory="data", train_filename = "train.csv", test_filename = "test.csv", target_name="TARGET"):
    """
        Split real data into training + test set
    inputs---
    dataset_path:   path to dataset csv
    output_directory:      relative or absolute directory to store outputs in
    train_filename: filename to use for train data split, include .csv
    test_filename:  filename to use for test data split, include .csv
    target_name:    name of target column in dataset
    return---
    [train_dataset output path, test_dataset output path] 
    """
    data = pd.read_csv(dataset_path)
    data.head()
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    train, test, target_train, target_test = train_test_split(data.drop(target_name, axis = 1), data[target_name], test_size = 0.4, random_state = 42)
    train[target_name] = target_train
    test[target_name] = target_test
    train.to_csv(os.path.join(output_directory, train_filename))
    test.to_csv(os.path.join(output_directory, test_filename))

In [1]:
SDV_GENGERATORS = ["ctGAN", "gaussain_copula", "copulaGAN", "tvae"]
def create_default_generators(train_dataset="data/train.csv", generators=SDV_GENGERATORS, type=``, output=None):
    data = pd.read_csv(train_dataset)
    models = []
    assert(all(name in generators))
    name_to_model = \
    {"ctGAN":CTGAN, "gaussain_copula":GaussianCopula, "copulaGAN":CopulaGAN, "tvae":TVAE}
    for name in generators:
        assert(name in SDV_GENGERATORS)
        models.append(name_to_model[name])
    trained_models = []
    output_directories = []
    for model, name in zip(models, generators):
        model_instance = model()
        model_instance.fit(data)
        output_directory = "default_" + name + ".pkl"
        model_instance.save(output_directory)
        output_directories.append(output_directory)
    return output_directories

SyntaxError: invalid syntax (<ipython-input-1-b87c3587ee21>, line 2)

In [4]:
import os
if not os.path.exists('my_folder'):
    os.makedirs('my_folder')

In [5]:
attrition_yes_size = train[train["Attrition"]=="Yes"].size
attrition_no_size = train[train["Attrition"]=="No"].size

print(attrition_yes_size)
print(attrition_no_size)
attrition_imbalance = attrition_no_size - attrition_yes_size
print(attrition_imbalance)

5460
25410
19950


In [6]:
# Target feature
target = "Attrition"

# Continuous/numeric features
cont_feats = ["DistanceFromHome", "HourlyRate",  "DailyRate", "MonthlyIncome",
              "MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike",
              "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
              "YearsWithCurrManager", "TrainingTimesLastYear", "YearsSinceLastPromotion"]

# Ordinal features
ord_levels = ['StockOptionLevel', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobSatisfaction', 
              'Education', 'PerformanceRating', 'RelationshipSatisfaction', 'WorkLifeBalance']

# Categorical geatures
cat_feats = ["BusinessTravel", "Department", "EducationField", 
             "JobRole", "Gender", "JobLevel", "JobRole", 
             "MaritalStatus", "OverTime", "WorkLifeBalance"]

# Features to ignore
ignore = ["EmployeeNumber", "StandardHours", "EmployeeCount", "Over18"]

In [7]:
def classifier_setup(data):
    setup(data.sample(frac=1), 
      target = target, 
      test_data = test,
      fold_strategy = "kfold",
      numeric_features = cont_feats,
      categorical_features = cat_feats,
      ordinal_features = ord_feats,
      ignore_features = ignore,
      normalize = True,
      normalize_method = "zscore",
      silent = True, verbose = False)
print("setup complete!")

setup complete!


In [8]:
models = [CopulaGAN, GaussianCopula, CTGAN, TVAE]
sample_sizes = [500, 1000, 10_000, 19_950, 30_000, 40_000] #baseline function defined later handles sample = 0
metric_classes = sdmetrics.single_table.SingleTableMetric.get_subclasses()
classifiers = ['lr','knn','nb','dt','svm','rbfsvm','gpc','mlp','ridge','rf','qda','ada','gbc','lda','et','xgboost','lightgbm','catboost']
classifiers_test = ["catboost", "lr"]
performance = ["f1","accuracy", "recall", "precision"]

In [9]:
model_name = ["model_name"]
sample_size_name = ["sample_size"]
metric_class_names = [str(metric_name) for metric_name in metric_classes]
classifier_name = ["classifier_name"]
performance_names = ["accuracy", "f1", "recall", "precision"]
columns = model_name + sample_size_name + metric_class_names + classifier_name + performance_names
results = pd.DataFrame(columns=columns)
results

Unnamed: 0,model_name,sample_size,BNLogLikelihood,LogisticDetection,SVCDetection,BinaryDecisionTreeClassifier,BinaryAdaBoostClassifier,BinaryLogisticRegression,BinaryMLPClassifier,MulticlassDecisionTreeClassifier,MulticlassMLPClassifier,LinearRegression,MLPRegressor,GMLogLikelihood,CSTest,KSTest,KSTestExtended,ContinuousKLDivergence,DiscreteKLDivergence,classifier_name,accuracy,f1,recall,precision


In [10]:
def make_score_column(score_aggregate):
    score_column = []
    for metric in metric_class_names:
        if metric in set(score_aggregate["metric"]):
            score = list(score_aggregate.loc[score_aggregate['metric'] == metric,"score"])[0]
            score_column.append(score)
        else:
            score_column.append(None)
    return score_column

In [11]:
def make_CopulaGAN(data):
    model = CopulaGAN()
    model.fit(data)
    return model
def make_GaussianCopula(data):
    model = GaussianCopula()
    model.fit(data)
    return model
def make_CTGAN(data):
    model = CTGAN()
    model.fit(data)
    return model
def make_TVAE(data):
    model = TVAE()
    model.fit(data)
    return model

In [12]:
copula_gan_model = CopulaGAN.load('CopulaGAN.pkl')
guassian_copula_model = GaussianCopula.load('GuassianCopulaModel.pkl')
ctgan_model = CTGAN.load("CTGAN-Model.pkl")
tvae_model = TVAE.load("TVAE-Model.pkl")

In [13]:
models = [copula_gan_model, guassian_copula_model, ctgan_model, tvae_model]

In [38]:
def baseline():
    output = pd.DataFrame(columns=columns)
    count=0
    for classifier_name in classifiers_test:
        classifier_name = classifiers[0]
        classifier_column = [classifier_name]
        classifier = create_model(classifier_name)
        #pycaret, predict on test set
        pred_holdout = predict_model(classifier, verbose=False)
        #evaluate performance
        y_true = pred_holdout["Attrition"]
        y_pred = pred_holdout["Label"]
        prf = sklearn.metrics.precision_recall_fscore_support(y_true, y_pred)
        precision = prf[0][1]
        recall = prf[1][1]
        f1 = prf[2][1]
        accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
        performance_column = [accuracy, f1, recall, precision]
        #record entry
        results_row = model_column + sample_size_column + score_column + classifier_column + performance_column
        row_dict = {columns[i]:results_row[i] for i in range(len(columns))}
        output = output.append(row_dict, ignore_index=True)
        count+=1
        print(count)
    return output

In [None]:
results = pd.DataFrame(columns=columns)
count = 0
for model in models:
    model_column = [str(model)]
    for sample_size in sample_sizes:
        sample_size_column = [sample_size]
        print(sample_size)
        print(model)
        synthetic_data = model.sample(int(sample_size))
        score_aggregate = evaluate(synthetic_data, train, aggregate=False)
        score_column = make_score_column(score_aggregate)
        combined_data = train.copy().append(synthetic_data)
        ord_feats = {}
        for feat in ord_levels:
            ord_feats[feat] = [str(each) for each in sorted(list(combined_data[feat].unique()))]
        classifier_setup(combined_data)
        for classifier_name in classifiers_test:
            classifier_name = classifiers[0]
            classifier_column = [classifier_name]
            classifier = create_model(classifier_name)
            #pycaret, predict on test set
            pred_holdout = predict_model(classifier, verbose=False)
            #evaluate performance
            y_true = pred_holdout["Attrition"]
            y_pred = pred_holdout["Label"]
            prf = sklearn.metrics.precision_recall_fscore_support(y_true, y_pred)
            precision = prf[0][1]
            recall = prf[1][1]
            f1 = prf[2][1]
            accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
            performance_column = [accuracy, f1, recall, precision]
            #record entry
            results_row = model_column + sample_size_column + score_column + classifier_column + performance_column
            row_dict = {columns[i]:results_row[i] for i in range(len(columns))}
            results = results.append(row_dict, ignore_index=True)
            print(results.shape)
            count+=1
            print(count)
            
        
        
    

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7908,0.5293,0.0,0.0,0.0,0.0,0.0
1,0.7836,0.568,0.0,0.0,0.0,0.0,0.0
2,0.7777,0.5655,0.0,0.0,0.0,0.0,0.0
3,0.7873,0.5591,0.0,0.0,0.0,0.0,0.0
4,0.7888,0.5382,0.0023,1.0,0.0045,0.0036,0.0423
5,0.7816,0.5555,0.0,0.0,0.0,0.0,0.0
6,0.7878,0.5565,0.0,0.0,0.0,0.0,0.0
7,0.7849,0.556,0.0,0.0,0.0,0.0,0.0
8,0.7945,0.5568,0.0,0.0,0.0,0.0,0.0
9,0.7801,0.5568,0.0,0.0,0.0,0.0,0.0


(32, 24)
32
30000
<sdv.tabular.ctgan.CTGAN object at 0x7f756eaba518>


In [25]:
copy = results.copy()

In [29]:
results.shape

(48, 24)

In [39]:
output = baseline()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9963,0.9965,0.52,0.8125,0.6341,0.6324,0.6484
1,0.9978,0.9973,0.4167,0.7143,0.5263,0.5253,0.5446
2,0.9954,0.995,0.4348,0.625,0.5128,0.5106,0.5191
3,0.9971,0.9964,0.3846,0.5556,0.4545,0.4531,0.4608
4,0.9966,0.9909,0.3333,0.5556,0.4167,0.4151,0.4287
5,0.9978,0.9986,0.3333,0.5,0.4,0.3989,0.4072
6,0.9966,0.9959,0.375,0.6,0.4615,0.4599,0.4728
7,0.9968,0.995,0.5,0.6923,0.5806,0.5791,0.5868
8,0.9971,0.9959,0.5333,0.6154,0.5714,0.57,0.5714
9,0.998,0.9969,0.3,0.75,0.4286,0.4278,0.4736


2


In [40]:
output.shape

(2, 24)

In [41]:
results.shape

(48, 24)

In [42]:
total = pd.concat([results,output])

In [43]:
total.shape

(50, 24)

In [44]:
total.head()

Unnamed: 0,model_name,sample_size,BNLogLikelihood,LogisticDetection,SVCDetection,BinaryDecisionTreeClassifier,BinaryAdaBoostClassifier,BinaryLogisticRegression,BinaryMLPClassifier,MulticlassDecisionTreeClassifier,...,CSTest,KSTest,KSTestExtended,ContinuousKLDivergence,DiscreteKLDivergence,classifier_name,accuracy,f1,recall,precision
0,<sdv.tabular.copulagan.CopulaGAN object at 0x7...,500,-13.520893,0.143415,0.069671,,,,,,...,0.962778,0.833768,0.865435,0.725962,0.906231,lr,0.87415,0.244898,0.148148,0.705882
1,<sdv.tabular.copulagan.CopulaGAN object at 0x7...,500,-13.520893,0.143415,0.069671,,,,,,...,0.962778,0.833768,0.865435,0.725962,0.906231,lr,0.87415,0.244898,0.148148,0.705882
2,<sdv.tabular.copulagan.CopulaGAN object at 0x7...,1000,-13.55895,0.124854,0.052454,,,,,,...,0.96017,0.838715,0.868479,0.752229,0.905957,lr,0.865646,0.070588,0.037037,0.75
3,<sdv.tabular.copulagan.CopulaGAN object at 0x7...,1000,-13.55895,0.124854,0.052454,,,,,,...,0.96017,0.838715,0.868479,0.752229,0.905957,lr,0.865646,0.070588,0.037037,0.75
4,<sdv.tabular.copulagan.CopulaGAN object at 0x7...,10000,-13.6025,0.116728,0.037799,,,,,,...,0.963991,0.838516,0.868908,0.758041,0.91297,lr,0.862245,0.0,0.0,0.0


In [46]:
total.to_csv("performance.csv")

In [47]:
total.head()

Unnamed: 0,model_name,sample_size,BNLogLikelihood,LogisticDetection,SVCDetection,BinaryDecisionTreeClassifier,BinaryAdaBoostClassifier,BinaryLogisticRegression,BinaryMLPClassifier,MulticlassDecisionTreeClassifier,...,CSTest,KSTest,KSTestExtended,ContinuousKLDivergence,DiscreteKLDivergence,classifier_name,accuracy,f1,recall,precision
0,<sdv.tabular.copulagan.CopulaGAN object at 0x7...,500,-13.520893,0.143415,0.069671,,,,,,...,0.962778,0.833768,0.865435,0.725962,0.906231,lr,0.87415,0.244898,0.148148,0.705882
1,<sdv.tabular.copulagan.CopulaGAN object at 0x7...,500,-13.520893,0.143415,0.069671,,,,,,...,0.962778,0.833768,0.865435,0.725962,0.906231,lr,0.87415,0.244898,0.148148,0.705882
2,<sdv.tabular.copulagan.CopulaGAN object at 0x7...,1000,-13.55895,0.124854,0.052454,,,,,,...,0.96017,0.838715,0.868479,0.752229,0.905957,lr,0.865646,0.070588,0.037037,0.75
3,<sdv.tabular.copulagan.CopulaGAN object at 0x7...,1000,-13.55895,0.124854,0.052454,,,,,,...,0.96017,0.838715,0.868479,0.752229,0.905957,lr,0.865646,0.070588,0.037037,0.75
4,<sdv.tabular.copulagan.CopulaGAN object at 0x7...,10000,-13.6025,0.116728,0.037799,,,,,,...,0.963991,0.838516,0.868908,0.758041,0.91297,lr,0.862245,0.0,0.0,0.0
