In [12]:
from pycaret.classification import * # Preprocessing, modelling, interpretation, deployment...
import pandas as pd # Basic data manipulation
from sklearn.model_selection import train_test_split # Data split
from sdv.tabular import CopulaGAN, GaussianCopula, CTGAN, TVAE # Synthetic data
from sdv.evaluation import evaluate # Evaluate synthetic data
import sdmetrics

In [13]:
# Show which scikit-learn version is installed in this environment.
import sklearn
print(sklearn.__version__)

0.23.2


In [17]:
# Load the HR attrition dataset from CSV and preview its first rows.
original_data = pd.read_csv("HR Employee Attrition.csv")
original_data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [18]:
# Split the real data into a training set (60%) and a test set (40%) with a
# fixed seed so the split is repeatable.
train, test, target_train, target_test = train_test_split(
    original_data.drop("Attrition", axis=1),
    original_data["Attrition"],
    test_size=0.4,
    random_state=42,
)
# Re-attach the target with assign(): it builds new frames (aligned on the
# index) instead of writing a column into the frames returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
train = train.assign(Attrition=target_train)
test = test.assign(Attrition=target_test)

In [19]:
# Count the training rows per class. DataFrame.size is rows * columns, so it
# overstated every figure by the number of columns; count rows instead.
attrition_yes_size = int((train["Attrition"] == "Yes").sum())
attrition_no_size = int((train["Attrition"] == "No").sum())

print(attrition_yes_size)
print(attrition_no_size)
# How many more "No" rows than "Yes" rows the training set contains.
attrition_imbalance = attrition_no_size - attrition_yes_size
print(attrition_imbalance)

156
726
570


In [20]:
# Target feature
target = "Attrition"

# Continuous/numeric features
cont_feats = ["DistanceFromHome", "HourlyRate",  "DailyRate", "MonthlyIncome",
              "MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike",
              "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
              "YearsWithCurrManager", "TrainingTimesLastYear", "YearsSinceLastPromotion"]

# Ordinal features, each mapped to its levels in ascending order
ord_feats = {"StockOptionLevel" : ["0", "1", "2", "3"],
             "EnvironmentSatisfaction" : ["1", "2", "3", "4"],
             "JobInvolvement" : ["1", "2", "3", "4"],
             "JobSatisfaction" : ["1", "2", "3", "4"],
             "Education" : ["1", "2", "3", "4", "5"],
             "PerformanceRating" : ["3", "4"],
             "RelationshipSatisfaction" : ["1", "2", "3", "4"],
             "WorkLifeBalance" : ["1", "2", "3", "4"]}

# Categorical features.
# Each column is listed once, and "WorkLifeBalance" is left out because it is
# already declared ordinal above; a column must not carry two encodings.
cat_feats = ["BusinessTravel", "Department", "EducationField",
             "JobRole", "Gender", "JobLevel",
             "MaritalStatus", "OverTime"]

# Features to ignore
ignore = ["EmployeeNumber", "StandardHours", "EmployeeCount", "Over18"]

In [21]:
# Configure the PyCaret classification experiment: shuffled training rows, the
# real test split as hold-out data, explicit feature typing and z-score scaling.
# Both the shuffle and the experiment are seeded so a re-run gives the same
# folds and models.
setup(train.sample(frac=1, random_state=42),
      target = target,
      test_data = test,
      fold_strategy = "kfold",
      numeric_features = cont_feats,
      categorical_features = cat_feats,
      ordinal_features = ord_feats,
      ignore_features = ignore,
      normalize = True,
      normalize_method = "zscore",
      session_id = 42,
      silent = True, verbose = False)
print("setup complete!")

setup complete!


In [22]:
# Experiment grid: synthesizer classes to train, numbers of synthetic rows to
# draw, the sdmetrics single-table metrics, the classifier ids to fit, and the
# performance measures to record.
models = [
    CopulaGAN,
    GaussianCopula,
    CTGAN,
    TVAE,
]
sample_sizes = [0, 500, 1_000, 10_000, 39_362, 70_000, 100_000]
metric_classes = sdmetrics.single_table.SingleTableMetric.get_subclasses()
classifiers = ["catboost", "dt"]
performance = ["f1", "accuracy", "recall", "precision"]


In [25]:
# Column layout of the results table: model and sample-size identifiers, one
# column per metric name, the classifier id, then four performance columns.
model_name = ["model_name"]
sample_size_name = ["sample_size"]
metric_class_names = list(map(str, metric_classes))
classifier_name = ["classifier_name"]
performance_names = ["f1", "accuracy", "recall", "precision"]
columns = [
    *model_name,
    *sample_size_name,
    *metric_class_names,
    *classifier_name,
    *performance_names,
]
# Start from an empty frame that only carries the column headers.
results = pd.DataFrame(columns=columns)
results

Unnamed: 0,model_name,sample_size,BNLogLikelihood,LogisticDetection,SVCDetection,BinaryDecisionTreeClassifier,BinaryAdaBoostClassifier,BinaryLogisticRegression,BinaryMLPClassifier,MulticlassDecisionTreeClassifier,...,CSTest,KSTest,KSTestExtended,ContinuousKLDivergence,DiscreteKLDivergence,classifier_name,f1,accuracy,recall,precision


In [26]:
def make_score_column(scores, metric_names=None):
    """Build the metric segment of a results row.

    Parameters
    ----------
    scores : object exposing ``.columns`` and ``[]`` lookup (e.g. a DataFrame)
        Scores computed for one synthetic sample.
    metric_names : iterable of str, optional
        Metric names to look up, in output order. Defaults to the
        notebook-level ``metric_class_names``.

    Returns
    -------
    list
        ``scores[name]`` for every name found in ``scores.columns`` and
        ``None`` for every name that is absent, in the order of
        ``metric_names``.
    """
    if metric_names is None:
        metric_names = metric_class_names
    # Read from the argument (not a global) and hand the list back to the caller.
    available = scores.columns
    return [scores[name] if name in available else None for name in metric_names]

In [27]:
def _fit_synthesizer(model_cls, data):
    """Instantiate ``model_cls`` with default arguments, fit it on ``data`` and return it."""
    model = model_cls()
    model.fit(data)
    return model


def make_CopulaGAN(data):
    """Return a CopulaGAN model fitted on ``data``."""
    return _fit_synthesizer(CopulaGAN, data)


def make_GaussianCopula(data):
    """Return a GaussianCopula model fitted on ``data``."""
    return _fit_synthesizer(GaussianCopula, data)


def make_CTGAN(data):
    """Return a CTGAN model fitted on ``data``."""
    return _fit_synthesizer(CTGAN, data)


def make_TVAE(data):
    """Return a TVAE model fitted on ``data``."""
    return _fit_synthesizer(TVAE, data)

In [None]:
# Fit a CopulaGAN synthesizer on the real training split.
copula_gan_model = make_CopulaGAN(train)

In [None]:
# Save the fitted CopulaGAN model to disk so it can be reused without refitting.
copula_gan_model.save('CopulaGAN.pkl')

In [None]:
# Fit a GaussianCopula synthesizer on the real training split.
guassian_copula_model = make_GaussianCopula(train)

In [None]:
# Save the fitted GaussianCopula model to disk so it can be reused without refitting.
guassian_copula_model.save('GuassianCopulaModel.pkl')

In [None]:
# Fit a CTGAN synthesizer on the real training split.
ctgan_model = make_CTGAN(train)

In [None]:
# Save the fitted CTGAN model to disk so it can be reused without refitting.
ctgan_model.save("CTGAN-Model.pkl")

In [None]:
# Fit a TVAE synthesizer on the real training split.
tvae_model = make_TVAE(train)

In [None]:
# Save the fitted TVAE model to disk so it can be reused without refitting.
tvae_model.save("TVAE-Model.pkl")

In [16]:
# Benchmark every synthesizer / sample-size / classifier combination and store
# one row per combination in `results`.
count = 0
for initialize_generative_model in models:
    # Label rows with the bare class name; str(cls) would give "<class '...'>".
    model_column = [initialize_generative_model.__name__]
    trained_model = initialize_generative_model()
    # Keep the instance itself: like the make_* helpers in this notebook, rely
    # on fit() training the model in place rather than on its return value.
    trained_model.fit(train)
    for sample_size in sample_sizes:
        sample_size_column = [sample_size]
        # NOTE(review): sample_sizes contains 0 -- confirm that sampling and
        # scoring zero rows is supported, or skip that entry.
        synthetic_data = trained_model.sample(sample_size)
        # NOTE(review): the previous `score(aggregate())` referenced names that
        # are not defined in this notebook. sdv.evaluation.evaluate (imported at
        # the top) is assumed to be what was meant -- confirm that its
        # per-metric table has the layout make_score_column expects.
        score_aggregate = evaluate(synthetic_data, train, aggregate=False)
        score_column = make_score_column(score_aggregate)
        if score_column is None:
            # Keep the row aligned with the results columns when no scores come back.
            score_column = [None] * len(metric_class_names)
        # NOTE(review): create_model() trains on the data registered by the
        # earlier setup() call, so synthetic_data does not influence the
        # classifiers here -- confirm whether setup() should be re-run with the
        # augmented data for each sample.
        # `clf_id` avoids shadowing the `classifier_name` column-label list.
        for clf_id in classifiers:
            classifier_column = [clf_id]
            classifier = create_model(clf_id)
            predict_model(classifier)  # evaluates on the hold-out data
            holdout_scores = pull()  # score grid of the call just made
            performance_column = holdout_scores.iloc[0][["F1", "Accuracy", "Recall", "Prec."]].tolist()
            results_row = model_column + sample_size_column + score_column + classifier_column + performance_column
            # Write in place: DataFrame.append returns a new frame, so the
            # previous call left `results` empty.
            results.loc[len(results)] = results_row
            count += 1
            print(count)
            
        
        
    

KeyboardInterrupt: 