In [10]:
from pycaret.classification import * # Preprocessing, modelling, interpretation, deployment...
import pandas as pd # Basic data manipulation
import dabl as db # Summary plot
from sklearn.model_selection import train_test_split # Data split
from sdv.tabular import CopulaGAN, GaussianCopula, CTGAN, TVAE # Synthetic data
from sdv.evaluation import evaluate # Evaluate synthetic data
import sdmetrics

In [11]:
# Load the raw appointment records and print a schema summary
# (column names, non-null counts, dtypes and memory usage).
csv_path = 'KaggleV2-May-2016.csv'
original_data = pd.read_csv(csv_path)
original_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [12]:
def extract_features(dataset):
    """Expand the two datetime columns into calendar features, in place.

    Reads the datetime columns 'AppointmentDay' and 'RegisterDay', adds
    year / month / day / day-name columns for each (plus the hour for
    'RegisterDay'), adds 'waiting_time' in whole calendar days, and finally
    drops the two source columns.

    The frame is modified in place and nothing is returned. Because the source
    columns are dropped, calling this twice on the same frame raises KeyError.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain datetime64 columns 'AppointmentDay' and 'RegisterDay'.
    """
    appointment = dataset['AppointmentDay']
    registration = dataset['RegisterDay']

    # Calendar parts of the appointment date.
    dataset['Appointment_year'] = appointment.dt.year
    dataset['Appointment_month'] = appointment.dt.month
    dataset['Appointment_day'] = appointment.dt.day
    dataset['Appointment_day_name'] = appointment.dt.day_name()
    #appointment hour is always 0 so we leave it out

    # Calendar parts of the registration timestamp (this one does carry a time of day).
    dataset['Register_year'] = registration.dt.year
    dataset['Register_month'] = registration.dt.month
    dataset['Register_day'] = registration.dt.day
    dataset['Register_day_name'] = registration.dt.day_name()
    dataset['Register_hour'] = registration.dt.hour

    # Compare calendar dates, not raw timestamps: the appointment is at midnight
    # while the registration has a time of day, and Timedelta `.dt.days` floors,
    # so the raw difference yields -1 for same-day bookings and is one day short
    # for most other rows.
    dataset['waiting_time'] = (appointment.dt.normalize() - registration.dt.normalize()).dt.days

    dataset.drop(columns=['AppointmentDay', 'RegisterDay'], inplace=True)
    
    
def convert_datetime(dataset):
    """Rename two columns and parse the date columns, modifying `dataset` in place.

    'Handcap' becomes 'Handicap' and 'ScheduledDay' becomes 'RegisterDay'.
    'AppointmentDay' and 'RegisterDay' are then parsed with pd.to_datetime and
    any timezone information is removed. Returns None.
    """
    dataset.rename(
        columns={'Handcap': 'Handicap', "ScheduledDay": "RegisterDay"},
        inplace=True,
    )
    for date_column in ('AppointmentDay', 'RegisterDay'):
        parsed = pd.to_datetime(dataset[date_column])
        dataset[date_column] = parsed.dt.tz_localize(None)
# Order matters: convert_datetime renames 'ScheduledDay' to 'RegisterDay' and parses
# both date columns, which extract_features then reads through the .dt accessor.
# Both helpers modify original_data in place.
convert_datetime(original_data)
extract_features(original_data)

In [13]:
# Hold out 40% of the real data as a test set; the fixed seed keeps the split repeatable.
train, test, target_train, target_test = train_test_split(
    original_data.drop(columns="No-show"),
    original_data["No-show"],
    test_size=0.4,
    random_state=42,
)
# Re-attach the labels so each frame carries its own target column.
train["No-show"] = target_train
test["No-show"] = target_test

In [15]:
# Column to predict
target = "No-show"

# Continuous / numeric features
cont_feats = [
    "Age",
    "Appointment_year", "Appointment_month", "Appointment_day",
    "Register_year", "Register_month", "Register_day", "Register_hour",
    "waiting_time",
]

# Ordinal features (none declared)
ord_feats = dict()

# Categorical features
cat_feats = [
    "Gender",
    "Scholarship", "Hipertension", "Diabetes", "Alcoholism", "Handicap",
    "SMS_received",
    "Appointment_day_name", "Register_day_name",
]

# Identifier columns to leave out of modelling
ignore = ["PatientId", "AppointmentID"]

In [17]:
# Initialise the PyCaret experiment on the shuffled training frame, with `test` as the
# hold-out set. The shuffle and the experiment are both seeded (matching the
# random_state used for the train/test split) so that a fresh Restart & Run All
# reproduces the same row order and model results.
setup(train.sample(frac=1, random_state=42),
      target = target,
      test_data = test,
      fold_strategy = "kfold",
      numeric_features = cont_feats,
      categorical_features = cat_feats,
      ordinal_features = ord_feats,
      ignore_features = ignore,
      normalize = True,
      normalize_method = "zscore",
      session_id = 42,
      silent = True, verbose = False)
print("setup complete!")

setup complete!


In [25]:
# Synthetic-data generator classes to compare (instantiated inside the experiment loop)
models = [
    CopulaGAN,
    GaussianCopula,
    CTGAN,
    TVAE,
]
# Number of synthetic rows to draw from each fitted generator
sample_sizes = [
    0,
    500,
    1000,
    10_000,
    39_362,
    70_000,
    100_000,
]
# Single-table metrics known to sdmetrics
metric_classes = sdmetrics.single_table.SingleTableMetric.get_subclasses()
# PyCaret model identifiers of the classifiers to train
classifiers = [
    "catboost",
    "dt",
]
# Classification performance measures to record
performance = [
    "f1",
    "accuracy",
    "recall",
    "precision",
]


In [31]:
# Header of the results table: generator, sample size, one column per sdmetrics
# metric, classifier, then the classification performance measures.
model_name = ["model_name"]
sample_size_name = ["sample_size"]
metric_class_names = list(map(str, metric_classes))
classifier_name = ["classifier_name"]
performance_names = ["f1","accuracy", "recall", "precision"]
columns = [
    *model_name,
    *sample_size_name,
    *metric_class_names,
    *classifier_name,
    *performance_names,
]
# Start with an empty frame that only carries the header.
results = pd.DataFrame(columns=columns)
results

Unnamed: 0,model_name,sample_size,BNLogLikelihood,LogisticDetection,SVCDetection,BinaryDecisionTreeClassifier,BinaryAdaBoostClassifier,BinaryLogisticRegression,BinaryMLPClassifier,MulticlassDecisionTreeClassifier,...,CSTest,KSTest,KSTestExtended,ContinuousKLDivergence,DiscreteKLDivergence,classifier_name,f1,accuracy,recall,precision


In [29]:
# Show the metric names that form the per-metric columns of the results table.
metric_class_names

['BNLogLikelihood',
 'LogisticDetection',
 'SVCDetection',
 'BinaryDecisionTreeClassifier',
 'BinaryAdaBoostClassifier',
 'BinaryLogisticRegression',
 'BinaryMLPClassifier',
 'MulticlassDecisionTreeClassifier',
 'MulticlassMLPClassifier',
 'LinearRegression',
 'MLPRegressor',
 'GMLogLikelihood',
 'CSTest',
 'KSTest',
 'KSTestExtended',
 'ContinuousKLDivergence',
 'DiscreteKLDivergence']

In [None]:
def make_score_column(scores, metric_names=None):
    """Align a table of metric scores with the metric columns of the results table.

    Parameters
    ----------
    scores : object exposing ``.columns`` and ``scores[name]`` lookup
        Presumably a pandas DataFrame with one column per metric name --
        TODO confirm against whatever produces the scores.
    metric_names : iterable of str, optional
        Metric names to look up, in output order. Defaults to the module-level
        ``metric_class_names``.

    Returns
    -------
    list
        One entry per metric name: ``scores[name]`` when that name is among
        ``scores.columns``, otherwise ``None``. For a DataFrame, ``scores[name]``
        is the whole column (a Series), not a scalar.
    """
    if metric_names is None:
        metric_names = metric_class_names
    available = scores.columns
    # Use the argument (not a global) and actually hand the list back to the caller.
    return [scores[name] if name in available else None for name in metric_names]

In [34]:
# Run every generator x sample size x classifier combination and record one result row each.
#
# NOTE(review): `synthetic_data` is only scored here; it is never passed to setup() or
# create_model(), so the classifiers are trained on whatever the earlier setup() call
# registered, regardless of sample_size. Confirm whether setup() should be re-run on the
# training data augmented with the synthetic rows.
# NOTE(review): sample_sizes contains 0 -- confirm the generators and the evaluation
# accept an empty sample, or treat 0 as a "real data only" baseline.
count = 0
result_rows = []  # rows are collected here; DataFrame.append returns a new frame rather than mutating
for initialize_generative_model in models:
    # Label the rows with the generator class name (`model` is not bound yet at this point).
    model_column = [initialize_generative_model.__name__]
    model = initialize_generative_model()
    # Original author's reminder: make sure `train` holds the right columns.
    # fit() trains the model object itself, so keep sampling from `model`, not from fit()'s return value.
    model.fit(data=train)
    for sample_size in sample_sizes:
        sample_size_column = [sample_size]
        synthetic_data = model.sample(sample_size)
        # NOTE(review): the original line was `score(aggregate())`, which references undefined
        # names. `evaluate` (imported from sdv.evaluation) with aggregate=False is presumably
        # what was meant -- confirm that the returned table has the layout make_score_column
        # expects (metric names as columns).
        score_aggregate = evaluate(synthetic_data, train, aggregate=False)
        score_column = make_score_column(score_aggregate)
        # `clf_id` rather than `classifier_name`, which is the header list used to build `columns`.
        for clf_id in classifiers:
            classifier_column = [clf_id]
            classifier = create_model(clf_id)  # train the classifier identified by clf_id
            pred_holdout = predict_model(classifier)
            # predict_model returns the predictions; the hold-out metric grid it displays is
            # retrieved with pull().
            holdout_metrics = pull()
            performance_column = holdout_metrics[["F1", "Accuracy", "Recall", "Prec."]].iloc[0].tolist()
            results_row = model_column + sample_size_column + score_column + classifier_column + performance_column
            result_rows.append(results_row)
            count += 1
            print(count)
# Build the table once, reusing the header of the empty `results` frame; this keeps the
# cell idempotent on re-run.
results = pd.DataFrame(result_rows, columns=results.columns)
            
        
        
    

KeyboardInterrupt: 