In [1]:
from pycaret.classification import * # Preprocessing, modelling, interpretation, deployment...
import pandas as pd # Basic data manipulation
from sklearn.model_selection import train_test_split # Data split
from sdv.tabular import CopulaGAN, GaussianCopula, CTGAN, TVAE # Synthetic data
from sdv.evaluation import evaluate # Evaluate synthetic data
import sdmetrics

In [2]:
import sys

# Make the local sd_benchmarking/ directory importable for the modules
# imported in later cells (presumably utils, task and benchmark live there —
# confirm). The membership check keeps the cell idempotent: re-running it no
# longer appends a duplicate entry to sys.path each time.
if 'sd_benchmarking/' not in sys.path:
    sys.path.append('sd_benchmarking/')

In [3]:
import utils

In [5]:
# Split the raw insurance dataset with "charges" as the target.
# Presumably writes train.csv / test.csv under regression_data/ — the next
# cell reads regression_data/train.csv back in.
utils.split_data(
    dataset_path="insurance.csv",
    output_directory="regression_data/",
    train_filename="train.csv",
    test_filename="test.csv",
    target_name="charges",
)

In [2]:
# Load the training split; this frame is what every generator below is fitted on.
# NOTE(review): the dtypes output lists an "Unnamed: 0" column, which looks like
# a saved row index being treated as a feature — confirm whether it should be
# dropped before fitting (and whether the benchmark modules expect it).
df = pd.read_csv(filepath_or_buffer="regression_data/train.csv")

In [3]:
# Preview the first ten rows to sanity-check the loaded columns.
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,25,59,female,27.72,3,no,southeast,14001.1338
1,336,60,male,25.74,0,no,southeast,12142.5786
2,47,28,female,34.77,0,no,northwest,3556.9223
3,106,19,female,28.4,1,no,southwest,2331.519
4,1269,45,male,27.5,3,no,southwest,8615.3
5,55,58,male,36.955,2,yes,northwest,47496.49445
6,649,58,female,32.965,0,no,northeast,12430.95335
7,213,34,female,26.73,1,no,southeast,5002.7827
8,120,44,male,37.1,2,no,southwest,7740.337
9,697,41,male,35.75,1,yes,southeast,40273.6455


In [4]:
# Show the dtype pandas inferred for each column of the training frame.
df.dtypes

Unnamed: 0      int64
age             int64
sex            object
bmi           float64
children        int64
smoker         object
region         object
charges       float64
dtype: object

In [8]:
# Fit a GaussianCopula model on the training frame and save it to disk.
# NOTE(review): the file name is spelled "Guassian" and is written to the
# working directory, while the task-creation cell points path_to_generators at
# "regression_generators/" — confirm where the benchmark expects this file.
gaussian_copula = GaussianCopula()
data = df  # `data` is kept as a notebook-level alias of the training frame
gaussian_copula.fit(data)
gaussian_copula.save('GuassianCopulaModel.pkl')

In [5]:
# Fit a TVAE model on the training frame and save it to disk.
# NOTE(review): saved to the working directory, while the task-creation cell
# points path_to_generators at "regression_generators/" — confirm the
# expected location.
tvae = TVAE()
data = df  # `data` is kept as a notebook-level alias of the training frame
tvae.fit(data)
tvae.save('TvaeModel.pkl')

In [6]:
# Fit a CTGAN model on the training frame and save it to disk.
# NOTE(review): saved to the working directory, while the task-creation cell
# points path_to_generators at "regression_generators/" — confirm the
# expected location.
ctgan = CTGAN()
data = df  # `data` is kept as a notebook-level alias of the training frame
ctgan.fit(data)
ctgan.save('CtganModel.pkl')

In [None]:
import task
import benchmark

In [None]:
# Directories used by the benchmark cells: where results and task files go,
# and where the fitted generators are looked up.
results_output_path = "results/"
task_output_path = "tasks/"
path_to_generators = "regression_generators/"

# Build the benchmark tasks for the "charges" target using the train/test
# splits and the three listed pycaret model ids.
# NOTE(review): the notebook imports pycaret.classification at the top while
# is_regression=True is passed here — confirm the task module picks the
# regression API itself.
tasks = task.create_tasks(
    train_dataset="regression_data/train.csv",
    test_dataset="regression_data/test.csv",
    target="charges",
    path_to_generators=path_to_generators,
    pycaret_models=["lr", "ridge", "kr"],
    task_sampling_method="uniform",
    run_num=1,
    output_dir=task_output_path,
    is_regression=True,
)

In [None]:
# Run the benchmark over all tasks; the call returns a results frame and a
# collection of tasks that failed.
result_df, failed_tasks = benchmark.benchmark(
    tasks,
    agnostic_metrics=False,
    output_path=results_output_path,
)

# Report each failed task on its own line under a header.
print("failed tasks:")
for failed_task in failed_tasks:
    print(failed_task)