In [53]:
from flaml import AutoML
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error as mse
from sdv.tabular.copulagan import CopulaGAN
from scipy.stats import ks_2samp

In [39]:
# Create the FLAML AutoML object that the later cells fit and query.
automl = AutoML()
# Keyword arguments forwarded to automl.fit(): search goal and constraints.
automl_settings = dict(
    time_budget=30,  # in seconds
    metric='mse',
    ensemble=True,
    model_history=True,
    task='regression',
    log_file_name="california.log",
)

In [40]:
# Load the California housing features (X) and target (y) as pandas objects.
X, y = fetch_california_housing(as_frame=True, return_X_y=True)

# Hold out 30% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=440,
)

In [41]:
# Run the AutoML search on the training split using the settings defined earlier.
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

[flaml.automl: 12-01 13:43:42] {2599} INFO - task = regression
[flaml.automl: 12-01 13:43:42] {2601} INFO - Data split method: uniform
[flaml.automl: 12-01 13:43:42] {2604} INFO - Evaluation method: holdout
[flaml.automl: 12-01 13:43:42] {2726} INFO - Minimizing error metric: mse
[flaml.automl: 12-01 13:43:42] {2870} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl: 12-01 13:43:42] {3166} INFO - iteration 0, current learner lgbm
  sampled_y_train = self.y_train[:sample_size]
[flaml.automl: 12-01 13:43:42] {3296} INFO - Estimated sufficient time budget=280s. Estimated necessary time budget=2s.
[flaml.automl: 12-01 13:43:42] {3343} INFO -  at 0.1s,	estimator lgbm's best error=0.9431,	best estimator lgbm's best error=0.9431
[flaml.automl: 12-01 13:43:42] {3166} INFO - iteration 1, current learner lgbm
  sampled_y_train = self.y_train[:sample_size]
[flaml.automl: 12-01 13:43:42] {3343} INFO -  at 0.1s,	estimator lgbm's best e

In [42]:
# Persist the fitted AutoML object; the returned list of written files is the cell output.
joblib.dump(value=automl, filename='regression.sav')

['regression.sav']

In [43]:
# Predict on the held-out split; the cell's displayed value is the test MSE.
y_pred = automl.predict(X_test)
mse(y_true=y_test, y_pred=y_pred)

0.2084564226625747

In [48]:
# Put the features and the target side by side in a single frame.
df = pd.concat([X, y], axis="columns")

# Fit a CopulaGAN synthesizer (default configuration) on the combined frame.
synth = CopulaGAN()
synth.fit(df)

In [49]:
# Write the fitted synthesizer to disk so it can be reloaded later.
synth.save(path='new_data_generator.sav')

In [50]:
# Draw 200 synthetic rows from the fitted synthesizer.
sample = synth.sample(num_rows=200)

  data.iloc[:, 1] = np.argmax(column_data[:, 1:], axis=1)
  data.iloc[:, 1] = np.argmax(column_data[:, 1:], axis=1)
  data.iloc[:, 1] = np.argmax(column_data[:, 1:], axis=1)
  data.iloc[:, 1] = np.argmax(column_data[:, 1:], axis=1)
  data.iloc[:, 1] = np.argmax(column_data[:, 1:], axis=1)
  data.iloc[:, 1] = np.argmax(column_data[:, 1:], axis=1)
  data.iloc[:, 1] = np.argmax(column_data[:, 1:], axis=1)
  data.iloc[:, 1] = np.argmax(column_data[:, 1:], axis=1)
  data.iloc[:, 1] = np.argmax(column_data[:, 1:], axis=1)


In [55]:
# Compare each column of df with the same column of the synthetic sample using a
# two-sample Kolmogorov-Smirnov test, printing one p-value per column in
# df.columns order.
for column in df.columns:
    ks_result = ks_2samp(df[column], sample[column])
    print(ks_result.pvalue)

0.03490354775840576
0.002394563536377684
7.959385269700619e-10
4.869957033247147e-125
5.9085637611082335e-05
4.336685154856057e-21
0.015344257890275067
0.014131388133977841
0.23996903856240903


In [57]:
# Recombine the held-out features and target, then export them as CSV
# (the row index is written too, as with the default to_csv settings).
df_test = pd.concat([X_test, y_test], axis="columns")
df_test.to_csv(path_or_buf='df_test.csv')