# Creating new `abalone` data using `CTGAN`
>__NOTE:__ We recommend using the _Python 3 (Data Science)_ kernel on an _ml.m5.4xlarge (16 vCPU + 64 GiB)_ instance type. However, this will incur additional AWS usage costs.

## Install `ctgan`

In [None]:
%%capture
# Install the `ctgan` package with pip; the `%%capture` cell magic above hides pip's output.
!pip install ctgan

## Load the Required Libraries

In [None]:
import io
import boto3
import warnings
import pandas as pd
from time import gmtime, strftime

# Suppress every Python warning for the rest of the session so cell output stays readable.
warnings.filterwarnings("ignore")

# Dataset/model name; it prefixes the S3 keys built further down.
model_name = "abalone"

# Column labels applied to the raw CSV when it is loaded (passed as `names=`).
column_names = [
    "sex",
    "length", "diameter", "height",
    "whole_weight", "shucked_weight", "viscera_weight", "shell_weight",
    "rings",
]

# S3 client used to download the raw data object.
s3 = boto3.client("s3")

## Load the "raw" data

In [None]:
# Resolve the data bucket name from the SSM parameter "AirflowDataBucket".
ssm_response = boto3.client("ssm").get_parameter(Name="AirflowDataBucket")
data_bucket = ssm_response["Parameter"]["Value"]

# S3 keys: the existing raw file, and a new key suffixed with the current
# UTC timestamp (YYYYmmddHHMMSS) for the data generated later.
raw_data_key = f"{model_name}_data/raw/abalone.data"
upload_stamp = strftime("%Y%m%d%H%M%S", gmtime())
new_data_key = f"{model_name}_data/new/abalone.{upload_stamp}"

# Download the raw object and parse it as CSV, labelling the columns with
# `column_names`.
s3_object = s3.get_object(Bucket=data_bucket, Key=raw_data_key)
raw_bytes = s3_object["Body"].read()
raw_df = pd.read_csv(io.BytesIO(raw_bytes), encoding="utf8", names=column_names)

## Fit the CTGAN Model on the `sex` target label

>__NOTE:__ Fitting the `ctgan` model can take up to 5 minutes, depending on the Kernel compute resources.

In [None]:
from ctgan import CTGAN

# Create a synthesizer with the library's default settings and train it on the
# raw data, declaring `sex` as the only discrete column.
ctgan = CTGAN()
ctgan.fit(raw_df, discrete_columns=["sex"])

## Generate `100` samples of "new" data
>__NOTE:__ `100` new samples are used to realistically simulate the potential amount of new daily survey data.

In [None]:
# Draw 100 synthetic rows from the fitted CTGAN model.
samples = ctgan.sample(n=100)

## Compare Datasets
### `raw` dataset

In [None]:
# Summary statistics of the raw dataset, shown for comparison with the generated samples.
raw_df.describe()

### `new` dataset

In [None]:
# Summary statistics of the generated samples, shown for comparison with the raw dataset.
samples.describe()

## Upload the new data to test the Airflow DAG

In [None]:
# Serialize the samples as CSV without a header row or index column, then
# upload them with the boto3 client that was used to download the raw data.
# Writing through an "s3://" URL would instead rely on pandas' optional
# s3fs/fsspec backend, which this notebook never installs.
csv_body = samples.to_csv(header=False, index=False).encode("utf-8")
# Bind the response to a name so the cell does not echo it as output.
upload_response = s3.put_object(Bucket=data_bucket, Key=new_data_key, Body=csv_body)