# Prototype: Generating Fake Abalone Data (CTGAN)

><div class="alert alert-block alert-info"><b>NOTE: </b>We recommend using an <em>ml.m5.4xlarge (16 vCPU + 64 GiB)</em> instance type and the <em>Python 3 (Data Science)</em> kernel to train the <b>CTGAN</b> model.</div>

In [1]:
# Install CTGAN
#!pip install ctgan

In [2]:
import boto3
import io
import pandas as pd
import warnings

# Silence all Python warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Column labels for the 'raw' data. They are supplied to `read_csv` through
# `names=` rather than being taken from the file itself.
names = [
    'sex', 'length', 'diameter', 'height', 'whole_weight',
    'shucked_weight', 'viscera_weight', 'shell_weight', 'rings',
]

# S3 location of the 'raw' data
bucket = 'data-us-east-2-500842391574'
key = 'input/raw/abalone.csv'

# Download the object, read its full body into memory, and parse it as CSV.
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket, Key=key)
csv_bytes = obj['Body'].read()
raw_data = pd.read_csv(io.BytesIO(csv_bytes), encoding='utf8', names=names)

# Preview the first rows (rendered as the cell output).
raw_data.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [3]:
# Train a CTGAN synthesizer on the raw abalone data.
# NOTE(review): newer `ctgan` releases appear to expose this class as `CTGAN`;
# confirm the import against the installed package version.
from ctgan import CTGANSynthesizer

# 'sex' is the only column passed to `fit` as a discrete column.
discrete_columns = ['sex']

ctgan = CTGANSynthesizer()
ctgan.fit(raw_data, discrete_columns)

In [4]:
# Draw 300 synthetic rows from the fitted CTGAN model
n_samples = 300
samples = ctgan.sample(n_samples)

In [5]:
# Summary statistics of the raw data: the baseline that the generated samples
# are compared against.
raw_data.describe()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [6]:
# Summary statistics of the generated samples, for comparison with the raw
# data statistics.
# NOTE(review): in the recorded output, the minimums of whole_weight,
# shucked_weight and viscera_weight are negative, which the raw data never is;
# consider whether the samples need post-processing before use.
samples.describe()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,0.505286,0.407042,0.13727,0.920711,0.324504,0.21219,0.262217,10.05
std,0.138321,0.102988,0.042552,0.540203,0.214242,0.122779,0.151058,3.502627
min,0.080966,0.113664,0.0156,-0.008829,-0.038991,-0.009022,0.016719,4.0
25%,0.425663,0.354496,0.1079,0.515029,0.161297,0.108346,0.134884,8.0
50%,0.538431,0.425371,0.144735,0.858251,0.328957,0.220025,0.268548,9.0
75%,0.612553,0.495828,0.166159,1.282959,0.457227,0.305445,0.358566,12.0
max,0.739048,0.5806,0.223906,2.490661,1.125791,0.557467,0.856696,22.0


In [7]:
# Persist the generated samples as the fake abalone dataset, written to the
# current working directory without the DataFrame index column.
# NOTE(review): the output includes a header row (pandas default), whereas the
# raw file is read with explicit `names=`; confirm what consumers expect.
output_path = 'fake-abalone.csv'
samples.to_csv(output_path, index=False)