# CT GAN Implementation

Outputs and results have been cleared due to an NDA with AWS.

In [None]:
!pip install table_evaluator
!pip install tabulate
!pip install pyathena

In [None]:
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor
import pandas as pd
import numpy as np

In [None]:
def run_athena_query(query, print_out=False, region_name='us-west-2',
                     work_group="primary"):
    """Run a SQL query against Athena and return the result as a DataFrame.

    Args:
        query: SQL text passed verbatim to ``cursor.execute``.
        print_out: When true, also print the result as a Markdown table
            (``DataFrame.to_markdown`` relies on the ``tabulate`` package).
        region_name: AWS region forwarded to ``pyathena.connect``.
        work_group: Athena workgroup forwarded to ``pyathena.connect``.

    Returns:
        The DataFrame produced by the cursor's ``as_pandas()``.
    """
    connection = connect(
        region_name=region_name,
        work_group=work_group,
        cursor_class=PandasCursor)
    try:
        cursor = connection.cursor()
        try:
            df = cursor.execute(query).as_pandas()
        finally:
            # Release the cursor even when the query fails.
            cursor.close()
    finally:
        # Each call opens its own connection, so close it here rather than
        # leaving it to garbage collection.
        connection.close()

    if print_out:
        print(df.to_markdown(index=False))

    return df

In [None]:
# Pull up to 70,000 rows from the hard-drive traffic table.
# NOTE(review): there is no ORDER BY, so which rows LIMIT returns is presumably
# not guaranteed to be the same between runs — confirm if reproducibility matters.
query = "select * from AwsDataCatalog.uwdatascience2023.full_harddrivetraffic limit 70000"

In [None]:
# Execute the query; print_out=False skips the Markdown echo of the result.
athena_df = run_athena_query(query, print_out=False)

In [None]:
# Preview the first rows of the raw result.
athena_df.head()

In [None]:
# Order rows by chunk and then by time, so a forward-fill inside each chunk
# carries the latest earlier value down to the rows that follow it.
df = athena_df.sort_values(by=['chunk_id', 'timestamp_nano'])

# Forward-fill each per-chunk attribute within its own chunk_id group.
for chunk_attribute in ('container_group', 'container_encoding', 'chunk_size'):
    df[chunk_attribute] = df.groupby('chunk_id')[chunk_attribute].ffill()

# Discard every row that still has a missing value in any column.
df.dropna(inplace=True)

In [None]:
# Parse the nanosecond timestamps into pandas datetimes under a new column name.
# pop() hands the values to the parser and removes 'timestamp_nano' from df in
# the same step.
df['datetime_column'] = pd.to_datetime(df.pop('timestamp_nano'), unit='ns')

In [None]:
# Store chunk_size as an integer column.
df['chunk_size'] = df['chunk_size'].astype(int)

In [None]:
# Drop only 'month_end'; 'datetime_column' is kept because later cells still read it.
# NOTE(review): in top-to-bottom order `train_data` is first assigned in a later
# cell (the train split), so this cell presumably ran after that one — confirm
# the intended execution order.
columns_to_delete = ['month_end']
train_data.drop(columns=columns_to_delete, inplace=True)

In [None]:
# Group by 'container_group' and collect the DISTINCT container_ids of each group.
# De-duplicating matters because the ids are later drawn with random.sample:
# with repeated ids in the list, a single draw could return the same container
# more than once.
grouped_data = (
    train_data.groupby('container_group')['container_id']
    .agg(lambda ids: ids.unique().tolist())
    .reset_index()
)

# Dictionary: container_group -> list of its distinct container_ids.
container_dict = dict(zip(grouped_data['container_group'], grouped_data['container_id']))

In [None]:
# Keep only the calendar date of each timestamp (time of day reset to midnight),
# with the column name and datetime dtype unchanged.
# dt.normalize() does this directly, avoiding the slower format-to-string /
# parse-back-to-datetime round trip.
train_data['datetime_column'] = train_data['datetime_column'].dt.normalize()

In [None]:
# Summary statistics of the cleaned frame.
df.describe()

In [None]:
# Candidate feature columns.
# NOTE(review): this list is not referenced again in the visible notebook, and it
# still names 'timestamp_nano', which an earlier cell removes from df — confirm
# whether it is stale.
features = ['timestamp_nano', 'location_id', 'server_id', 'config_id', 'disk_id', 'container_id', 'container_group', 'container_encoding', 'operation', 'chunk_id', 'chunk_size']

In [None]:
# Fraction of rows, taken from the top of df, that form the training set;
# 1 means every row is used.
train_percentage = 1
split_index = int(len(df) * train_percentage)
# .copy() gives train_data its own storage: other cells modify it in place
# (dropping and overwriting columns), which on a bare iloc slice can trigger
# SettingWithCopyWarning and leaves it ambiguous whether df is affected.
train_data = df.iloc[:split_index, :].copy()

In [None]:
# Inspect the column dtypes of the training frame.
train_data.dtypes

In [None]:
!pip install sdv

In [None]:
from sdv.metadata import SingleTableMetadata

In [None]:
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(train_data)

In [None]:
metadata

In [None]:
metadata.update_column(
    column_name='container_id',
    sdtype='categorical')

In [None]:
metadata.update_column(
    column_name='chunk_id',
    sdtype='categorical')

In [None]:
metadata.update_column(
    column_name='disk_capacity_tb',
    sdtype='categorical')

In [None]:
from sdv.single_table import CTGANSynthesizer

In [None]:
# Number of training rows.
len(train_data)

In [None]:
# Configure the CTGAN synthesizer: 30 training epochs, progress output during
# fit, and rounding enforcement switched off.
synthesizer = CTGANSynthesizer(metadata, 
                               enforce_rounding=False,
                               epochs=30,
                               verbose=True)

In [None]:
# Train on the full training frame.
synthesizer.fit(train_data)

In [None]:
# Draw 500 synthetic rows, generated in batches of 10.
s_data = synthesizer.sample(num_rows=500, batch_size=10)

In [None]:
# Display the synthetic sample.
s_data

In [None]:
# Non-null value counts per column for each container_group in the real data.
train_data.groupby('container_group').count()

In [None]:
# The same breakdown for the synthetic sample, for side-by-side comparison.
s_data.groupby('container_group').count()

In [None]:
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdv.evaluation.single_table import get_column_plot

# Step 1: basic validity checks of the synthetic sample against the real data.
diagnostic = run_diagnostic(
    real_data=train_data,
    synthetic_data=s_data,
    metadata=metadata,
)

# Step 2: statistical similarity between the real and the synthetic data.
quality_report = evaluate_quality(
    real_data=train_data,
    synthetic_data=s_data,
    metadata=metadata,
)

In [None]:
# Real vs. synthetic comparison plot for 'container_group'.
fig = get_column_plot(
    real_data=train_data,
    synthetic_data=s_data,
    metadata=metadata,
    column_name='container_group'
)
    
fig.show()

In [None]:
# Real vs. synthetic comparison plot for 'config_id'.
fig = get_column_plot(
    real_data=train_data,
    synthetic_data=s_data,
    metadata=metadata,
    column_name='config_id'
)
    
fig.show()

In [None]:
# Real vs. synthetic comparison plot for 'disk_capacity_tb', with a custom title.
fig = get_column_plot(
    real_data=train_data,
    synthetic_data=s_data,
    metadata=metadata,
    column_name='disk_capacity_tb'
    
)
fig.update_layout(title='Real vs. Synthetic Data for Disk_capacity_TB')    
fig.show()

In [None]:
# Real vs. synthetic comparison plot for the 'operation' column.
fig_operation = get_column_plot(
    real_data=train_data,
    synthetic_data=s_data,
    metadata=metadata,
    column_name='operation',
)

# Keep the x tick labels horizontal and enlarge their font, in one layout update.
fig_operation.update_layout(
    xaxis=dict(tickangle=0, tickfont=dict(size=17)),
)
fig_operation.show()

In [None]:
# Inspect the marker colour of the first trace of the 'operation' plot.
# NOTE(review): plotly.graph_objects (`go`) is imported here but not used in the
# visible notebook, and get_column_plot was already imported in an earlier cell.
import plotly.graph_objects as go
from sdv.evaluation.single_table import get_column_plot

# Rebuild the plot; this rebinds fig_operation to a fresh figure without the
# layout tweaks applied to the previous one.
fig_operation = get_column_plot(
    real_data=train_data,
    synthetic_data=s_data,
    metadata=metadata,
    column_name='operation'
)

# Read the marker colour set on the first trace of the figure.
default_color = fig_operation.data[0].marker.color

print("Default color of the bar chart:", default_color)

In [None]:
# Real vs. synthetic comparison plot for 'chunk_size'.
fig = get_column_plot(
    real_data=train_data,
    synthetic_data=s_data,
    metadata=metadata,
    column_name='chunk_size'
)    
fig.show()

In [None]:
!pip install sdmetrics

In [None]:
from sdmetrics.reports.single_table import QualityReport

In [None]:
report = QualityReport()

In [None]:
train_data_copy = train_data

In [None]:
train_data_copy.columns

In [None]:
s_data_copy = s_data

In [None]:
metadata_copy = SingleTableMetadata()
metadata_copy.detect_from_dataframe(train_data_copy)

In [None]:
metadata_copy.update_column(
    column_name='container_id',
    sdtype='categorical')

In [None]:
metadata_copy.update_column(
    column_name='chunk_id',
    sdtype='categorical')

In [None]:
metadata_copy.update_column(
    column_name='disk_capacity_tb',
    sdtype='categorical')

In [None]:
metadata_copy

In [None]:
report.generate(train_data_copy.sample(n=2000), s_data_copy, metadata_copy.to_dict())

In [None]:
report.get_properties()

In [None]:
report.get_details(property_name='Column Shapes')

In [None]:
from sdmetrics.reports.single_table import QualityReport

In [None]:
fig = report.get_visualization(property_name='Column Shapes')

# Drop any trace named 'month_end'.
# NOTE(review): the recolouring below looks traces up by metric name
# ('TVComplement'), which suggests traces are named per metric rather than per
# column, so this filter may never match — confirm.
filtered_data = [trace for trace in fig.data if trace.name != 'month_end']
fig.data = filtered_data

# Read the Column Shapes score from the report instead of hard-coding it, so the
# title cannot go stale when the model or the evaluation sample changes.
properties = report.get_properties()
column_shapes_score = properties.loc[
    properties['Property'] == 'Column Shapes', 'Score'
].iloc[0]
fig.update_layout(
    title=f'Data Quality CTGAN: Column Shapes (Average Score = {column_shapes_score:.2f})'
)

# Recolour the bars of the first trace named 'TVComplement', if there is one.
for trace in fig.data:
    if trace.name == 'TVComplement':
        trace.marker.color = '#00e1c9'
        break

fig.show()



Conditional sampling for the synthetic data

In [None]:
# Conditional Sampling
from sdv.sampling import Condition
from datetime import datetime, timedelta
from datetime import timedelta
import random

In [None]:
# Wrapper function that turns the sampling request into a list of SDV conditions.
def generate_conditions(disk_capacity, container_groups, num_rows):
    """Build one ``Condition`` per randomly chosen container_id.

    Args:
        disk_capacity: Value every condition fixes for ``disk_capacity_tb``.
        container_groups: Mapping of container_group -> how many distinct
            container_ids to draw from that group.
        num_rows: Number of rows requested by each individual condition.

    Returns:
        A flat list of ``Condition`` objects, one per drawn container_id.

    Raises:
        KeyError: If a requested group is not a key of ``container_dict``.
        ValueError: If a group has fewer distinct container_ids than requested.
    """
    conditions = []
    for container_group, container_id_types in container_groups.items():
        # The module-level container_dict may list the same id many times;
        # de-duplicate (keeping first-seen order) so that a single draw cannot
        # return the same container twice.
        candidate_ids = list(dict.fromkeys(container_dict[container_group]))
        if container_id_types > len(candidate_ids):
            raise ValueError(
                f"container_group {container_group!r} has only "
                f"{len(candidate_ids)} distinct container_id(s); "
                f"{container_id_types} requested"
            )
        container_ids = random.sample(candidate_ids, container_id_types)

        conditions.extend(
            Condition(
                num_rows=num_rows,
                column_values={
                    'disk_capacity_tb': disk_capacity,
                    'container_group': container_group,
                    'container_id': container_id,
                },
            )
            for container_id in container_ids
        )

    return conditions

In [None]:
# Inspect the value ranges of the columns relevant to conditional sampling.
train_data['datetime_column'].describe()

In [None]:
train_data['disk_capacity_tb'].describe()

In [None]:
# Distinct container_group values present in the training data.
list(train_data['container_group'].unique())

In [None]:
# Parameters for the conditional sampling request.
# NOTE(review): start_date and end_date are not passed to generate_conditions or
# used anywhere else in the visible notebook — confirm whether a date condition
# was intended.
start_date = datetime.strptime('2022-02-01', '%Y-%m-%d')
end_date = datetime.strptime('2022-02-02', '%Y-%m-%d')
disk_capacity = 16
# Mapping of container_group -> number of container_ids to draw from it:
# 2 container_ids from the first group and 3 from the second.
container_groups = {'02892102A8F17B5A551466B444222F4C3D9A399F':2, 'CC21F742BC91C1A0ED11A719D5C2CE74690BCD44':3}
# Rows requested per generated condition.
num_rows = 100

In [None]:
# Build the list of conditions (one per drawn container_id).
conditions = generate_conditions(disk_capacity, container_groups, num_rows)

In [None]:
# Number of conditions generated.
len(conditions)

In [None]:
# Sample synthetic rows that satisfy the conditions.
conditional_synthetic_data = synthesizer.sample_from_conditions(conditions=conditions)

In [None]:
conditional_synthetic_data

In [None]:
# Shows the 'Column Shapes' details of the existing `report` object again; this
# cell does not evaluate conditional_synthetic_data.
report.get_details(property_name='Column Shapes')