In [None]:
#!pip install ydata_synthetic

In [None]:
# !pip install tabulate
# !pip install pyathena
# !pip install ctgan

In [None]:
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor
import pandas as pd
import numpy as np

In [None]:
class TrafficDataDC:
    """Fetch rows from ``uwdatascience2023.full_harddrivetraffic`` via Athena and clean them.

    Typical use: construct (builds the SQL text), call ``run_athena_query()``
    (stores the result in ``self.df``), then ``process()`` (cleans ``self.df``
    in place).
    """

    def __init__(self, date = None, limit = 0):
        """Build the SQL query string; nothing is executed here.

        Args:
            date: Optional value for the ``month_end`` filter. Anything
                ``pandas.to_datetime`` can parse from ``str(date)`` is accepted
                and rendered as ``YYYY-MM-DD``. Falsy values mean "no filter".
            limit: Maximum number of rows. ``0`` or ``None`` means no LIMIT clause.

        Raises:
            ValueError: if ``date`` cannot be parsed, or ``limit`` is negative
                or not convertible to an integer.
        """
        # Filled by run_athena_query(); defined here so the attribute always exists.
        self.df = None

        self.query = "select * from uwdatascience2023.full_harddrivetraffic "
        if date:
            # Only the normalised ISO date is interpolated, never the raw input,
            # so stray quotes cannot alter the statement.
            self.query += f"WHERE month_end = date('{self._normalize_date(date)}') "
        row_limit = self._normalize_limit(limit)
        if row_limit != 0:
            self.query += "limit "+str(row_limit)

        print("Executing \n", self.query)

    @staticmethod
    def _normalize_date(date):
        """Return ``date`` as a ``YYYY-MM-DD`` string or raise ValueError."""
        try:
            parsed = pd.to_datetime(str(date))
        except (ValueError, TypeError, OverflowError) as exc:
            raise ValueError(f"date is not a valid calendar date: {date!r}") from exc
        if pd.isna(parsed):
            raise ValueError(f"date is not a valid calendar date: {date!r}")
        return parsed.date().isoformat()

    @staticmethod
    def _normalize_limit(limit):
        """Return ``limit`` as a non-negative int (``None`` -> 0) or raise ValueError."""
        if limit is None:
            return 0
        try:
            value = int(limit)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"limit must be a non-negative integer, got {limit!r}") from exc
        if value < 0:
            raise ValueError(f"limit must be a non-negative integer, got {limit!r}")
        return value

    def run_athena_query(self, print_out=False):
        """Execute ``self.query`` on Athena and store the result in ``self.df``.

        Args:
            print_out: When true, print the resulting frame as a markdown table.
        """
        conn = connect(
            region_name='us-west-2',
            work_group="primary",
            cursor_class=PandasCursor)
        try:
            cursor = conn.cursor()
            try:
                self.df = cursor.execute(self.query).as_pandas()
            finally:
                cursor.close()
        finally:
            # Release the connection even when the query fails.
            conn.close()

        if print_out:
            print(self.df.to_markdown(index=False))

    def fill_data(self):
        """Sort by chunk and time, forward-fill per chunk, then drop incomplete rows.

        ``container_group``, ``container_encoding`` and ``chunk_size`` are
        forward-filled within each ``chunk_id``. Afterwards every row that still
        has a missing value in *any* column is removed.
        """
        self.df = self.df.sort_values(by=['chunk_id','timestamp_nano'])
        for col in ('container_group', 'container_encoding', 'chunk_size'):
            self.df[col] = self.df.groupby('chunk_id')[col].ffill()
        self.df.dropna(inplace=True)

    def convert_date(self, col = 'timestamp_nano', unit = 'ns', new_col = 'datetime'):
        """Add ``new_col`` as ``pd.to_datetime(self.df[col], unit=unit)`` and drop ``col``."""
        self.df[new_col] = pd.to_datetime(self.df[col], unit=unit)
        self.df.drop(columns=[col], inplace=True)

    def drop_col(self, cols = None):
        """Drop ``cols`` from ``self.df``; defaults to ``['month_end']`` when omitted."""
        if cols is None:
            # Resolved here instead of in the signature to avoid a shared mutable default.
            cols = ['month_end']
        self.df.drop(columns=cols, inplace=True)

    def convert_chunk_int(self, col = 'chunk_size'):
        """Cast column ``col`` of ``self.df`` to ``int``."""
        self.df[col] = self.df[col].astype(int)

    def process(self):
        """Run the full cleaning pipeline on ``self.df`` in place.

        Order: ``fill_data`` -> ``convert_date`` -> ``drop_col`` -> ``convert_chunk_int``.

        Raises:
            RuntimeError: if no data has been loaded yet.
        """
        if getattr(self, 'df', None) is None:
            raise RuntimeError("No data loaded: call run_athena_query() before process().")
        self.fill_data()
        self.convert_date()
        self.drop_col()
        self.convert_chunk_int()

In [None]:
# Query Athena for at most 10,000 rows whose month_end is 2022-02-28;
# the result is stored on s3_traffic.df.
s3_traffic = TrafficDataDC(
    limit=10000,
    date="2022-02-28",
)
s3_traffic.run_athena_query(print_out=False)

In [None]:
# Clean s3_traffic.df in place: per-chunk forward-fill, timestamp -> datetime,
# drop month_end, cast chunk_size to int (see TrafficDataDC.process).
s3_traffic.process()

In [None]:
# Number of rows currently held in s3_traffic.df.
s3_traffic.df.shape[0]

In [None]:
# Display the frame held by s3_traffic (process() was called on it in an earlier cell).
s3_traffic.df

In [None]:
# Importing the required libs for the exercise

from os import path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
from ydata_synthetic.preprocessing.timeseries import processed_stock
from ydata_synthetic.synthesizers.timeseries import TimeSeriesSynthesizer

In [None]:
# --- TimeGAN-specific settings ---
seq_len = 24      # forwarded as TrainParameters(sequence_length=...)
n_seq = 6         # forwarded as TrainParameters(number_sequences=...)
# NOTE(review): hidden_dim, gamma and log_step are not referenced by the cells
# visible here; confirm whether later code uses them.
hidden_dim = 24
gamma = 1
log_step = 100

# --- Settings forwarded to ModelParameters ---
noise_dim = 32
dim = 128
batch_size = 128
learning_rate = 5e-4

# Small epoch count for quick prototyping (alternative noted here: epochs=50000).
epochs = 10

gan_args = ModelParameters(
    batch_size=batch_size,
    lr=learning_rate,
    noise_dim=noise_dim,
    layers_dim=dim,
)

train_args = TrainParameters(
    epochs=epochs,
    sequence_length=seq_len,
    number_sequences=n_seq,
)

In [None]:
# Work on an independent copy so edits to df leave s3_traffic.df untouched.
df = s3_traffic.df.copy(deep=True)
df.shape[0]

In [None]:
# Take the leading 70% of rows (by position) as the training slice.
train_percentage = 0.70
split_index = int(train_percentage * len(df))
train_data = df.iloc[:split_index]

In [None]:
# Remove the datetime column before fitting. Rebinding with errors='ignore'
# keeps this cell re-runnable: the in-place drop raised KeyError on a second
# run because the column was already gone.
df = df.drop(columns=['datetime'], errors='ignore')

In [None]:
# Column roles: one numeric feature, the remaining columns are categorical identifiers.
num_cols = ['chunk_size']
cat_cols = [
    'location_id',
    'server_id',
    'config_id',
    'disk_id',
    'container_id',
    'container_group',
    'container_encoding',
    'operation',
    'chunk_id',
]

In [None]:
# Every column name of df, in frame order.
cols = [*df.columns]

In [None]:
# (rows, columns) of the frame that is passed to the synthesizer below.
df.shape

In [None]:
# Train once and cache the model on disk; later runs reuse the saved file.
# NOTE(review): when timegan_temp.pkl exists it is loaded regardless of any
# change to df, gan_args or train_args; delete the file to force retraining.
# NOTE(review): the ".pkl" name suggests a pickle-based format; only load
# files you created yourself. TODO confirm.
# NOTE(review): every column is passed as num_cols and the cat_cols list
# defined earlier is not used here; confirm this is intended.
if not path.exists("timegan_temp.pkl"):
    synth = TimeSeriesSynthesizer(model_parameters=gan_args, modelname="timegan")
    synth.fit(df, train_args, num_cols=cols)
    synth.save("timegan_temp.pkl")
else:
    synth = TimeSeriesSynthesizer.load("timegan_temp.pkl")