In [None]:
#!pip install ydata-synthetic==1.3.1

In [None]:
# !pip install tabulate
# !pip install pyathena

In [None]:
# Importing the necessary modules
import pandas as pd
import matplotlib.pyplot as plt
from ydata_synthetic.synthesizers.timeseries import TimeSeriesSynthesizer
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters

## Get the data

In [None]:
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor
import pandas as pd
import numpy as np

In [None]:
class TrafficDataDC:
    """Fetch hard-drive traffic rows from Athena and prepare them for modelling.

    Intended call order: construct, ``run_athena_query()`` to populate
    ``self.df``, then ``process()`` exactly once.

    Attributes:
        query: SQL text assembled in ``__init__`` and executed by
            ``run_athena_query``.
        df: DataFrame with the query result; ``None`` until a query has run.
    """

    # Columns that fill_data() forward-fills within each chunk_id.
    _FILL_COLS = ('container_group', 'container_encoding', 'chunk_size')

    def __init__(self, date = None, limit = 0):
        """Assemble the SELECT statement (nothing is executed here).

        Args:
            date: Optional value compared against ``month_end``. It is
                interpolated verbatim into the SQL text, so only pass
                trusted values.
            limit: Maximum number of rows. ``0`` or ``None`` adds no LIMIT
                clause; any other value is coerced with ``int()``.

        Raises:
            ValueError: if ``limit`` is negative (or cannot be parsed by
                ``int()``).
        """
        # Defined up front so "no data yet" is an explicit, checkable state.
        self.df = None

        self.query = "select * from uwdatascience2023.full_harddrivetraffic "
        if date:
            self.query += f"WHERE month_end = date('{date}') "
        if limit:
            # int() keeps arbitrary text out of the LIMIT clause.
            limit = int(limit)
            if limit < 0:
                raise ValueError(f"limit must be non-negative, got {limit}")
            self.query += "limit " + str(limit)

        print("Executing \n", self.query)

    def _require_data(self):
        """Return ``self.df``, raising a clear error if no query has run yet."""
        frame = getattr(self, 'df', None)
        if frame is None:
            raise RuntimeError(
                "No data loaded: call run_athena_query() before processing."
            )
        return frame

    def run_athena_query(self, print_out=False):
        """Execute ``self.query`` on Athena and store the result in ``self.df``.

        Args:
            print_out: When true, also print the result as a markdown table.
        """
        cursor = connect(
            region_name='us-west-2',
            work_group="primary",
            cursor_class=PandasCursor).cursor()

        self.df = cursor.execute(self.query).as_pandas()

        if print_out:
            print(self.df.to_markdown(index=False))

    def fill_data(self):
        """Sort by chunk and time, forward-fill per chunk, drop incomplete rows.

        Rows are ordered by ``chunk_id`` then ``timestamp_nano``; the columns
        in ``_FILL_COLS`` are forward-filled inside each ``chunk_id`` group,
        and every row that still contains a missing value is removed.
        """
        frame = self._require_data().sort_values(by=['chunk_id', 'timestamp_nano'])
        per_chunk = frame.groupby('chunk_id')
        # Compute all fills from the same sorted frame before assigning any.
        filled = {name: per_chunk[name].ffill() for name in self._FILL_COLS}
        self.df = frame.assign(**filled).dropna()

    def convert_date(self, col = 'timestamp_nano', unit = 'ns', new_col = 'datetime'):
        """Add ``new_col`` as a datetime parsed from ``col``, then drop ``col``.

        Args:
            col: Source column holding epoch values.
            unit: Unit of the epoch values, forwarded to ``pd.to_datetime``.
            new_col: Name of the datetime column to create.
        """
        frame = self._require_data()
        frame[new_col] = pd.to_datetime(frame[col], unit=unit)
        self.df = frame.drop(columns=[col])

    def drop_col(self, cols = None):
        """Drop ``cols`` from ``self.df``.

        Args:
            cols: Column label or list of labels; defaults to
                ``['month_end']``. ``None`` is used as the sentinel so no
                mutable list is shared between calls.
        """
        if cols is None:
            cols = ['month_end']
        self.df = self._require_data().drop(columns=cols)

    def convert_chunk_int(self, col = 'chunk_size'):
        """Cast ``col`` to ``int`` (expects no missing values in it)."""
        frame = self._require_data()
        frame[col] = frame[col].astype(int)

    def process(self):
        """Run the full preparation pipeline on ``self.df``.

        Order: ``fill_data`` -> ``convert_date`` -> ``drop_col`` ->
        ``convert_chunk_int``. Not re-runnable on the same data: the default
        ``convert_date`` removes ``timestamp_nano``, which ``fill_data``
        needs for sorting.
        """
        self.fill_data()
        self.convert_date()
        self.drop_col()
        self.convert_chunk_int()

In [None]:
# Query up to 200,000 rows whose month_end is 2022-02-28; the result is kept
# on s3_traffic.df and is not printed (print_out defaults to False).
s3_traffic = TrafficDataDC(limit=200000, date="2022-02-28")
s3_traffic.run_athena_query()

In [None]:
# Runs fill_data -> convert_date -> drop_col -> convert_chunk_int on s3_traffic.df.
# Not re-runnable: convert_date drops 'timestamp_nano', which fill_data sorts by,
# so executing this cell twice without re-running the query raises a KeyError.
s3_traffic.process()

In [None]:
# Number of rows left after processing.
s3_traffic.df.shape[0]

## Training DoppelGANger

In [None]:
# Independent training frame without the 'datetime' column; s3_traffic.df is
# left untouched, so this cell can be re-run safely.
df = s3_traffic.df.drop(columns='datetime').copy()
df.shape

In [None]:
# Train on the first 3,360 rows only (a multiple of the sequence_length=20
# used below). NOTE(review): the 33600 // 10 origin is not documented — confirm.
# .copy() detaches the subset from the full frame so the column casts in the
# following cells operate on an independent DataFrame rather than on a slice.
N_TRAIN_ROWS = 33600 // 10
df = df.iloc[:N_TRAIN_ROWS].copy()

In [None]:
#df.shape

In [None]:
# Preview the first 20 training rows.
df.head(n=20)

In [None]:
# Inspect column dtypes before the identifier columns are cast to strings below.
df.dtypes

In [None]:
# Cast the identifier-like columns to strings, one column at a time.
for id_col in ('container_id', 'operation', 'chunk_id'):
    df[id_col] = df[id_col].astype(str)

In [None]:
# chunk_size is the only numeric feature; every other column of df is passed
# to the synthesizer as categorical.
numerical_cols = ['chunk_size']
categorical_cols = list(filter(lambda name: name not in numerical_cols, df.columns))

In [None]:
# Model hyper-parameters handed to the synthesizer at construction time.
model_args = ModelParameters(
    batch_size=100,
    lr=0.001,
    betas=(0.2, 0.9),
    latent_dim=20,
    gp_lambda=1,
    pac=1,
)

# Settings for the training run; chunk_size is the measurement column.
train_args = TrainParameters(
    epochs=400,
    sequence_length=20,
    sample_length=10,
    rounds=1,
    measurement_cols=['chunk_size'],
)

In [None]:
# Instantiate the time-series synthesizer with the DoppelGANger model.
model_dop_gan = TimeSeriesSynthesizer(
    modelname='doppelganger',
    model_parameters=model_args,
)


In [None]:
# Fit on the prepared frame: numerical_cols as numeric features and
# categorical_cols as categorical features.
model_dop_gan.fit(
    df,
    train_args,
    num_cols=numerical_cols,
    cat_cols=categorical_cols,
)