# PAR (DeepEcho) Implementation

In [None]:
!pip install table_evaluator
!pip install tabulate
!pip install pyathena
!pip install sdv
!pip install deepecho

In [None]:
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor
import pandas as pd
import torch
import numpy as np

In [None]:
class TrafficDataDC:
    """Fetch hard-drive traffic rows from Athena and clean them in a DataFrame.

    Typical use: construct (builds ``self.query``), call ``run_athena_query``
    (fills ``self.df``), then call ``process`` (cleans ``self.df`` in place).
    """

    def __init__(self, date=None, limit=0):
        """Assemble the SELECT statement and print it.

        Args:
            date: Optional value compared against ``month_end`` via
                ``date('<date>')``. Falsy values add no WHERE clause.
            limit: Maximum number of rows. ``0`` or ``None`` means no LIMIT
                clause; any other value must convert to a non-negative int.

        Raises:
            ValueError: If ``limit`` is negative or not convertible to int.
        """
        # Populated by run_athena_query(); None until a query has been run.
        self.df = None

        self.query = "select * from uwdatascience2023.full_harddrivetraffic "
        if date:
            # NOTE(review): `date` is interpolated verbatim into the SQL text;
            # only pass trusted values here.
            self.query += f"WHERE month_end = date('{date}') "
        if limit:
            # Coerce to int so that only a plain number can reach the SQL text.
            row_limit = int(limit)
            if row_limit < 0:
                raise ValueError(f"limit must be non-negative, got {limit!r}")
            self.query += "limit " + str(row_limit)

        print("Executing \n", self.query)

    def run_athena_query(self, print_out=False):
        """Execute ``self.query`` on Athena and store the result in ``self.df``.

        Args:
            print_out: When true, print the resulting frame as a markdown table.
        """
        connection = connect(
            region_name='us-west-2',
            work_group="primary",
            cursor_class=PandasCursor)
        try:
            self.df = connection.cursor().execute(self.query).as_pandas()
        finally:
            # Release the connection even if the query raises.
            connection.close()

        if print_out:
            print(self.df.to_markdown(index=False))

    def fill_data(self):
        """Sort by chunk and time, forward-fill per chunk, drop incomplete rows."""
        self.df = self.df.sort_values(by=['chunk_id', 'timestamp_nano'])
        fill_cols = ['container_group', 'container_encoding', 'chunk_size']
        # Forward-fill inside each chunk_id group so values never leak
        # from one chunk into the next.
        self.df[fill_cols] = self.df.groupby('chunk_id')[fill_cols].ffill()
        # Rows that still contain a missing value in any column are removed.
        self.df.dropna(inplace=True)

    def convert_date(self, col='timestamp_nano', unit='ns', new_col='datetime'):
        """Add ``new_col`` as a datetime parsed from ``col``, then drop ``col``.

        Args:
            col: Source column holding epoch values.
            unit: Unit of the epoch values, passed to ``pd.to_datetime``.
            new_col: Name of the datetime column to create.
        """
        self.df[new_col] = pd.to_datetime(self.df[col], unit=unit)
        self.df.drop(columns=[col], inplace=True)

    def drop_col(self, cols=None):
        """Drop the given columns in place.

        Args:
            cols: Column label(s) to remove. Defaults to ``['month_end']``.
        """
        # A None sentinel avoids sharing one mutable default list across calls.
        if cols is None:
            cols = ['month_end']
        self.df.drop(columns=cols, inplace=True)

    def convert_chunk_int(self, col='chunk_size'):
        """Cast ``col`` to integer dtype.

        Args:
            col: Name of the column to cast.
        """
        self.df[col] = self.df[col].astype(int)

    def process(self):
        """Run all cleaning steps on ``self.df`` in their required order.

        Raises:
            RuntimeError: If no data has been loaded into ``self.df`` yet.
        """
        if getattr(self, 'df', None) is None:
            raise RuntimeError(
                "No data loaded: call run_athena_query() before process()."
            )
        self.fill_data()
        self.convert_date()
        self.drop_col()
        self.convert_chunk_int()


In [None]:
# Request at most one million rows whose month_end is 2022-01-31,
# then execute the query without printing the result table.
s3_traffic = TrafficDataDC(limit=1_000_000, date="2022-01-31")
s3_traffic.run_athena_query()

In [None]:
# Clean s3_traffic.df in place: fill_data, convert_date, drop_col, convert_chunk_int.
s3_traffic.process()

In [None]:
# Show the processed DataFrame as the cell output.
s3_traffic.df

In [None]:
# Keep only the leading rows of the processed frame as training data.
# Despite its name, train_percentage is a fraction: 0.003 is 0.3% of the rows.
train_percentage = 0.003
split_index = int(train_percentage * len(s3_traffic.df))
train_data = s3_traffic.df.iloc[:split_index]

In [None]:
# Show the training subset as the cell output.
train_data

In [None]:
# DeepEcho

from deepecho import PARModel
from deepecho.demo import load_demo

In [None]:
# Explicit column -> data type mapping handed to model.fit(data_types=...).
# Columns without an explicit entry here (they were drafted as 'categorical'
# and then commented out): record_id, location_id, server_id, config_id,
# disk_id, disk_capacity_tb, container_group, container_encoding.
data_types = dict(
    operation='categorical',
    chunk_id='ordinal',
    chunk_size='continuous',
    datetime='datetime',
)

In [None]:
from itertools import islice

# Column(s) that identify one sequence (entity) in the training data.
entity_columns = ['container_id']

# Preview the first two entity groups. islice stops after two groups instead
# of materialising every group into a list just to slice off the rest.
for _, group in islice(train_data.groupby(entity_columns), 2):
    display(group)

In [None]:
# Create the PAR model with 10 training epochs and cuda=False (presumably CPU-only — confirm).
model = PARModel(epochs=10, cuda=False)

In [None]:
# Fit PAR on the training subset. The entity columns come from the
# `entity_columns` variable defined in the preview cell, so the preview and
# the fit always group by the same columns. Rows are ordered within each
# entity by the 'datetime' column.
model.fit(
    data=train_data,
    entity_columns=entity_columns,
    #context_columns=['container_group'],#,'chunk_id','disk_capacity_tb'],
    data_types=data_types,
    sequence_index='datetime'
)

In [None]:
# Sample new data from the fitted model for 5 entities; the result is shown
# as the cell output.
model.sample(num_entities=5)