In [None]:
!pip install tabulate
!pip install pyathena

In [None]:
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [None]:
class TrafficDataDC:
    """Query hard-drive traffic rows from Athena and clean them into a DataFrame.

    Typical use is ``TrafficDataDC(date=...)`` -> ``run_athena_query()`` ->
    ``process()``, after which the cleaned result is available as ``self.df``.

    Attributes:
        query: The SQL text built by ``__init__``.
        df: The pandas DataFrame produced by ``run_athena_query`` and then
            modified by the processing methods. It does not exist until
            ``run_athena_query`` has been called (or it is assigned manually).
    """

    def __init__(self, date=None, limit=0):
        """Build the SQL query and print it.

        Args:
            date: Optional value compared against the ``month_end`` column via
                ``date('<date>')``. When falsy, no WHERE clause is added.
            limit: Maximum number of rows to request; ``0`` means no LIMIT
                clause.
        """
        # NOTE(review): `date` and `limit` are interpolated directly into the
        # SQL text. Only pass values from trusted code, never untrusted input.
        self.query = "select disk_capacity_tb, container_group, chunk_id, timestamp_nano from uwdatascience2023.full_harddrivetraffic "
        if date:
            self.query += f"WHERE month_end = date('{date}') "
        if limit != 0:
            self.query += "limit " + str(limit)

        print("Executing \n", self.query)

    def run_athena_query(self, print_out=False):
        """Execute ``self.query`` on Athena and store the result in ``self.df``.

        Connects with region ``us-west-2`` and work group ``primary`` using a
        ``PandasCursor``.

        Args:
            print_out: When true, print the whole result as a markdown table
                (``DataFrame.to_markdown``) before printing the row count.
        """
        cursor = connect(
            region_name='us-west-2',
            work_group="primary",
            cursor_class=PandasCursor).cursor()

        self.df = cursor.execute(self.query).as_pandas()

        if print_out:
            print(self.df.to_markdown(index=False))
        print(f"length of DF : {len(self.df)}")

    def fill_data(self):
        """Sort by time, forward-fill ``container_group`` per chunk, drop NaNs.

        The rows are ordered by ``timestamp_nano`` *before* the forward fill so
        that each missing ``container_group`` is taken from the most recent
        earlier row of the same ``chunk_id``, rather than from whatever row
        happened to precede it in the query result. Rows that still contain
        any missing value afterwards are removed.
        """
        # A stable sort keeps the original relative order of equal timestamps,
        # which makes the fill below deterministic.
        self.df = self.df.sort_values(by=['timestamp_nano'], kind='mergesort')
        self.df['container_group'] = self.df.groupby('chunk_id')['container_group'].ffill()
        # self.df['container_encoding'] = self.df.groupby('chunk_id')['container_encoding'].ffill()
        # self.df['chunk_size'] = self.df.groupby('chunk_id')['chunk_size'].ffill()
        self.df = self.df.dropna()

    def convert_date(self, col='timestamp_nano', unit='ns', new_col='datetime'):
        """Convert an epoch column to datetimes and drop the source column.

        Args:
            col: Name of the numeric epoch column to convert.
            unit: Unit of the values in ``col`` as understood by
                ``pd.to_datetime`` (default nanoseconds).
            new_col: Name of the datetime column to create.
        """
        self.df[new_col] = pd.to_datetime(self.df[col], unit=unit)
        self.df = self.df.drop(columns=[col])

    def drop_col(self, cols=None):
        """Drop the given columns from ``self.df``.

        Args:
            cols: Column labels to remove. ``None`` (the default) or an empty
                list leaves the DataFrame unchanged.
        """
        # A None default avoids sharing one mutable list across calls.
        cols = [] if cols is None else cols
        self.df.drop(columns=cols, inplace=True)

    def convert_chunk_int(self, col='chunk_size'):
        """Cast column ``col`` of ``self.df`` to ``int``.

        Args:
            col: Name of the column to cast. The column must be present in
                ``self.df``; the default query in ``__init__`` does not select
                ``chunk_size``.
        """
        self.df[col] = self.df[col].astype(int)

    def filter_rows(self, date):
        """Keep only rows whose ``datetime`` is on or after ``date``.

        Args:
            date: Cutoff, in any form accepted by ``pd.to_datetime``.
        """
        cutoff_date = pd.to_datetime(date)
        # Keep rows at or after the cutoff; earlier rows are discarded.
        self.df = self.df[self.df['datetime'] >= cutoff_date]

    def process(self, cutoff_date='2021-01-01'):
        """Run the cleaning pipeline on ``self.df``.

        Steps, in order: ``fill_data``, ``convert_date`` (with its defaults),
        then ``filter_rows(cutoff_date)``.

        Args:
            cutoff_date: Rows with a ``datetime`` earlier than this are
                removed. Defaults to ``'2021-01-01'``.
        """
        self.fill_data()
        self.convert_date()
        # self.drop_col()
        self.filter_rows(cutoff_date)
        # self.convert_chunk_int()

In [None]:
from tqdm import tqdm
import os

# Month-end dates (one per month of 2022) used to filter the `month_end` column.
dates = [
    '2022-01-31',
    '2022-02-28',
    '2022-03-31',
    '2022-04-30',
    '2022-05-31',
    '2022-06-30',
    '2022-07-31',
    '2022-08-31',
    '2022-09-30',
    '2022-10-31',
    '2022-11-30',
    '2022-12-31',
]

# Directory that receives one CSV per month; created if it does not exist.
output_directory = "./traffic_data"
os.makedirs(output_directory, exist_ok=True)

# Query, clean and export each month independently so that a failure for one
# month is reported and does not stop the remaining months.
for month_end in tqdm(dates, desc="Processing dates"):
    # Bind the name up front so the cleanup in `finally` can never hit an
    # unbound name when construction itself fails.
    s3_traffic = None
    try:
        s3_traffic = TrafficDataDC(date=month_end, limit=0)
        s3_traffic.run_athena_query(print_out=False)
        s3_traffic.process()
        filename = f"{output_directory}/traffic_data_{month_end}.csv"
        s3_traffic.df.to_csv(filename, index=False)
        print(f"Data for {month_end} saved successfully.")
    except Exception as e:
        # Best-effort batch: report the failure and continue with the next month.
        print(f"An error occurred for date {month_end}: {e}")
    finally:
        # Drop the reference to this month's DataFrame before the next query.
        s3_traffic = None