In [26]:
import logging, json
import boto3
import pandas as pd
from io import StringIO,BytesIO # to read csv using s3 storage
from datetime import datetime, timedelta 

In [48]:
def return_objects(bucket, arg_date, source_date_format):
    """Return whatever ``return_date_list`` produces for the same arguments.

    This is a pure pass-through: all three arguments are forwarded
    unchanged and the result is handed back as-is.
    """
    date_strings = return_date_list(bucket, arg_date, source_date_format)
    return date_strings

def list_files_in_prefix(bucket, prefix):
    """Return a list of everything ``bucket.objects.filter(Prefix=prefix)`` yields.

    Args:
        bucket: Object exposing ``objects.filter(Prefix=...)``.
        prefix: Value passed as the ``Prefix`` keyword of the filter call.

    Returns:
        A new list with the filtered items in iteration order.
    """
    return list(bucket.objects.filter(Prefix=prefix))

def return_date_list(bucket, arg_date, source_date_format):
    """Return date strings from the day before ``arg_date`` through today.

    Args:
        bucket: Unused here; kept so the signature stays compatible with
            existing callers.
        arg_date: Start date as a string in ``source_date_format``.
        source_date_format: ``strptime``/``strftime`` format used both to
            parse ``arg_date`` and to render the returned strings.

    Returns:
        One formatted string per calendar day, starting one day before
        ``arg_date`` and ending on today's date (inclusive). The list is
        empty when that start day lies after today.
    """
    # Start one day early: the extra day is included in the returned range.
    min_date = datetime.strptime(arg_date, source_date_format).date() - timedelta(days=1)
    today_date = datetime.today().date()
    day_count = (today_date - min_date).days + 1
    # Named `date_strings` rather than `list` so the builtin is not shadowed.
    date_strings = [
        (min_date + timedelta(days=offset)).strftime(source_date_format)
        for offset in range(day_count)
    ]
    return date_strings

#     arg_date_dt = datetime.strptime(arg_date,source_date_format).date() - timedelta(days=1) 
#     objects = [obj for obj in bucket.objects.all() if datetime.strptime(str(obj.key).split('/')[0], '%Y-%m-%d').date() >= arg_date_dt]
#     return objects

In [17]:
def return_report_1(objects, bucket, cols_needed):
    """Load the source data and turn it into report 1.

    Args:
        objects: Forwarded as the ``prefix`` argument of ``return_inital_df``.
        bucket: Source bucket handed to ``return_inital_df``.
        cols_needed: Column labels handed to ``transport_report_1``.

    Returns:
        The value returned by ``transport_report_1``.
    """
    # No placeholder DataFrame is created first: it was overwritten immediately.
    df_all = return_inital_df(bucket, objects)
    df_all_final = transport_report_1(df_all, cols_needed)
    return df_all_final

In [18]:
def return_inital_df(bucket, prefix):
    """Read every CSV object under one or more key prefixes into one DataFrame.

    Args:
        bucket: Source bucket passed to ``list_files_in_prefix`` and
            ``read_csv_to_df``.
        prefix: Either a single prefix string or an iterable of prefix
            strings (for example a list of dates). Each prefix is listed
            separately because the listing call takes one prefix at a time.

    Returns:
        The concatenation of all per-object DataFrames with a fresh
        0..n-1 index.

    Raises:
        ValueError: Propagated from ``pd.concat`` when no object was found
            under any of the prefixes.
    """
    # A bare string is one prefix; anything else is treated as a collection of them.
    prefixes = [prefix] if isinstance(prefix, str) else list(prefix)

    frames = []
    for single_prefix in prefixes:
        objects = list_files_in_prefix(bucket, single_prefix)
        frames.extend(read_csv_to_df(obj.key, bucket) for obj in objects)

    df_all = pd.concat(frames, ignore_index=True)
    return df_all

In [19]:
def read_csv_to_df(filename, bucket, decoder='utf-8', delimiter=','):
    """Fetch one object from ``bucket`` and parse it as CSV.

    Args:
        filename: Key of the object to fetch.
        bucket: Object exposing ``Object(key=...).get()``.
        decoder: Codec used to decode the object's bytes into text.
        delimiter: Field separator passed to ``pd.read_csv``.

    Returns:
        The parsed DataFrame.
    """
    response = bucket.Object(key=filename).get()
    raw_bytes = response.get('Body').read()
    text_buffer = StringIO(raw_bytes.decode(decoder))
    return pd.read_csv(text_buffer, delimiter=delimiter)

In [20]:
def transport_report_1(df_all, cols_needed):
    """Restrict the data to ``cols_needed``, drop incomplete rows, build report 1.

    Args:
        df_all: Input DataFrame.
        cols_needed: Column labels to keep.

    Returns:
        The value returned by ``xetra_report_1`` for the cleaned data.
    """
    # Keep only the requested columns, then discard rows with any missing value.
    selected = df_all.loc[:, cols_needed]
    selected.dropna(inplace=True)

    # Calculate the data needed and create the output DataFrame.
    return xetra_report_1(selected)

In [21]:
def xetra_report_1(df_all):
    """Aggregate rows into one record per (ISIN, Date).

    The input DataFrame is left unmodified.

    Args:
        df_all: DataFrame with at least the columns ``ISIN``, ``Date``,
            ``Time``, ``StartPrice``, ``MinPrice``, ``MaxPrice`` and
            ``TradedVolume``.

    Returns:
        A DataFrame indexed by (ISIN, Date), rounded to two decimals, with
        the columns ``opening_price_euro``, ``minimum_price_euro``,
        ``maxmimum_price_euro``, ``daily_traded_volume``,
        ``prev_closing_price`` and ``change_prev_closing_precent``.
    """
    # Sort once by Time so 'first'/'last' pick the earliest/latest row of each
    # group; aggregating a sorted copy avoids adding helper columns to the
    # caller's frame.
    ordered = df_all.sort_values(by=['Time'])
    # NOTE(review): the closing price is taken from StartPrice of the last row,
    # not from an end-price column — confirm this is intended.
    daily = ordered.groupby(['ISIN', 'Date'], as_index=True).agg(
        opening_price_euro=('StartPrice', 'first'),
        closing_price_euro=('StartPrice', 'last'),
        minimum_price_euro=('MinPrice', 'min'),
        maxmimum_price_euro=('MaxPrice', 'max'),
        daily_traded_volume=('TradedVolume', 'sum'),
    )

    # Previous closing price per ISIN: shift by one row in Date order.
    daily['prev_closing_price'] = (
        daily.sort_values(by=['Date']).groupby(['ISIN'])['closing_price_euro'].shift(1)
    )
    daily['change_prev_closing_precent'] = (
        (daily['closing_price_euro'] - daily['prev_closing_price'])
        / daily['prev_closing_price'] * 100
    )
    # NOTE(review): this drops the closing price and keeps the previous-closing
    # helper column; kept as-is to preserve the output schema — confirm whether
    # 'prev_closing_price' was meant to be dropped instead.
    daily = daily.drop(columns=['closing_price_euro'])
    return daily.round(decimals=2)

In [22]:
def load_report1(bucket_target, df_all):
    """Serialize ``df_all`` to Parquet and upload it to ``bucket_target``.

    The object key is ``parquet/xetra_daily_report_<yymmdd_HHMMSS>.parquet``,
    using the current local time.

    Args:
        bucket_target: Object exposing ``put_object(Body=..., Key=...)``.
        df_all: DataFrame to write. Named index levels are written as
            regular columns; an unnamed index is not written.
    """
    # to_parquet(index=False) discards the index, so grouping keys held in a
    # named index would be lost; move them into columns first.
    if any(level_name is not None for level_name in df_all.index.names):
        df_all = df_all.reset_index()

    timestamp = datetime.today().strftime('%y%m%d_%H%M%S')
    key = 'parquet/xetra_daily_report_' + timestamp + '.parquet'
    output_buffer = BytesIO()
    df_all.to_parquet(output_buffer, index=False)
    bucket_target.put_object(Body=output_buffer.getvalue(), Key=key)

In [None]:
def main():
    """Run the extract / transform / load steps with the settings defined below."""
    # Job settings
    source_date_format = '%Y-%m-%d'
    arg_date = '2022-12-28'
    cols_needed = ['ISIN', 'Date', 'Time', 'StartPrice', 'MaxPrice', 'MinPrice','EndPrice', 'TradedVolume', 'NumberOfTrades']
    source_bucket = 'xetra-1234'
    target_bucket = 'xetra-destination'

    # S3 handles for the source and destination buckets
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(source_bucket)
    bucket_target = s3.Bucket(target_bucket)

    # Extract
    extracted = return_objects(bucket, arg_date, source_date_format)

    # Transform
    report = return_report_1(extracted, bucket, cols_needed)

    # Load
    load_report1(bucket_target, report)


In [51]:
# Ad-hoc cell: call return_objects for a sample start date against the source bucket.
source_bucket = 'xetra-1234'
s3 = boto3.resource('s3')
bucket = s3.Bucket(source_bucket)

source_date_format = '%Y-%m-%d'
arg_date = '2022-12-17'
return_objects(bucket, arg_date, source_date_format)

['2022-12-16',
 '2022-12-17',
 '2022-12-18',
 '2022-12-19',
 '2022-12-20',
 '2022-12-21',
 '2022-12-22',
 '2022-12-23',
 '2022-12-24',
 '2022-12-25',
 '2022-12-26',
 '2022-12-27',
 '2022-12-28',
 '2022-12-29',
 '2022-12-30',
 '2022-12-31',
 '2023-01-01',
 '2023-01-02']

In [42]:
# NOTE(review): this is a bare name, not a call — evaluating it yields the
# function object itself. The list recorded below presumably comes from an
# earlier notebook state; TODO confirm.
return_date_list

['2022-12-27',
 '2022-12-28',
 '2022-12-29',
 '2022-12-30',
 '2022-12-31',
 '2023-01-01',
 '2023-01-02']