In [1]:
import os
import pandas as pd
import tqdm

In [2]:
# Input folder with the raw Kraken OHLCVT CSV files and output folder for the
# parquet files written further down. Both keep their trailing slash because
# later cells build file paths by plain string concatenation.
in_folder_path = "../data/Kraken_OHLCVT/"
out_folder_path = "../data/Kraken_cleaned/"

# os.listdir returns entries in arbitrary order; sorting makes every list
# derived from this one come out in the same order on every run.
all_files = sorted(os.listdir(in_folder_path))

In [3]:
# Set view of the directory listing so each membership test below is O(1)
# instead of a linear scan of the list.
all_files_set = set(all_files)

# A pair name is the part of the file name before the first underscore; only
# files whose name contains "USD_" are considered. Sorted so the processing
# order does not depend on set iteration order.
usd_pairs = sorted({x.split("_")[0] for x in all_files if "USD_" in x})

# Pairs that have a 60-minute file, and the corresponding file names.
hr_pairs = [x for x in usd_pairs if x + "_60.csv" in all_files_set]
hr_files = [x + "_60.csv" for x in hr_pairs]

# Pairs without a 60-minute file; for these only the 1-minute file (when it
# exists) is kept. Pairs with neither file are dropped.
non_hr_pairs = [x for x in usd_pairs if x + "_60.csv" not in all_files_set]
non_hr_files = [x + "_1.csv" for x in non_hr_pairs if x + "_1.csv" in all_files_set]

In [4]:
def add_colnames(input_data, column_names=['timestamp', 'open', 'high', 'low', 'close', 'volume', 'trades']):
    """Return a DataFrame whose columns are labelled with ``column_names``.

    Parameters
    ----------
    input_data : str, os.PathLike or pandas.DataFrame
        Either a path to a CSV file, which is read with ``header=None`` (every
        row is treated as data), or an existing DataFrame, which is copied so
        the caller's object is left untouched.
    column_names : sequence of str, optional
        Labels to assign, one per column. The default list is only read,
        never mutated, so sharing it between calls is safe.

    Returns
    -------
    pandas.DataFrame
        The loaded (or copied) frame with ``column_names`` as its columns.

    Raises
    ------
    TypeError
        If ``input_data`` is neither a path nor a DataFrame.
    ValueError
        If the number of columns does not match ``len(column_names)``.

    Errors raised by ``pd.read_csv`` itself propagate unchanged; callers in
    this notebook catch ``pd.errors.EmptyDataError`` around this function.
    """
    if isinstance(input_data, (str, os.PathLike)):
        df = pd.read_csv(input_data, header=None)
    elif isinstance(input_data, pd.DataFrame):
        df = input_data.copy()
    else:
        # Without this branch an unsupported input fell through and failed
        # later with an UnboundLocalError on `df`.
        raise TypeError(
            f"input_data must be a file path or a pandas DataFrame, got {type(input_data).__name__}"
        )

    num_cols = df.shape[1]
    num_expected = len(column_names)

    if num_cols != num_expected:
        raise ValueError(f"DataFrame has {num_cols} columns, but {num_expected} column names were provided: {column_names}")
    df.columns = column_names
    return df

In [11]:
# Convert every 1-minute CSV that has no 60-minute counterpart into an hourly
# parquet file named "<pair>_60.parquet".

# to_parquet does not create missing folders, so make sure the target exists.
os.makedirs(out_folder_path, exist_ok=True)

# How each column is combined when rows are grouped into hourly bars.
hourly_agg = {
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum',
    'trades': 'sum'
}

for csv_name in tqdm.tqdm(non_hr_files):
    csv_path = in_folder_path + csv_name
    try:
        # Peek at the first line to find out whether the file carries its own
        # header row; the handle is closed again before pandas reads the file.
        with open(csv_path, 'r') as f:
            first_line = f.readline()
        if "timestamp" in first_line:
            df = pd.read_csv(csv_path)
        else:
            df = add_colnames(csv_path)
    except pd.errors.EmptyDataError:
        # Empty files are deliberately skipped rather than treated as errors.
        continue

    # The timestamp column is interpreted as seconds since the Unix epoch.
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

    # Resampling emits a row for every hour in the covered range; hours with
    # no input rows have NaN prices and are removed by dropna().
    df = df.resample('h', on='timestamp').agg(hourly_agg).dropna().reset_index()

    # Every entry of non_hr_files ends in "_1.csv"; swap exactly that suffix
    # instead of replacing "_1" wherever it might occur in the name.
    pair_name = csv_name[:-len("_1.csv")]
    df.to_parquet(out_folder_path + pair_name + "_60.parquet", index=False)

100%|██████████| 1/1 [00:00<00:00,  2.08it/s]


In [13]:
# Re-bucket every 60-minute CSV on an hourly grid and store it as parquet
# under the same base name.

# to_parquet does not create missing folders, so make sure the target exists.
os.makedirs(out_folder_path, exist_ok=True)

for csv_name in tqdm.tqdm(hr_files):
    try:
        # add_colnames reads the file with header=None. Unlike the 1-minute
        # loop, no header detection happens here, so a file that starts with
        # a header row would fail at the timestamp conversion below.
        df = add_colnames(in_folder_path + csv_name)
    except pd.errors.EmptyDataError:
        # Empty files are deliberately skipped rather than treated as errors.
        continue

    # The timestamp column is interpreted as seconds since the Unix epoch.
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

    # Resampling emits a row for every hour in the covered range; hours with
    # no input rows have NaN prices and are removed by dropna().
    df = df.resample('h', on='timestamp').agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
        'trades': 'sum'
    }).dropna().reset_index()

    df.to_parquet(out_folder_path + csv_name.replace('.csv', '.parquet'), index=False)

100%|██████████| 412/412 [00:08<00:00, 48.66it/s]
