# This snippet reads the `crypto_data` folder and combines the checkpoint files plus the final 2h dataset into a single dataset

In [1]:
import os
import pandas as pd


In [3]:
# Combine every checkpoint CSV in `folder_path`, followed by any
# non-checkpoint CSV (the final dataset), into one DataFrame and save it.
folder_path = 'crypto_data'
# isdir (not exists): a plain file with this name cannot be listed either.
if not os.path.isdir(folder_path):
    raise FileNotFoundError(f"The folder '{folder_path}' does not exist.")
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
if not csv_files:
    raise ValueError("No CSV files found in the specified folder.")


def _checkpoint_order(filename):
    """Return the sort position of a CSV file name.

    The integer that directly follows '_checkpoint_' is used as the position
    (both 'x_checkpoint_3_part.csv' and 'x_checkpoint_3.csv' give 3).
    Names without a parsable checkpoint number return infinity so they are
    placed after all checkpoints instead of raising.
    """
    _, marker, tail = filename.partition('_checkpoint_')
    # First token after the marker, with any '_suffix' or '.csv' stripped.
    token = tail.split('_')[0].split('.')[0]
    if marker and token.isdecimal():
        return int(token)
    return float('inf')


# The file name is a tie-breaker so the order does not depend on os.listdir.
csv_files_sorted = sorted(
    csv_files,
    key=lambda name: (_checkpoint_order(name), name)
)

# Collect the frames and concatenate once; concatenating inside the loop
# re-copies the accumulated data on every iteration.
frames = []
for file in csv_files_sorted:
    file_path = os.path.join(folder_path, file)
    try:
        frames.append(pd.read_csv(file_path))
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f"Error reading file {file}: {e}")
if not frames:
    # Every read failed; do not write an empty combined CSV silently.
    raise ValueError("None of the CSV files in the specified folder could be read.")
combined_df = pd.concat(frames, ignore_index=True)
print("Combined DataFrame:")
combined_df.to_csv('HFT_100ms_unresampled_data_combined_data.csv', index=False)
print(combined_df.head(10))

Combined DataFrame:
                 timestamp  bid_price  ask_price  trade_price    volume  \
0  2025-04-16 18:41:26.309        NaN     0.0164          NaN    0.0000   
1  2025-04-16 18:41:26.409        NaN     0.0164          NaN    0.0000   
2  2025-04-16 18:41:26.509     0.0145     0.0164          NaN    0.0000   
3  2025-04-16 18:41:26.609     0.0145     0.0167       0.0160  123.8672   
4  2025-04-16 18:41:26.673     0.0145     0.0167       0.0160  123.8672   
5  2025-04-16 18:41:26.709     0.0160     0.0161       0.0160   21.7105   
6  2025-04-16 18:41:26.741     0.0160     0.0161       0.0160   21.7105   
7  2025-04-16 18:41:26.784     0.0160     0.0161       0.0161   21.7105   
8  2025-04-16 18:41:26.809     0.0160     0.0161       0.0161    0.0000   
9  2025-04-16 18:41:26.909     0.0160     0.0161       0.0160  146.5408   

   mid_price  
0        NaN  
1        NaN  
2        NaN  
3        NaN  
4        NaN  
5    0.01605  
6    0.01605  
7    0.01605  
8    0.01605  
9   

# Resampling to 1s for a rapid check of the trend over a particular duration, i.e. 2 hours


In [3]:
# Downsample the tick data to one row per second.
#
# resample() needs a datetime-like index, so 'timestamp' is parsed and moved
# into the index. The guard keeps the cell re-runnable: once 'timestamp' is
# the index it is no longer a column, and an unguarded re-run would raise
# KeyError: 'timestamp'.
if 'timestamp' in combined_df.columns:
    combined_df = (
        combined_df
        .assign(timestamp=pd.to_datetime(combined_df['timestamp']))
        .set_index('timestamp')
    )

# NOTE(review): the printed sample shows the same volume repeated on
# consecutive sub-100ms rows; if volume is a carried-forward snapshot,
# 'sum' may double count - confirm against the collector.
resampled_df = combined_df.resample('1s').agg({
    'bid_price': 'last',      # Last bid price in the second
    'ask_price': 'last',      # Last ask price in the second
    'trade_price': 'last',    # Last trade price in the second
    'volume': 'sum',          # Sum of volume within the second
    'mid_price': 'last'       # Last mid price in the second
})

# Turn the per-second bucket label back into an ordinary 'timestamp' column.
resampled_df = resampled_df.reset_index()

# Show the result
print(resampled_df.head())

# Check the reduction in rows
print(f"Original rows: {len(combined_df)}")
print(f"Resampled rows: {len(resampled_df)}")



            timestamp  bid_price  ask_price  trade_price       volume  \
0 2025-04-16 18:41:26      0.016     0.0161        0.016    459.40670   
1 2025-04-16 18:41:27      0.016     0.0161        0.016   3298.51151   
2 2025-04-16 18:41:28      0.016     0.0161        0.016   2543.05790   
3 2025-04-16 18:41:29      0.016     0.0161        0.016   1520.15988   
4 2025-04-16 18:41:30      0.016     0.0160        0.016  24966.98970   

   mid_price  
0    0.01605  
1    0.01605  
2    0.01605  
3    0.01605  
4    0.01600  
Original rows: 50223
Resampled rows: 3600


In [4]:
# Persist the 1-second resampled frame; the positional row index is not written.
one_second_csv = 'HFT_1_hr_combined_crypto_data_1s.csv'
resampled_df.to_csv(one_second_csv, index=False)

In [5]:
# Save the combined (un-resampled) DataFrame to a new CSV file.
# If 'timestamp' has been moved into the index (the resampling cell does this),
# writing with index=False would silently drop every timestamp from the file,
# so it is restored as a regular column first.
combined_out = (
    combined_df.reset_index()
    if combined_df.index.name == 'timestamp'
    else combined_df
)
combined_out.to_csv('HFT_1_hr_combined_crypto_data.csv', index=False)