In [13]:
import os
import glob
import pandas as pd
import zipfile
import warnings

# Define the expected column names
COLUMN_NAMES = [
    "Open_time", "Open", "High", "Low", "Close", 
    "Volume", "Close_time", "Quote_asset_volume", 
    "Number_of_trades", "Taker_buy_base_asset_volume", 
    "Taker_buy_quote_asset_volume", "Ignore"
]

def find_zip_files(directory, pair_keyword, time_frame_keyword):
    """
    Find all ZIP files in the given directory that contain the pair and time frame keywords in their name.
    """
    search_path = os.path.join(directory, f"*{pair_keyword}*{time_frame_keyword}*.zip")
    return glob.glob(search_path)

def load_and_concatenate_csvs(directory):
    """
    Load all CSV files in a given directory, concatenate them, sort by Open_time,
    and remove duplicates.
    """
    all_files = glob.glob(os.path.join(directory, "*.csv"))
    
    # Load and concatenate all CSVs without specifying column names initially
    df_list = [pd.read_csv(file, header=None) for file in all_files]
    concatenated_df = pd.concat(df_list, ignore_index=True)
    
    # Remove rows that might have textual data (like headers)
    concatenated_df = concatenated_df[concatenated_df[0].apply(lambda x: str(x).isnumeric())]
    
    # Assign the correct column names
    concatenated_df.columns = COLUMN_NAMES
    
    # Convert the 'Open_time' column to datetime format (this will help in the subsequent assertion function)
    concatenated_df['Open_time'] = pd.to_datetime(concatenated_df['Open_time'], unit='ms')
    
    # Sort by Open_time and remove duplicates
    sorted_df = concatenated_df.sort_values(by="Open_time")
    deduplicated_df = sorted_df.drop_duplicates(subset="Open_time", keep="last")
    
    return deduplicated_df

def assert_consistent_time_intervals(df, time_column="Open_time", interval=pd.Timedelta(hours=1)):
    """
    Check that the time intervals in the given DataFrame's time column are consistent.
    If inconsistencies are found, issue a warning.
    """
    # Calculate time differences
    time_diffs = df[time_column].diff().dropna()
    
    if not (time_diffs <= interval).all():
        warnings.warn("Found inconsistent time intervals exceeding the specified limit.")

def extract_zip_file(zip_file, extract_to):
    """Extract the contents of a ZIP file to a given directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)


directory = r"C:\Users\xyz\Downloads"  # Adjust this to the directory where the ZIP files are located.
temp_extract_dir = "/temp_extracted"
os.makedirs(temp_extract_dir, exist_ok=True)

pair_keyword = "ETHUSDT"
time_frame_keyword = "1h"

# Main execution
zip_files = find_zip_files(directory, pair_keyword, time_frame_keyword)

# Extract files and load the data
for zip_file in zip_files:
    extract_zip_file(zip_file, temp_extract_dir)
    
final_df = load_and_concatenate_csvs(temp_extract_dir)

# Run the assertion on the final DataFrame
assert_consistent_time_intervals(final_df)

final_df.head()



  concatenated_df['Open_time'] = pd.to_datetime(concatenated_df['Open_time'], unit='ms')


Unnamed: 0,Open_time,Open,High,Low,Close,Volume,Close_time,Quote_asset_volume,Number_of_trades,Taker_buy_base_asset_volume,Taker_buy_quote_asset_volume,Ignore
0,2020-01-01 00:00:00,129.16,129.19,128.68,128.87,7769.17336,1577840399999,1000929.742211,2504,4149.93345,534619.338966,0
1,2020-01-01 01:00:00,128.87,130.65,128.78,130.64,11344.65516,1577843999999,1474278.481637,4885,5930.54276,770486.056677,0
2,2020-01-01 02:00:00,130.63,130.98,130.35,130.85,7603.35623,1577847599999,994025.614057,3046,3324.35218,434675.444655,0
3,2020-01-01 03:00:00,130.85,130.89,129.94,130.2,4968.55433,1577851199999,647360.952861,2818,1810.03564,235890.330197,0
4,2020-01-01 04:00:00,130.21,130.74,130.15,130.2,3397.90747,1577854799999,443006.650685,2264,1839.74371,239848.348335,0


In [14]:
final_df

Unnamed: 0,Open_time,Open,High,Low,Close,Volume,Close_time,Quote_asset_volume,Number_of_trades,Taker_buy_base_asset_volume,Taker_buy_quote_asset_volume,Ignore
0,2020-01-01 00:00:00.000,129.16,129.19,128.68,128.87,7769.17336,1577840399999,1000929.742211,2504,4149.93345,534619.338966,0
1,2020-01-01 01:00:00.000,128.87,130.65,128.78,130.64,11344.65516,1577843999999,1474278.481637,4885,5930.54276,770486.056677,0
2,2020-01-01 02:00:00.000,130.63,130.98,130.35,130.85,7603.35623,1577847599999,994025.614057,3046,3324.35218,434675.444655,0
3,2020-01-01 03:00:00.000,130.85,130.89,129.94,130.2,4968.55433,1577851199999,647360.952861,2818,1810.03564,235890.330197,0
4,2020-01-01 04:00:00.000,130.21,130.74,130.15,130.2,3397.90747,1577854799999,443006.650685,2264,1839.74371,239848.348335,0
...,...,...,...,...,...,...,...,...,...,...,...,...
30640,2023-08-11 18:59:42.720,1843.95,1844.61,1841.64,1843.97,24912.983,1691783999999,45923077.59349,23582,12251.804,22583969.24324,0
30641,2023-08-11 20:00:52.736,1843.98,1845.10,1843.22,1845.09,18963.405,1691787599999,34970729.01708,18530,11072.450,20419108.55333,0
30642,2023-08-11 20:59:51.680,1845.10,1845.84,1843.00,1844.83,14427.807,1691791199999,26612824.01970,18769,5917.530,10915847.39706,0
30643,2023-08-11 22:01:01.696,1844.82,1846.67,1844.27,1845.72,12951.209,1691794799999,23901410.68321,14867,6731.677,12423371.67396,0


In [15]:
final_df.to_csv("eth1h.csv")

In [16]:
os.listdir(temp_extract_dir)

['ETHUSDT-1h-2020-01.csv',
 'ETHUSDT-1h-2020-02.csv',
 'ETHUSDT-1h-2020-03.csv',
 'ETHUSDT-1h-2020-04.csv',
 'ETHUSDT-1h-2020-05.csv',
 'ETHUSDT-1h-2020-06.csv',
 'ETHUSDT-1h-2020-07.csv',
 'ETHUSDT-1h-2020-08.csv',
 'ETHUSDT-1h-2020-09.csv',
 'ETHUSDT-1h-2020-10.csv',
 'ETHUSDT-1h-2020-11.csv',
 'ETHUSDT-1h-2020-12.csv',
 'ETHUSDT-1h-2021-01.csv',
 'ETHUSDT-1h-2021-02.csv',
 'ETHUSDT-1h-2021-03.csv',
 'ETHUSDT-1h-2021-04.csv',
 'ETHUSDT-1h-2021-05.csv',
 'ETHUSDT-1h-2021-06.csv',
 'ETHUSDT-1h-2021-07.csv',
 'ETHUSDT-1h-2021-08.csv',
 'ETHUSDT-1h-2021-09.csv',
 'ETHUSDT-1h-2021-10.csv',
 'ETHUSDT-1h-2021-11.csv',
 'ETHUSDT-1h-2021-12.csv',
 'ETHUSDT-1h-2022-01.csv',
 'ETHUSDT-1h-2022-02.csv',
 'ETHUSDT-1h-2022-03.csv',
 'ETHUSDT-1h-2022-04.csv',
 'ETHUSDT-1h-2022-05.csv',
 'ETHUSDT-1h-2022-06.csv',
 'ETHUSDT-1h-2022-08.csv',
 'ETHUSDT-1h-2022-09.csv',
 'ETHUSDT-1h-2022-10.csv',
 'ETHUSDT-1h-2022-11.csv',
 'ETHUSDT-1h-2022-12.csv',
 'ETHUSDT-1h-2023-01.csv',
 'ETHUSDT-1h-2023-02.csv',
 