In [1]:
import os
import requests
import zipfile
import glob
from datetime import datetime, timedelta
import pandas as pd
import re
from tqdm import tqdm
import concurrent.futures

In [2]:
BASE_URL = "https://data.binance.vision/data/spot/daily/klines/ETHUSDT/1s/"

start_date = datetime(2017, 8, 17) 
end_date = datetime(2024, 8, 28)

file_format = "zip"

DOWNLOAD_DIR="binance_data"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
EXTRACT_DIR = os.path.join(DOWNLOAD_DIR, "extracted_csv")
os.makedirs(EXTRACT_DIR, exist_ok=True)
MERGED_DIR = os.path.join(DOWNLOAD_DIR, "merged_data")
os.makedirs(MERGED_DIR, exist_ok=True)

In [3]:
dates = [start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)]
file_list = [f"ETHUSDT-1s-{date.strftime('%Y-%m-%d')}.{file_format}" for date in dates]

def download_file(file_name):
    file_url = BASE_URL + file_name
    save_path = os.path.join(DOWNLOAD_DIR, file_name)
    
    if os.path.exists(save_path):
        return file_name  # Skip if already downloaded
    
    for attempt in range(3):  # Retry up to 3 times
        try:
            response = requests.get(file_url, stream=True, timeout=10)
            if response.status_code == 200:
                with open(save_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024):
                        f.write(chunk)
                return file_name  # Successfully downloaded
            else:
                print(f"Failed: {file_name}, Status Code: {response.status_code}")
        except requests.RequestException as e:
            print(f"Error downloading {file_name}: {e}")
    return None  # Failed after retries

# Download with threading
with tqdm(total=len(file_list), desc="Downloading", unit="file") as pbar:
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(download_file, file): file for file in file_list}
        for future in concurrent.futures.as_completed(futures):
            if future.result():
                pbar.update(1)

Downloading: 100%|████████████████████████████████████████████████████████████| 2569/2569 [00:00<00:00, 10681.65file/s]


In [4]:
zip_files = glob.glob(os.path.join(DOWNLOAD_DIR, "*.zip"))

# Single progress bar for total extractions
with tqdm(total=len(zip_files), desc="Extracting Files", unit="file") as pbar:
    for zip_file in zip_files:
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(EXTRACT_DIR)
        pbar.update(1)  # Update progress bar after each extraction

print("All files extracted successfully.")

Extracting Files: 100%|██████████████████████████████████████████████████████████| 2569/2569 [09:43<00:00,  4.41file/s]

All files extracted successfully.





In [5]:
def extract_date(filename):
    match = re.search(r"(\d{4}-\d{2}-\d{2})", filename)  # Extracts YYYY-MM-DD
    return match.group(1) if match else ""

def merge_csv(file_list, output_file):
    df_list = []
    
    # Progress bar setup
    with tqdm(total=len(file_list), desc="Merging CSV Files", unit="file") as pbar:
        for file in file_list:
            df = pd.read_csv(file, compression=None, header=None)  # No headers
            df_list.append(df)
            pbar.update(1)  # Update progress bar

    if not df_list:
        print(f"No valid data for {output_file}. Skipping...")
        return
    
    merged_df = pd.concat(df_list, ignore_index=True)
    merged_df = merged_df.sort_values(by=0)  # Sort by timestamp (assuming first column is timestamp)
    merged_df.to_csv(output_file, index=False, header=False)
    
    print(f"Saved merged file: {output_file}")

# Get list of extracted CSV files
csv_files = glob.glob(os.path.join(EXTRACT_DIR, "*.csv"))

# Sort CSV files by date in filename
csv_files_sorted = sorted(csv_files, key=extract_date)

# Filter based on years
csv_2017_2020 = [f for f in csv_files_sorted if "2017" in f or "2018" in f or "2019" in f or "2020" in f]
csv_2021_2024 = [f for f in csv_files_sorted if "2021" in f or "2022" in f or "2023" in f or "2024" in f]

# Define output filenames
output_file_1 = os.path.join(MERGED_DIR, "ETHUSDT_1s_2017_2020.csv")
output_file_2 = os.path.join(MERGED_DIR, "ETHUSDT_1s_2021_2024.csv")

# Merge CSV files with progress bar
merge_csv(csv_2017_2020, output_file_1)
merge_csv(csv_2021_2024, output_file_2)

Merging CSV Files: 100%|█████████████████████████████████████████████████████████| 1233/1233 [03:12<00:00,  6.40file/s]


Saved merged file: binance_data\merged_data\ETHUSDT_1s_2017_2020.csv


Merging CSV Files: 100%|█████████████████████████████████████████████████████████| 1336/1336 [03:54<00:00,  5.71file/s]


Saved merged file: binance_data\merged_data\ETHUSDT_1s_2021_2024.csv


In [6]:
def compress_csv(file_path):
    compressed_path = file_path + ".gz"
    with open(file_path, 'rb') as f_in, open(compressed_path, 'wb') as f_out:
        f_out.write(f_in.read())
    os.remove(file_path)  # Delete original CSV
    print(f"Compressed and saved: {compressed_path}")

compress_csv(output_file_1)
compress_csv(output_file_2)

# Delete all intermediate files
for file in zip_files + csv_files_sorted:
    os.remove(file)
print("Deleted all intermediate files.")

Compressed and saved: binance_data\merged_data\ETHUSDT_1s_2017_2020.csv.gz
Compressed and saved: binance_data\merged_data\ETHUSDT_1s_2021_2024.csv.gz
Deleted all intermediate files.


In [9]:
import gzip
import shutil

def compress_and_cleanup(csv_file, directory, buffer_size=1024*1024):
    """Compress a CSV file in the given directory to .csv.gz using buffering and delete the original."""
    csv_path = os.path.join(directory, csv_file)
    gz_path = csv_path + ".gz"

    if not os.path.exists(csv_path):
        print(f"File not found: {csv_path}")
        return

    with open(csv_path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out, length=buffer_size)

    # os.remove(csv_path)  # Delete original CSV file
    print(f"Compressed: {gz_path} and removed {csv_path}")

compress_and_cleanup("ETHUSDT_1s_2017_2020.csv", MERGED_DIR)
compress_and_cleanup("ETHUSDT_1s_2021_2024.csv", MERGED_DIR)

Compressed: binance_data\merged_data\ETHUSDT_1s_2017_2020.csv.gz and removed binance_data\merged_data\ETHUSDT_1s_2017_2020.csv
Compressed: binance_data\merged_data\ETHUSDT_1s_2021_2024.csv.gz and removed binance_data\merged_data\ETHUSDT_1s_2021_2024.csv
