In [1]:
# import libraries
import pandas as pd
import numpy as np
import time
from datetime import datetime, timezone
import os

from binance.spot import Spot # pip install binance-connector-python

# Create a class to scrape the data

In [2]:
class getData:
    """
    Download Binance spot klines (candlesticks) for one pair and keep a CSV copy up to date.

    Typical use: build an instance, optionally change ``file_path``, then call
    ``getKlines()``. If the CSV already exists, only the candles after its last
    row are requested and merged into it.

    Attributes set by ``__init__``:
        pair, interval, pitch : request parameters (symbol, candle size, rows per call)
        pitch_ms              : time span, in ms, covered by one call (pitch * interval)
        startDate, endDate    : requested range as UTC timestamps in ms
        file_path             : folder where the CSV is read from / written to
        client                : binance-connector ``Spot`` client
        columns_name          : names given to the 12 fields of a raw kline
    """
    def __init__(self, pair='BTCUSDT', interval='1m', pitch=500, start_date='', end_date=''):
        # setup basic variables
        self.pair = pair  # pair to scrape. Example: BTCUSDT
        self.interval = interval  # candle size: 1s, 1m, 3m, 5m, 15m, 30m, 1h, 2h, 4h, 6h, 8h, 12h, 1d, 3d, 1w
        self.pitch = pitch  # number of points to collect at each call
        # NOTE(review): the Binance REST documentation caps `limit` at 1000 rows per call;
        # a larger pitch would leave holes between pages -- confirm before raising it.
        self.pitch_ms = self.pitch_to_ms()  # number of ms covered by one call (interval * pitch)

        # setup the start and end dates ('' means "use the default 2-hour demo window")
        self.startDate = self.setDate(year=2023, month=1, day=1, hour=0, min=0) if start_date == '' else start_date
        self.endDate = self.setDate(year=2023, month=1, day=1, hour=2, min=0) if end_date == '' else end_date

        # setup the path to record the data
        self.file_path = "./"

        # setup the api call
        self.base_url = "https://api4.binance.com"
        # base_url has to be passed by keyword: the first positional argument of
        # Spot() is the API key, so Spot(self.base_url) silently ignored the URL.
        self.client = Spot(base_url=self.base_url)

        # columns names (the last field of a raw kline is unused and dropped later)
        self.columns_name = ['openT',
           'open', 'high', 'low', 'close', 'baseVol', 'closeT', 'quoteVol', 'nbTrade',
           'takerBaseVol', 'takerQuoteVol', '0']

    def setDate(self, year=2023, month=1, day=1, hour=0, min=0):
        """
        Return the given UTC date/time as a Unix timestamp in milliseconds.

        `min` is the minute (the name is kept for backward compatibility even
        though it shadows the builtin inside this method).
        """
        return int(datetime(year, month, day, hour, min, tzinfo=timezone.utc).timestamp()*1000)

    def pitch_to_ms(self):
        """
        Return the number of ms covered by `pitch` candles of size `interval`.

        Raises:
            ValueError: if `self.interval` is not one of the supported intervals.
        """
        # dict with the intervals given in number of seconds
        interval_to_sec = {
            '1s': 1,
            '1m': 60,
            '3m': 60*3,
            '5m': 60*5,
            '15m': 60*15,
            '30m': 60*30,
            '1h': 60*60,
            '2h': 60*60*2,
            '4h': 60*60*4,
            '6h': 60*60*6,
            '8h': 60*60*8,
            '12h': 60*60*12,
            '1d': 86400,
            '3d': 86400*3,
            '1w': 86400*7
        }
        if self.interval not in interval_to_sec:
            # Fail here with a clear message instead of returning None and
            # crashing later in the paging loop with an obscure TypeError.
            raise ValueError(
                "Unsupported interval %r. Supported intervals: %s"
                % (self.interval, ', '.join(interval_to_sec))
            )
        return self.pitch*interval_to_sec[self.interval]*1000

    def getKlines(self, file_name=''):
        """
        Main entry point: download the klines of the pair and store them as CSV.

        If `file_path/file_name` already exists, its last `openT` becomes the new
        start date (this updates `self.startDate`) and the fresh candles are merged
        into the stored ones. Otherwise the full [startDate, endDate] range is fetched.

        Args:
            file_name: CSV file name; '' means '<pair>-<interval>-raw.csv'.

        Returns:
            The complete, cleaned DataFrame that was written to disk.
        """

        # create the file name
        file_name = self.pair+'-'+self.interval+'-raw.csv' if file_name == '' else file_name
        full_path = os.path.join(self.file_path, file_name)

        df_temp = pd.read_csv(full_path) if os.path.isfile(full_path) else None

        if df_temp is not None and not df_temp.empty:
            # take the last entry as a start date for the update; int() turns the
            # numpy scalar into a plain int before it is sent to the API
            self.startDate = int(df_temp['openT'].iloc[-1])
            print('Data will be updated stating from the following timestamp (ms) : ', self.startDate, ' (index:', df_temp.index[-1],')')
        else:
            # no file, or a file holding only a header: nothing to resume from
            df_temp = None
            print('There is no file to update')

        # get the klines
        df = self.updateKlines()

        # Stored rows first, fresh rows last: cleanKlines keeps the last row of
        # each openT, so re-downloaded candles replace their stored version.
        # Empty frames are left out of the concat to keep the numeric dtypes.
        frames = [frame for frame in (df_temp, df) if frame is not None and not frame.empty]
        if frames:
            df = pd.concat(frames, axis=0, ignore_index=True)

        # clean the data
        df = self.cleanKlines(df)

        # make sure the target folder exists, then export to csv
        out_dir = os.path.dirname(full_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        df.to_csv(full_path, index=False)
        print('Data updated')
        return df

    def updateKlines(self):
        """
        Collect the klines between `startDate` and `endDate` (both in ms, UTC).

        The range is requested page by page, each page starting `pitch_ms` after
        the previous one. Candles that are not closed yet are discarded.

        Returns:
            A cleaned, numeric DataFrame with the first 11 names of `columns_name`.

        Raises:
            ValueError: if `pitch_ms` is missing or not positive (the paging
                loop would otherwise never end).
        """
        if not self.pitch_ms or self.pitch_ms <= 0:
            raise ValueError("pitch_ms must be a positive number of ms (check pitch and interval).")

        klines = []
        start = int(self.startDate)
        end = int(self.endDate)
        while start < end:
            # Get klines of the pair for the defined interval; endTime keeps the
            # API from returning candles that open after the requested range
            klines.extend(self.client.klines(self.pair, self.interval,
                                             startTime=start, endTime=end, limit=self.pitch))
            start += self.pitch_ms

        kept_columns = self.columns_name[:-1]
        if not klines:
            # nothing to download: return an empty frame with the expected columns
            return pd.DataFrame(columns=kept_columns)

        # transformation to dataframe, without the last (unused) column
        df = pd.DataFrame(klines, columns=self.columns_name)
        df = df.drop(columns=self.columns_name[-1])

        # update the dtypes to numeric
        df = df.apply(pd.to_numeric)

        # removal of the candles which are not frozen yet (price still moving):
        # a candle is closed once its close time is in the past.
        # NOTE(review): this relies on the local clock being reasonably in sync.
        now_ms = int(time.time()*1000)
        df = df[df['closeT'] < now_ms]

        # clean the data and return the dataframe
        return self.cleanKlines(df)

    def cleanKlines(self, data):
        """
        Return `data` sorted by `openT` with exactly one row per candle.

        Duplicates are detected on `openT` only (a candle is identified by its
        open time), so rows that differ by a float rounding after a CSV round
        trip are still merged. The sort is stable and the last occurrence wins.
        """
        return (
            data.sort_values(by='openT', kind='mergesort')
                .drop_duplicates(subset='openT', keep='last')
                .reset_index(drop=True)
        )


# Get the data

In [3]:
# Example: daily BTCUSDT candles from 2017-01-01 00:00 UTC up to the current time.
# Both dates are Unix timestamps expressed in milliseconds.
params = dict(
    pair='BTCUSDT',
    interval='1d',
    pitch=1000,
    start_date=int(datetime(2017, 1, 1, 0, 0, tzinfo=timezone.utc).timestamp() * 1000),
    end_date=int(time.time() * 1000),
)

btcusdt = getData(**params)
# store the CSV in a dedicated folder rather than in the working directory
btcusdt.file_path = "./binance_raw_data/"
btcusdt.getKlines()

Data will be updated stating from the following timestamp (ms) :  1705190400000  (index: 2341 )
Data updated


Unnamed: 0,openT,open,high,low,close,baseVol,closeT,quoteVol,nbTrade,takerBaseVol,takerQuoteVol
0,1502928000000,4261.48,4485.39,4200.74,4285.08,795.150377,1503014399999,3.454770e+06,3427,616.248541,2.678216e+06
1,1503014400000,4285.08,4371.52,3938.77,4108.37,1199.888264,1503100799999,5.086958e+06,5233,972.868710,4.129123e+06
2,1503100800000,4108.37,4184.69,3850.00,4139.98,381.309763,1503187199999,1.549484e+06,2153,274.336042,1.118002e+06
3,1503187200000,4120.98,4211.08,4032.62,4086.29,467.083022,1503273599999,1.930364e+06,2321,376.795947,1.557401e+06
4,1503273600000,4069.13,4119.62,3911.79,4016.00,691.743060,1503359999999,2.797232e+06,3972,557.356107,2.255663e+06
...,...,...,...,...,...,...,...,...,...,...,...
2356,1706486400000,42031.05,43333.00,41804.88,43302.70,31542.742070,1706572799999,1.343462e+09,1197856,15996.886270,6.815047e+08
2357,1706572800000,43302.71,43882.36,42683.99,42941.10,37619.245460,1706659199999,1.633251e+09,1316610,19317.299930,8.388764e+08
2358,1706659200000,42941.10,43745.11,42276.84,42580.00,39871.136880,1706745599999,1.711666e+09,1499559,20289.948250,8.710097e+08
2359,1706745600000,42580.00,43285.13,41884.28,43082.94,35231.046640,1706831999999,1.497121e+09,1392269,18134.843040,7.706355e+08
