#### Binance Project

This project is dedicated to collecting accurate price and timestamp data from Binance using both the REST API and WebSocket. The goal is to build a reliable data pipeline that extracts raw market data, transforms it into a clean and usable format, loads it into the appropriate storage layers, and ultimately analyzes it to uncover insights.

To maintain clarity and structure throughout the development process, the documentation and codebase will be organized into four main sections: data extraction, data transformation, data loading, and data analysis. Each section will detail the methods, tools, and design choices involved, creating a clear end-to-end overview of the entire workflow.

In [31]:
import websocket
import json
import pandas as pd
from binance.client import Client
import time

In [32]:
import os

# Shared state used by the WebSocket callbacks below:
#   ws_data        - rows appended by on_message
#   start_time     - set by start_ws when streaming begins
#   duration_time  - streaming duration in seconds, set by start_ws
#   ws             - the WebSocketApp instance created by start_ws
ws_data = []
start_time = None
duration_time = None
ws = None

symbol = "btcusdc"
interval = "1s"
socket_info = f"wss://stream.binance.com:9443/ws/{symbol}@kline_{interval}"

# Credentials must never be committed to source control; read them from the
# environment instead. Any key pair that was previously hard-coded here should
# be treated as leaked and revoked.
# NOTE(review): when the variables are unset, None is passed to Client; the
# kline history request used in this notebook is presumably a public endpoint
# that works without credentials - confirm against the python-binance docs.
api_key = os.environ.get("BINANCE_API_KEY")
secret_api_key = os.environ.get("BINANCE_API_SECRET")
client = Client(api_key, secret_api_key)

# Parameters for the REST klines request.
btc = "BTCUSDC"
time_frame = "1s"
lookback_period = "1 minute ago"


In [33]:

def on_message(ws,msg):
    """Handle one WebSocket message.

    Parses ``msg`` as JSON, reads its ``"k"`` payload and, when the payload's
    ``"x"`` flag is truthy, appends the ``"t"`` timestamp and the ``"c"``
    price (as a float) to the module-level ``ws_data`` list. Once the time
    elapsed since ``start_time`` reaches ``duration_time``, the socket is
    closed.

    Args:
        ws: The socket the message arrived on; closed when the time is up.
        msg: Raw JSON text of the message.
    """
    kline = json.loads(msg)["k"]

    # NOTE(review): "x" is presumably Binance's "kline closed" flag, so only
    # finished candles are stored - confirm against the stream docs.
    if kline["x"]:
        closed_sample = {
            "timestamp": kline["t"],
            "price_ws": float(kline["c"]),
        }
        ws_data.append(closed_sample)

    print("Data stream is intact and running, wait for the desired duration to end")

    elapsed = time.time() - start_time
    if elapsed >= duration_time:
        print(f"Reached {duration_time} seconds. Closing WebSocket...")
        ws.close()

def on_error(ws, error):
    """Print ``error`` prefixed with ``ERROR:``; the socket itself is not used."""
    label = "ERROR:"
    print(label, error)

def on_close(ws, code, msg):
    """Print a fixed notice that the connection closed; arguments are ignored."""
    closed_notice = "CONNECTION IS CLOSED"
    print(closed_notice)

def start_ws(seconds:int):
    """Open the kline WebSocket and run it until it is closed.

    Records the start time and requested duration in module-level globals
    (read by ``on_message`` to decide when to close), creates the
    ``WebSocketApp`` for ``socket_info`` with the module's callbacks, and
    calls ``run_forever`` on it.

    Args:
        seconds: Streaming duration, stored in ``duration_time``.
    """
    global start_time,duration_time,ws

    duration_time = seconds
    start_time = time.time()

    callbacks = {
        "on_message": on_message,
        "on_error": on_error,
        "on_close": on_close,
    }
    ws = websocket.WebSocketApp(socket_info, **callbacks)
    ws.run_forever()

# Stream for 60 seconds; start_ws returns once run_forever returns.
start_ws(60)

# Name the columns explicitly so the frame still has "timestamp" and
# "price_ws" columns when no row was collected; otherwise an empty ws_data
# produces a column-less frame and the conversion below raises KeyError.
df_ws = pd.DataFrame(ws_data, columns=["timestamp", "price_ws"])
df_ws["timestamp"] = pd.to_datetime(df_ws["timestamp"], unit="ms")

def get_btc_data(btc,time_frame,lookback_period):
    """Fetch historical klines over REST and return timestamp/price columns.

    Calls ``client.get_historical_klines`` with the given arguments, loads the
    rows into a DataFrame, and derives two columns:

    * ``timestamp``: ``close_time`` parsed as milliseconds and floored to the
      second.
    * ``price_rest``: ``close`` converted to numeric and rounded to 2 decimals.

    NOTE(review): flooring the close time presumably lines up with the
    WebSocket candle timestamp only for 1-second klines - confirm before
    using other intervals.

    Args:
        btc: Symbol passed through to the client.
        time_frame: Kline interval passed through to the client.
        lookback_period: Start specifier passed through to the client.

    Returns:
        A DataFrame with the columns ``timestamp`` and ``price_rest``.
    """
    kline_columns = [
        "timestamp", "open", "high", "low", "close", "volume", "close_time",
        "quote_asset_volume", "number_of trades", "taker_buy_base_asset_volume",
        "taker_buy_quote_asset_volume", "ignore",
    ]

    raw_klines = client.get_historical_klines(btc, time_frame, lookback_period)
    frame = pd.DataFrame(raw_klines, columns=kline_columns)

    close_stamp = pd.to_datetime(frame["close_time"], unit="ms")
    frame["timestamp"] = close_stamp.dt.floor("s")

    close_price = pd.to_numeric(frame["close"])
    frame["price_rest"] = close_price.round(2)

    return frame[["timestamp", "price_rest"]]

# Fetch the REST prices, then keep only the timestamps present in both the
# REST and the WebSocket frames (inner join on "timestamp").
rest_prices = get_btc_data(btc, time_frame, lookback_period)
df_rest = pd.DataFrame(rest_prices)

df_final = df_rest.merge(df_ws, how="inner", on="timestamp")


Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration to end
Data stream is intact and running, wait for the desired duration

In [34]:
# Highest price per source. Seeding the candidates with 0 keeps the result
# at 0 when a column is empty or holds nothing greater than 0.
max_price_rest = max([0, *df_rest["price_rest"]])
max_price_ws = max([0, *df_ws['price_ws']])

def high(max_price_rest,max_price_ws):
    """Describe which source has the higher maximum price.

    Args:
        max_price_rest: Highest price seen in the REST data.
        max_price_ws: Highest price seen in the WebSocket data.

    Returns:
        A sentence naming the larger value, or stating that both are equal.
    """
    if max_price_rest == max_price_ws:
        return f"The Highest price from both is equal and is {max_price_rest}"

    if max_price_rest > max_price_ws:
        return f"Price from REST is bigger and its {max_price_rest}"

    return f"Price from WS is bigger and its {max_price_ws}"

# Lowest price per source. Seeding the candidates with +inf keeps the result
# at inf when a column is empty.
min_price_rest = min([float("inf"), *df_rest["price_rest"]])
min_price_ws = min([float("inf"), *df_ws["price_ws"]])

def low(min_price_ws,min_price_rest):
    """Describe which source has the lower minimum price.

    Args:
        min_price_ws: Lowest price seen in the WebSocket data.
        min_price_rest: Lowest price seen in the REST data.

    Returns:
        A sentence naming the smaller value, or stating that both are equal.
    """
    # Compare the two sources with each other; comparing min_price_ws with
    # itself made the "equal" branch fire for every input.
    if min_price_ws == min_price_rest:
        return f"The Lowest price from both is equal and is {min_price_ws}"

    if min_price_ws > min_price_rest:
        return f" Price from REST is lower and is {min_price_rest}"

    return f" Price from WS is lower and is {min_price_ws}"

def volume(arr):
    """Return a sentence reporting the number of items in ``arr``."""
    summary = f"The volume for WS per minute is {len(arr)}"
    return summary
            
# Mean over every REST and WS price in the merged frame, rounded to 2 places.
# Computed outside the f-string: reusing double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
_overall_mean = float(df_final[["price_rest", "price_ws"]].stack().mean().round(2))
mean = f"The overall mean is {_overall_mean}"

In [35]:
# Report which source reached the higher maximum price.
high(max_price_rest=max_price_rest, max_price_ws=max_price_ws)

'Price from WS is bigger and its 84517.28'

In [36]:
# low() takes (min_price_ws, min_price_rest); pass by keyword so the REST and
# WS minimums cannot be swapped, as they were with the positional call.
low(min_price_ws=min_price_ws, min_price_rest=min_price_rest)

'The Lowest price from both is equal and is 84471.01'

In [37]:
# Report how many WebSocket prices were collected.
volume(arr=df_ws["price_ws"])

'The volume for WS per minute is 60'

In [38]:
# Display the overall-mean summary string assigned to `mean` earlier.
mean

'The overall mean is 84483.11'

In [39]:
# Moving averages are computed for both REST and WS prices in case the two sources differ.


In [42]:
# Rolling means (10- and 20-row windows) for each price source, rounded to
# 2 decimals. Keys are the new column names; values are (source column, window).
ma_specs = {
    "10MA_rest": ("price_rest", 10),
    "20MA_rest": ("price_rest", 20),
    "10MA_ws": ("price_ws", 10),
    "20MA_ws": ("price_ws", 20),
}
for ma_column, (price_column, window) in ma_specs.items():
    df_final[ma_column] = df_final[price_column].rolling(window).mean().round(2)


In [44]:
# Display the merged REST/WS frame, including any columns added to it so far.
df_final


Unnamed: 0,timestamp,price_rest,price_ws,10MA_rest,20MA_rest,10MA_ws,20MA_ws
0,2025-11-22 19:42:05,84515.96,84515.96,,,,
1,2025-11-22 19:42:06,84502.85,84502.85,,,,
2,2025-11-22 19:42:07,84502.85,84502.85,,,,
3,2025-11-22 19:42:08,84502.85,84502.85,,,,
4,2025-11-22 19:42:09,84486.29,84486.29,,,,
5,2025-11-22 19:42:10,84485.48,84485.48,,,,
6,2025-11-22 19:42:11,84483.66,84483.66,,,,
7,2025-11-22 19:42:12,84483.66,84483.66,,,,
8,2025-11-22 19:42:13,84483.66,84483.66,,,,
9,2025-11-22 19:42:14,84488.0,84488.0,84493.53,,84493.53,
