# Merge Datasets (Trade & Sentiment)

In [1]:
import pandas as pd
import os

## Load Cleaned Datasets

In [4]:
# Relative paths to the cleaned trade-history and sentiment CSVs that the
# rest of the notebook reads.
trade_path, sentiment_path = (
    "../data/preprocessed/historical_data/cleaned_historical_data.csv",
    "../data/preprocessed/sentiment_data/sentiment_clean.csv",
)

In [6]:
# Load both cleaned CSVs the same way, asking pandas to parse the shared
# `date` column as datetimes. Trades are read first, then sentiment.
trades_df, sentiment_df = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in (trade_path, sentiment_path)
)

## Align Date formats & timezones

### Ensure both are in UTC date format

In [7]:
# Collapse both `date` columns to plain calendar dates so they compare equal
# when used as the merge key.
# `utc=True` converts timezone-aware values to UTC (and treats naive values as
# already being UTC) *before* the time-of-day is dropped, so the resulting day
# is the UTC day rather than the day in whatever offset the CSV stored. It also
# lets a column with mixed UTC offsets parse into a single datetime dtype.
trades_df['date'] = pd.to_datetime(trades_df['date'], utc=True).dt.date
sentiment_df['date'] = pd.to_datetime(sentiment_df['date'], utc=True).dt.date

## Merge datasets on Date

In [8]:
# Left join: keep every trade row and attach the sentiment record that shares
# its `date`; trades with no matching date get NaN in the sentiment columns.
# `validate='many_to_one'` makes pandas raise a MergeError if `sentiment_df`
# holds more than one row for a date, which would otherwise silently duplicate
# trade rows in the result.
merged_df = pd.merge(
    trades_df,
    sentiment_df,
    on='date',
    how='left',
    validate='many_to_one',
)

## Handle Missing Sentiment

In [11]:
# Order rows chronologically so that forward-fill carries the most recent
# earlier sentiment reading onto trades whose date had no sentiment row.
# The trailing back-fill only matters for gaps at the very start of the data,
# where there is no earlier reading to carry forward.
merged_df = merged_df.sort_values("date")

# Fill the sentiment columns together. `sentiment_score` is filled as well when
# it is present: it arrives from the sentiment frame alongside `value` and
# `classification`, so skipping it would leave NaNs on exactly the rows where
# the other two were just filled.
sentiment_fill_cols = ['value', 'classification']
if 'sentiment_score' in merged_df.columns:
    sentiment_fill_cols.append('sentiment_score')

for col in sentiment_fill_cols:
    merged_df[col] = merged_df[col].ffill().bfill()

## Save merged dataset

In [12]:
# Persist the merged frame as CSV, omitting the DataFrame index.
# The target folder is created first because `to_csv` does not create missing
# directories and would otherwise fail on a fresh checkout.
merged_output_path = "../data/Merged Data/merged_trades_sentiment.csv"
os.makedirs(os.path.dirname(merged_output_path), exist_ok=True)
merged_df.to_csv(merged_output_path, index=False)

## Merged Dataset Overview

In [13]:
# (row count, column count) of the merged frame.
merged_df.shape

(211224, 31)

In [14]:
# Preview the first five rows of the merged, date-sorted frame.
merged_df.head()

Unnamed: 0,account,coin,execution_price,size_tokens,size_usd,side,timestamp_local,start_position,direction,closed_pnl,...,year,is_weekend,win,abs_size_tokens,abs_size_usd,roi,timestamp_y,value,classification,sentiment_score
83771,0x3998f134d6aaa2b6a5f723806d00fd2bbbbce891,ETH,1898.6,0.0722,137.08,BUY,2023-05-01 01:06:00+05:30,0.1791,Open Long,0.0,...,2023,1,0,0.0722,137.08,0.0,2023-04-30 05:30:00+00:00,60,Greed,75
83770,0x3998f134d6aaa2b6a5f723806d00fd2bbbbce891,ETH,1897.9,0.0824,156.39,BUY,2023-05-01 01:06:00+05:30,0.0967,Open Long,0.0,...,2023,1,0,0.0824,156.39,0.0,2023-04-30 05:30:00+00:00,60,Greed,75
83769,0x3998f134d6aaa2b6a5f723806d00fd2bbbbce891,ETH,1897.9,0.0967,183.53,BUY,2023-05-01 01:06:00+05:30,0.0,Open Long,0.0,...,2023,1,0,0.0967,183.53,0.0,2023-04-30 05:30:00+00:00,60,Greed,75
39062,0xb1231a4a2dd02f2276fa3c5e2a2f3436e6bfed23,BTC,41867.0,0.015,628.0,SELL,2023-12-05 03:11:00+05:30,0.0,Open Short,0.0,...,2023,0,0,0.015,628.0,0.0,2023-12-04 05:30:00+00:00,74,Greed,75
39061,0xb1231a4a2dd02f2276fa3c5e2a2f3436e6bfed23,ETH,2231.0,3.8932,8685.73,BUY,2023-12-05 03:11:00+05:30,7.3187,Open Long,0.0,...,2023,0,0,3.8932,8685.73,0.0,2023-12-04 05:30:00+00:00,74,Greed,75


In [18]:
# Per-column non-null counts and dtypes, to check for remaining gaps.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 211224 entries, 83771 to 60888
Data columns (total 31 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   account           211224 non-null  object 
 1   coin              211224 non-null  object 
 2   execution_price   211224 non-null  float64
 3   size_tokens       211224 non-null  float64
 4   size_usd          211224 non-null  float64
 5   side              211224 non-null  object 
 6   timestamp_local   211224 non-null  object 
 7   start_position    211224 non-null  float64
 8   direction         211224 non-null  object 
 9   closed_pnl        211224 non-null  float64
 10  transaction_hash  211224 non-null  object 
 11  order_id          211224 non-null  int64  
 12  crossed           211224 non-null  bool   
 13  fee               211224 non-null  float64
 14  trade_id          211224 non-null  float64
 15  timestamp_x       211224 non-null  float64
 16  timestamp_utc     2112

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
merged_df.describe()

Unnamed: 0,execution_price,size_tokens,size_usd,start_position,closed_pnl,order_id,fee,trade_id,timestamp_x,hour,weekday,week,year,is_weekend,win,abs_size_tokens,abs_size_usd,roi,value,sentiment_score
count,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0
mean,11414.72335,4623.365,5639.451,-29946.25,48.749001,69653880000.0,1.163967,562854900000000.0,1737744000000.0,12.2345,2.551405,18.134142,2024.747978,0.192757,0.411265,4623.365,5639.451,0.018991,51.610044,52.948529
std,29447.654868,104272.9,36575.14,673807.4,919.164828,18357530000.0,6.758854,325756500000000.0,8689920000.0,6.960365,1.921552,15.513679,0.437996,0.394465,0.492064,104272.9,36575.14,0.845727,20.981819,32.32678
min,5e-06,8.74e-07,0.0,-14334630.0,-117990.1041,173271100.0,-1.175712,0.0,1680000000000.0,0.0,0.0,1.0,2023.0,0.0,0.0,8.74e-07,0.0,-384.406426,10.0,0.0
25%,4.8547,2.94,193.79,-376.2311,0.0,59838530000.0,0.016121,281000000000000.0,1740000000000.0,6.0,1.0,8.0,2024.0,0.0,0.0,2.94,193.79,0.0,32.0,25.0
50%,18.28,32.0,597.045,84.72793,0.0,74429390000.0,0.089578,562000000000000.0,1740000000000.0,13.0,2.0,13.0,2025.0,0.0,0.0,32.0,597.045,0.0,49.0,50.0
75%,101.58,187.9025,2058.96,9337.278,5.792797,83355430000.0,0.393811,846000000000000.0,1740000000000.0,18.0,4.0,17.0,2025.0,0.0,1.0,187.9025,2058.96,0.010318,72.0,75.0
max,109004.0,15822440.0,3921431.0,30509480.0,135329.0901,90149230000.0,837.471593,1130000000000000.0,1750000000000.0,23.0,6.0,52.0,2025.0,1.0,1.0,15822440.0,3921431.0,3.40355,94.0,100.0
