In [1]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Set up plotting style
sns.set_style("whitegrid")

## Load Merged Dataset

In [3]:
df = pd.read_csv('trader_sentiment_merged.csv')

In [4]:
df['date'] = pd.to_datetime(df['date'])

In [5]:
print("--- Merged Dataset Info ---")
df.info()

--- Merged Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479 entries, 0 to 478
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date                479 non-null    datetime64[ns]
 1   total_closed_pnl    479 non-null    float64       
 2   avg_closed_pnl      479 non-null    float64       
 3   total_trade_volume  479 non-null    float64       
 4   trade_count         479 non-null    int64         
 5   net_trade_size      479 non-null    float64       
 6   index_value         479 non-null    int64         
 7   classification      479 non-null    object        
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 30.1+ KB


In [6]:
print("\n--- Merged Dataset Descriptive Statistics ---")
df.describe()


--- Merged Dataset Descriptive Statistics ---


Unnamed: 0,date,total_closed_pnl,avg_closed_pnl,total_trade_volume,trade_count,net_trade_size,index_value
count,479,479.0,479.0,479.0,479.0,479.0,479.0
mean,2024-08-25 13:55:44.467640832,21408.114717,45.375147,2486636.0,440.956159,1449.386,60.05428
min,2023-05-01 00:00:00,-419020.225731,-965.921267,0.1098175,1.0,-16097040.0,10.0
25%,2024-04-22 12:00:00,5.357891,0.236375,98534.96,18.5,-15667.05,48.0
50%,2024-08-25 00:00:00,1118.387284,17.888969,327218.0,68.0,0.0,67.0
75%,2025-01-01 12:00:00,10629.856994,58.65226,1649486.0,521.0,11152.21,74.0
max,2025-05-01 00:00:00,616413.032233,1572.416058,55914620.0,6246.0,16200410.0,94.0
std,,71930.154661,145.465944,6290452.0,810.351623,1183120.0,18.687621


## Feature Engineering

In [7]:
# 1. Lagged Fear/Greed Index
# To see if yesterday's sentiment affects today's performance
df['lag_index_value'] = df['index_value'].shift(1)
df['lag_classification'] = df['classification'].shift(1)

In [8]:
# 2. Daily PnL per Trade
df['pnl_per_trade'] = df['total_closed_pnl'] / df['trade_count']

In [9]:
# 3. PnL per Volume
df['pnl_per_volume'] = df['total_closed_pnl'] / df['total_trade_volume']

In [10]:
# 4. Binary Sentiment (Fear vs. Greed)
# Fear (0-49), Greed (50-100)
df['is_greed'] = (df['index_value'] >= 50).astype(int)

In [11]:
# 5. Volatility/Change in Sentiment
df['sentiment_change'] = df['index_value'].diff()

In [12]:
# Drop the first row which will have NaN for lagged features
df.dropna(inplace=True)

In [13]:
print("\n--- Dataset after Feature Engineering (First 5 rows) ---")
df.head()


--- Dataset after Feature Engineering (First 5 rows) ---


Unnamed: 0,date,total_closed_pnl,avg_closed_pnl,total_trade_volume,trade_count,net_trade_size,index_value,classification,lag_index_value,lag_classification,pnl_per_trade,pnl_per_volume,is_greed,sentiment_change
1,2023-12-05,0.0,0.0,50005.82917,9,10.61479,75,Extreme Greed,63.0,Greed,0.0,0.0,1,12.0
2,2023-12-14,-205.434737,-18.675885,113203.34547,11,-3568.61479,72,Greed,75.0,Extreme Greed,-18.675885,-0.001815,1,-3.0
3,2023-12-15,-24.632034,-12.316017,10609.956,2,3558.0,70,Greed,72.0,Greed,-12.316017,-0.002322,1,-2.0
4,2023-12-16,0.0,0.0,15348.76138,3,39897.0,67,Greed,70.0,Greed,0.0,0.0,1,-3.0
5,2023-12-17,304.982785,21.784485,116278.018081,14,-39897.0,73,Greed,67.0,Greed,21.784485,0.002623,1,6.0


In [14]:
print(f"\nTotal rows after dropping NaN: {len(df)}")


Total rows after dropping NaN: 478


## Exploratory Data Analysis (EDA) 

In [15]:
def plot_distribution(data, column, title, filename):
    plt.figure(figsize=(8, 5))
    sns.histplot(data[column], kde=True)
    plt.title(title)
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

In [16]:
plot_distribution(df, 'index_value', 'Distribution of Fear & Greed Index Value', 'fg_index_distribution.png')

In [17]:
plot_distribution(df, 'total_closed_pnl', 'Distribution of Total Daily Closed PnL', 'total_pnl_distribution.png')

In [18]:
plot_distribution(df, 'pnl_per_trade', 'Distribution of PnL per Trade', 'pnl_per_trade_distribution.png')

In [19]:
# 2. Time Series of Key Variables
def plot_time_series(data, column, title, filename):
    plt.figure(figsize=(12, 6))
    plt.plot(data['date'], data[column])
    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel(column)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

In [20]:
plot_time_series(df, 'index_value', 'Fear & Greed Index Over Time', 'fg_index_time_series.png')

In [21]:
plot_time_series(df, 'total_closed_pnl', 'Total Daily Closed PnL Over Time', 'total_pnl_time_series.png')

In [22]:
# 3. Correlation Matrix
# Select numerical columns for correlation analysis
numerical_cols = [
    'index_value', 'total_closed_pnl', 'avg_closed_pnl', 
    'total_trade_volume', 'trade_count', 'net_trade_size', 
    'lag_index_value', 'pnl_per_trade', 'pnl_per_volume', 
    'is_greed', 'sentiment_change'
]
corr_matrix = df[numerical_cols].corr()

In [23]:
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
plt.title('Correlation Matrix of Trader Performance and Sentiment Features')
plt.tight_layout()
plt.savefig('correlation_matrix.png')
plt.close()

In [24]:
print("\nEDA and Feature Engineering complete. Visualizations saved.")


EDA and Feature Engineering complete. Visualizations saved.


In [25]:
# Save the final dataframe with new features for the next phase
df.to_csv('trader_sentiment_features.csv', index=False)