In [1]:
!pip install statsmodels


Defaulting to user installation because normal site-packages is not writeable




In [2]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.formula.api as smf
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, roc_auc_score, classification_report
import joblib

In [3]:
# Configure plotting
plt.rcParams.update({'figure.max_open_warning': 0})

#  Paths & output folder 
DATA_TRADE = Path("C:\\Users\\pranitha\\Desktop\\ASSIGNMENTS\\trade market analysis\\historical_data.csv")
DATA_SENT = Path("C:\\Users\\pranitha\\Desktop\\ASSIGNMENTS\\trade market analysis\\fear_greed_index.csv")
OUTPUT_DIR = Path('./output')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
# helper functions

def save_fig(fname):
    
    path = OUTPUT_DIR / fname
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    print(f"Saved figure: {path}")

In [5]:
# Load data
trades = pd.read_csv(DATA_TRADE)
sent = pd.read_csv(DATA_SENT)
print('Trades:', trades.shape)
print('Sentiment:', sent.shape)

Trades: (211224, 16)
Sentiment: (2644, 4)


In [6]:
# Quick peek (first rows)
print(trades.columns.tolist())
print(sent.columns.tolist())

['Account', 'Coin', 'Execution Price', 'Size Tokens', 'Size USD', 'Side', 'Timestamp IST', 'Start Position', 'Direction', 'Closed PnL', 'Transaction Hash', 'Order ID', 'Crossed', 'Fee', 'Trade ID', 'Timestamp']
['timestamp', 'value', 'classification', 'date']


In [7]:
# Clean & Normalize 
# Standardize trade column names
trades.columns = [c.strip() for c in trades.columns]

In [8]:
# Parse trade timestamp (Timestamp IST appears to be like '02-12-2024 22:50')
trades['timestamp_dt'] = pd.to_datetime(
    trades['Timestamp IST'], format='%d-%m-%Y %H:%M', errors='coerce'
)

# Fallback: if a unix or other timestamp column exists, try 'Timestamp'
if trades['timestamp_dt'].isna().sum() > 0 and 'Timestamp' in trades.columns:
    try:
        trades['timestamp_dt'] = pd.to_datetime(
            trades['Timestamp'], unit='s', errors='coerce'
        )
    except Exception:
        pass


In [9]:
# date column for daily aggregation
trades['date'] = trades['timestamp_dt'].dt.date

In [10]:
# Normalize sentiment
sent['date'] = pd.to_datetime(sent['date']).dt.date
# Ensure classification column exists
if 'classification' not in sent.columns:
    sent['classification'] = sent['value'].apply(lambda v: 'Neutral')

In [11]:
# Map an ordered sentiment_score if not present
classification_map = {
'Extreme Fear': 0,
'Fear': 1,
'Neutral': 2,
'Greed': 3,
'Extreme Greed': 4
}

In [12]:
# Create numeric sentiment if missing
sent['sentiment_score'] = sent['classification'].map(classification_map)
# Use bucketed `value` if classification missing
sent['sentiment_score'] = sent['sentiment_score'].fillna((sent['value'] // 20).astype(int))

In [13]:
# Feature engineering (per trade)
# Ensure numeric columns are numeric
for c in ['Execution Price', 'Size Tokens', 'Size USD', 'Closed PnL', 'Fee']:
    if c in trades.columns:
       trades[c] = pd.to_numeric(trades[c], errors='coerce')

In [14]:
# Notional
if 'Execution Price' in trades.columns and 'Size Tokens' in trades.columns:
    trades['notional_usd'] = trades['Execution Price'] * trades['Size Tokens']
else:
     trades['notional_usd'] = trades.get('Size USD', np.nan)

In [15]:
# Profit flag
trades['profit_flag'] = (trades.get('Closed PnL', 0) > 0).astype(int)

In [16]:
# Direction normalization
if 'Direction' in trades.columns:
    trades['direction'] = trades['Direction'].str.upper().str.strip()
elif 'Side' in trades.columns:
    trades['direction'] = trades['Side'].str.upper().str.strip()

In [17]:
# 4) Aggregation: account x date 
agg = trades.groupby(['Account', 'date']).agg(
trade_count=('Closed PnL', 'count'),
sum_pnl=('Closed PnL', 'sum'),
avg_pnl=('Closed PnL', 'mean'),
median_pnl=('Closed PnL', 'median'),
win_rate=('profit_flag', 'mean'),
total_fee=('Fee', 'sum'),
avg_size_usd=('Size USD', 'mean'),
total_notional=('notional_usd', 'sum')
).reset_index()

In [18]:
# Save intermediate cleaned files
agg.to_csv(OUTPUT_DIR / 'daily_trader_metrics.csv', index=False)
print('Saved daily_trader_metrics.csv')

Saved daily_trader_metrics.csv


In [19]:
# 5) Merge with sentiment 
merged = agg.merge(sent[['date', 'value', 'classification', 'sentiment_score']], on='date', how='left')
print('Merged shape:', merged.shape)
merged.to_csv(OUTPUT_DIR / 'merged_trader_sentiment.csv', index=False)
print('Saved merged_trader_sentiment.csv')

Merged shape: (2341, 13)
Saved merged_trader_sentiment.csv


In [20]:
# 6) Exploratory Data Analysis (EDA)
# Basic distributions
plt.figure(figsize=(8,4))
sns.histplot(merged['sum_pnl'].dropna(), bins=80, log_scale=(False, True))
plt.title('Distribution of sum_pnl (account-day)')
save_fig('dist_sum_pnl.png')
plt.close()

Saved figure: output\dist_sum_pnl.png


In [21]:
# Boxplot: avg_pnl by sentiment classification
plt.figure(figsize=(8,5))
sns.boxplot(x='classification', y='avg_pnl', data=merged)
plt.title('Average PnL per trade by Sentiment Classification')
plt.xlabel('Sentiment')
plt.ylabel('Avg PnL (USD)')
save_fig('box_avg_pnl_by_sentiment.png')
plt.close()

Saved figure: output\box_avg_pnl_by_sentiment.png


In [22]:
# Time series: global daily avg sum_pnl vs sentiment value
daily = merged.groupby('date').agg(
mean_sum_pnl=('sum_pnl', 'mean'),
median_sum_pnl=('sum_pnl', 'median'),
mean_sentiment=('value', 'mean')
).reset_index()


plt.figure(figsize=(12,5))
plt.plot(daily['date'], daily['mean_sum_pnl'], label='Mean sum_pnl')
plt.plot(daily['date'], daily['mean_sentiment']* (daily['mean_sum_pnl'].abs().max() / 100), label='Sentiment (scaled)')
plt.legend()
plt.title('Mean daily sum_pnl vs Fear/Greed Index (scaled)')
save_fig('timeseries_mean_pnl_vs_sentiment.png')
plt.close()

Saved figure: output\timeseries_mean_pnl_vs_sentiment.png


In [23]:
# Correlations
corrs = merged[['sum_pnl', 'avg_pnl', 'win_rate', 'total_notional', 'value']].corr()
print('Correlations with sentiment (value):')
print(corrs['value'].sort_values(ascending=False))

Correlations with sentiment (value):
value             1.000000
avg_pnl           0.030970
win_rate          0.026899
sum_pnl           0.000179
total_notional   -0.074367
Name: value, dtype: float64


In [24]:
# 7) Statistical tests 
# Compare avg_pnl for Fear vs Greed (non-parametric)
fear = merged.loc[merged['classification'].str.contains('Fear', na=False), 'avg_pnl'].dropna()
greed = merged.loc[merged['classification'].str.contains('Greed', na=False), 'avg_pnl'].dropna()

if len(fear) > 0 and len(greed) > 0:
    stat, p = stats.mannwhitneyu(fear, greed, alternative='two-sided')
    print(f"Mann-Whitney U Test: stat={stat:.3f}, p={p:.3f}")
else:
    print("Not enough data for Mann-Whitney U Test")



Mann-Whitney U Test: stat=433567.000, p=0.013


In [25]:
#  8 Regression: Does sentiment predict sum_pnl controlling for trade_count & total_notional ---
# We'll use a simple OLS with account fixed effects
try:
    merged['log_total_notional'] = np.log1p(merged['total_notional'].abs())
    model = smf.ols(
        'sum_pnl ~ value + trade_count + log_total_notional + C(Account)',
        data=merged
    ).fit(cov_type='HC3', maxiter=100)

    print(model.summary().tables[1])

    # Save regression summary
    with open(OUTPUT_DIR / 'regression_summary.txt', 'w') as f:
        f.write(model.summary().as_text())
        print('Saved regression summary')

except Exception as e:
    print('Regression failed:', e)

                                                               coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------------------------
Intercept                                                 4.696e+04   3.17e+04      1.483      0.138   -1.51e+04    1.09e+05
C(Account)[T.0x23e7a7f8d14b550961925fbfdaa92f5d195ba5bd] -6.212e+04   3.11e+04     -1.999      0.046   -1.23e+05   -1224.973
C(Account)[T.0x271b280974205ca63b716753467d5a371de622ab] -7.291e+04   3.33e+04     -2.187      0.029   -1.38e+05   -7561.860
C(Account)[T.0x28736f43f1e871e6aa8b1148d38d4994275d72c4] -5.988e+04    3.1e+04     -1.934      0.053   -1.21e+05     807.990
C(Account)[T.0x2c229d22b100a7beb69122eed721cee9b24011dd]  -5.88e+04   3.08e+04     -1.907      0.056   -1.19e+05    1624.225
C(Account)[T.0x3998f134d6aaa2b6a5f723806d00fd2bbbbce891]  -5.65e+04    3.1e+04     -1.820      0.069   -1.17e+05    4343.307


In [26]:
# 9) Predictive modeling (classification: profitable account-day) 
# Prepare dataset
merged['profitable_day'] = (merged['sum_pnl'] > 0).astype(int)
model_df = merged.dropna(subset=['value'])
features = ['value', 'trade_count', 'log_total_notional', 'avg_size_usd', 'win_rate']
model_df = model_df.fillna(0)
X = model_df[features]
y = model_df['profitable_day']

In [27]:
# Train-test split stratified by date to avoid leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
# Evaluate
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:,1]
print(classification_report(y_test, y_pred))
print('ROC AUC:', roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.96      0.89      0.92       175
           1       0.93      0.98      0.95       293

    accuracy                           0.94       468
   macro avg       0.95      0.93      0.94       468
weighted avg       0.94      0.94      0.94       468

ROC AUC: 0.9680448561677231


In [29]:
# Feature importance
imp = pd.Series(clf.feature_importances_, index=features).sort_values(ascending=False)
imp.to_csv(OUTPUT_DIR / 'feature_importances.csv')
print('Saved feature importances')

Saved feature importances


In [30]:
# 10) Clustering / Trader Segmentation 
# Create per-account features across their history
acct_feats = merged.groupby('Account').agg(
mean_sum_pnl=('sum_pnl','mean'),
mean_win_rate=('win_rate','mean'),
mean_trade_count=('trade_count','mean'),
sensitivity_to_sentiment=('sum_pnl', lambda x: np.corrcoef(x, merged.loc[x.index,'value'])[0,1] if len(x)>5 else 0)
).reset_index().fillna(0)

In [31]:
# KMeans clustering
k = 4
scaler = StandardScaler()
Z = scaler.fit_transform(acct_feats[['mean_sum_pnl','mean_win_rate','mean_trade_count','sensitivity_to_sentiment']])
km = KMeans(n_clusters=k, random_state=42)
acct_feats['cluster'] = km.fit_predict(Z)
acct_feats.to_csv(OUTPUT_DIR / 'account_segments.csv', index=False)
print('Saved account_segments.csv')

Saved account_segments.csv


In [32]:
# Placeholder for slides (manual step recommended)
slides_placeholder = OUTPUT_DIR / 'SLIDES_PLACEHOLDER.txt'
with open(slides_placeholder, 'w') as f:
    f.write('Create a 6-slide PDF summarizing: objective, datasets, methods, 3 key findings, recommendations, next steps')
print('Saved slides placeholder')


Saved slides placeholder


In [33]:
# --- 12) Save workspace (models) ---
joblib.dump(clf, OUTPUT_DIR / 'rf_profitable_day_model.joblib')
print('Saved RF classifier')


Saved RF classifier


In [34]:
# Final print
print('\nAll steps executed. Check the ./output/ folder for files.\n')


All steps executed. Check the ./output/ folder for files.

