In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [3]:

print("="*80)
print("CRYPTO TRADING SENTIMENT ANALYSIS - DATA EXPLORATION")
print("="*80)

# ============================================================================
# 1. LOAD DATASETS
# ============================================================================
print("\n[1] LOADING DATASETS...")
print("-"*80)

# Update these paths to where you've saved your CSV files
trader_data_path = 'historical_data.csv'  # Update this path
sentiment_data_path = 'fear_greed_index.csv'     # Update this path

try:
    # Load Historical Trader Data
    df_trader = pd.read_csv(trader_data_path)
    print(f"‚úì Historical Trader Data loaded successfully!")
    print(f"  Shape: {df_trader.shape[0]:,} rows √ó {df_trader.shape[1]} columns")
    
    # Load Fear & Greed Index Data
    df_sentiment = pd.read_csv(sentiment_data_path)
    print(f"‚úì Fear & Greed Index loaded successfully!")
    print(f"  Shape: {df_sentiment.shape[0]:,} rows √ó {df_sentiment.shape[1]} columns")
    
except FileNotFoundError as e:
    print(f"‚ùå Error: {e}")
    print("\nPlease update the file paths in the script to match your file locations.")
    exit()


CRYPTO TRADING SENTIMENT ANALYSIS - DATA EXPLORATION

[1] LOADING DATASETS...
--------------------------------------------------------------------------------
‚úì Historical Trader Data loaded successfully!
  Shape: 211,224 rows √ó 16 columns
‚úì Fear & Greed Index loaded successfully!
  Shape: 2,644 rows √ó 4 columns


## Insights
- Successfully loaded 211,224 trader transactions and 2,644 sentiment records
- Trader data is ~80x larger (transaction-level vs daily-level data)
- No loading errors - datasets are accessible and properly formatted


In [4]:

# ============================================================================
# 2. HISTORICAL TRADER DATA - INITIAL EXPLORATION
# ============================================================================
print("\n" + "="*80)
print("[2] HISTORICAL TRADER DATA - OVERVIEW")
print("="*80)

print("\n--- Column Names and Data Types ---")
print(df_trader.dtypes)

print("\n--- First 5 Rows ---")
print(df_trader.head())

print("\n--- Last 5 Rows ---")
print(df_trader.tail())

print("\n--- Dataset Info ---")
df_trader.info()

print("\n--- Basic Statistics ---")
print(df_trader.describe())

print("\n--- Missing Values ---")
missing_trader = df_trader.isnull().sum()
missing_pct_trader = (df_trader.isnull().sum() / len(df_trader)) * 100
missing_df_trader = pd.DataFrame({
    'Missing_Count': missing_trader,
    'Percentage': missing_pct_trader
})
print(missing_df_trader[missing_df_trader['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False))


[2] HISTORICAL TRADER DATA - OVERVIEW

--- Column Names and Data Types ---
Account              object
Coin                 object
Execution Price     float64
Size Tokens         float64
Size USD            float64
Side                 object
Timestamp IST        object
Start Position      float64
Direction            object
Closed PnL          float64
Transaction Hash     object
Order ID              int64
Crossed                bool
Fee                 float64
Trade ID            float64
Timestamp           float64
dtype: object

--- First 5 Rows ---
                                      Account  Coin  Execution Price  \
0  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  @107           7.9769   
1  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  @107           7.9800   
2  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  @107           7.9855   
3  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  @107           7.9874   
4  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  @107           7.9894   

   Size

## Insights

**Data Types & Structure:**
- 16 columns with mixed types (numeric, text, boolean, timestamps)
- Zero missing values - exceptionally clean dataset

**First/Last Rows Pattern:**
- First rows: Trader buying @107 coin with multiple fills in single transaction (all PnL = 0, position opening)
- Last rows: Trader closing FARTCOIN position at a loss (~$400 total loss across 5 fills)
- Shows typical lifecycle: position open ‚Üí position close with realized PnL

**Statistical Red Flags:**
- **Extreme skewness**: Mean execution price ($11,414) is 625x higher than median ($18) - dominated by low-cap coins with some high-value outliers
- **Position sizing**: Median trade is 32 tokens ($597) but max is 15.8M tokens ($3.9M) - huge whale activity exists
- **PnL distribution**: Median = $0 but mean = $48.75 - most trades break even, few big winners skew average
- **Net short bias**: Mean start position is negative (-$29,946) - traders collectively favor short positions

**Trading Behavior:**
- 246 different coins traded (high diversification, not BTC-only)
- 32 traders with massive concentration (top trader = 19% of all volume)
- Nearly balanced buy/sell ratio (48.6% vs 51.4%)


In [5]:
# ============================================================================
# 3. FEAR & GREED INDEX - INITIAL EXPLORATION
# ============================================================================
print("\n" + "="*80)
print("[3] FEAR & GREED INDEX - OVERVIEW")
print("="*80)

print("\n--- Column Names and Data Types ---")
print(df_sentiment.dtypes)

print("\n--- First 5 Rows ---")
print(df_sentiment.head())

print("\n--- Last 5 Rows ---")
print(df_sentiment.tail())

print("\n--- Dataset Info ---")
df_sentiment.info()

print("\n--- Basic Statistics ---")
print(df_sentiment.describe())

print("\n--- Missing Values ---")
missing_sentiment = df_sentiment.isnull().sum()
missing_pct_sentiment = (df_sentiment.isnull().sum() / len(df_sentiment)) * 100
missing_df_sentiment = pd.DataFrame({
    'Missing_Count': missing_sentiment,
    'Percentage': missing_pct_sentiment
})
print(missing_df_sentiment[missing_df_sentiment['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False))


[3] FEAR & GREED INDEX - OVERVIEW

--- Column Names and Data Types ---
timestamp          int64
value              int64
classification    object
date              object
dtype: object

--- First 5 Rows ---
    timestamp  value classification        date
0  1517463000     30           Fear  2018-02-01
1  1517549400     15   Extreme Fear  2018-02-02
2  1517635800     40           Fear  2018-02-03
3  1517722200     24   Extreme Fear  2018-02-04
4  1517808600     11   Extreme Fear  2018-02-05

--- Last 5 Rows ---
       timestamp  value classification        date
2639  1745818200     54        Neutral  2025-04-28
2640  1745904600     60          Greed  2025-04-29
2641  1745991000     56          Greed  2025-04-30
2642  1746077400     53        Neutral  2025-05-01
2643  1746163800     67          Greed  2025-05-02

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2644 entries, 0 to 2643
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  

## Insights

**Time Coverage:**
- 7+ years of data (Feb 2018 - May 2025) vs 2 years of trader data
- Perfect overlap exists for meaningful correlation analysis

**Sentiment Distribution:**
- Mean = 47, Median = 46 (near-perfect normal distribution)
- Full scale utilized (5 to 95) - sentiment varies across entire spectrum
- Recent trend: Greed zone (60s) in April/May 2025

**Historical Context:**
- Started during 2018 crypto winter (extreme fear period)
- Latest readings show greed sentiment (market recovery phase)
- No missing daily values - complete time series


In [6]:
# ============================================================================
# 4. KEY COLUMN EXPLORATION - TRADER DATA
# ============================================================================
print("\n" + "="*80)
print("[4] KEY COLUMNS EXPLORATION - TRADER DATA")
print("="*80)

# Check unique values for categorical columns
categorical_cols = df_trader.select_dtypes(include=['object']).columns

for col in categorical_cols:
    unique_count = df_trader[col].nunique()
    print(f"\n--- {col} ---")
    print(f"Unique values: {unique_count}")
    if unique_count <= 20:  # Only show if reasonable number
        print(f"Values: {df_trader[col].unique()}")
    if unique_count <= 50:
        print(f"\nValue counts:")
        print(df_trader[col].value_counts())

# Numeric columns summary
print("\n--- Numeric Columns Summary ---")
numeric_cols = df_trader.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    print(f"\n{col}:")
    print(f"  Min: {df_trader[col].min()}")
    print(f"  Max: {df_trader[col].max()}")
    print(f"  Mean: {df_trader[col].mean():.2f}")
    print(f"  Median: {df_trader[col].median():.2f}")
    print(f"  Std: {df_trader[col].std():.2f}")



[4] KEY COLUMNS EXPLORATION - TRADER DATA

--- Account ---
Unique values: 32

Value counts:
Account
0xbee1707d6b44d4d52bfe19e41f8a828645437aab    40184
0xbaaaf6571ab7d571043ff1e313a9609a10637864    21192
0xa0feb3725a9335f49874d7cd8eaad6be45b27416    15605
0x8477e447846c758f5a675856001ea72298fd9cb5    14998
0xb1231a4a2dd02f2276fa3c5e2a2f3436e6bfed23    14733
0x28736f43f1e871e6aa8b1148d38d4994275d72c4    13311
0x513b8629fe877bb581bf244e326a047b249c4ff1    12236
0x75f7eeb85dc639d5e99c78f95393aa9a5f1170d4     9893
0x47add9a56df66b524d5e2c1993a43cde53b6ed85     8519
0x4f93fead39b70a1824f981a54d4e55b278e9f760     7584
0x23e7a7f8d14b550961925fbfdaa92f5d195ba5bd     7280
0xb899e522b5715391ae1d4f137653e7906c5e2115     4838
0x8170715b3b381dffb7062c0298972d4727a0a63b     4601
0x4acb90e786d897ecffb614dc822eb231b4ffb9f4     4356
0x083384f897ee0f19899168e3b1bec365f52a9012     3818
0x271b280974205ca63b716753467d5a371de622ab     3809
0x39cef799f8b69da1995852eea189df24eb5cae3c     3589
0x2c229d22b100a

## Insights

**Account Concentration:**
- 32 unique traders, but top 1 trader = 40,184 trades (19% of total volume)
- Power law distribution - few whales dominate, many small traders

**Trading Direction Breakdown (Most Critical):**
- **Long bias**: 57% long positions vs 43% short positions
- **Disciplined trading**: Opens nearly match closes (49K long opens vs 48K closes, 39K short opens vs 36K closes)
- **Leverage dominant**: 83% of trades use leveraged positions, only 17% spot trading
- **Risk events**: 8 auto-deleveraging events + 1 liquidation (risky behavior present)

**Transaction Patterns:**
- 101K unique transaction hashes for 211K trades = 2.08 fills per transaction (partial order fills common)
- 27,977 unique timestamps = ~7.5 trades per minute (high-frequency activity)

**Risk Metrics:**
- Biggest single loss: -$117,990
- Biggest single win: +$135,329
- Standard deviation of PnL ($919) >> mean ($48.75) - highly volatile outcomes

In [7]:

# ============================================================================
# 5. KEY COLUMN EXPLORATION - SENTIMENT DATA
# ============================================================================
print("\n" + "="*80)
print("[5] KEY COLUMNS EXPLORATION - SENTIMENT DATA")
print("="*80)

# Check sentiment classification
if 'Classification' in df_sentiment.columns:
    print("\n--- Classification Distribution ---")
    print(df_sentiment['Classification'].value_counts())
    print(f"\nPercentage distribution:")
    print(df_sentiment['Classification'].value_counts(normalize=True) * 100)

# Date range
date_columns = [col for col in df_sentiment.columns if 'date' in col.lower() or 'time' in col.lower()]
if date_columns:
    print(f"\n--- Date Range ---")
    for col in date_columns:
        print(f"{col}:")
        print(f"  First date: {df_sentiment[col].min()}")
        print(f"  Last date: {df_sentiment[col].max()}")


[5] KEY COLUMNS EXPLORATION - SENTIMENT DATA

--- Date Range ---
timestamp:
  First date: 1517463000
  Last date: 1746163800
date:
  First date: 2018-02-01
  Last date: 2025-05-02


## Insights


**Date Coverage:**
- Sentiment timestamp range: 2018 to 2025
- Trader timestamp range: 2023 to 2025
- **Analysis window**: March 2023 - April 2025 (2-year overlap)

**Format Compatibility:**
- Both datasets have timestamps that need datetime conversion
- Sentiment has both Unix timestamp and readable date (easy to merge)

In [8]:
# ============================================================================
# 6. DATA QUALITY CHECKS
# ============================================================================
print("\n" + "="*80)
print("[6] DATA QUALITY CHECKS")
print("="*80)

# Check for duplicates
print(f"\nDuplicate rows in Trader Data: {df_trader.duplicated().sum()}")
print(f"Duplicate rows in Sentiment Data: {df_sentiment.duplicated().sum()}")

# Check data types that might need conversion
print("\n--- Date/Time Columns Detection ---")
print("Trader Data columns that might be dates:")
for col in df_trader.columns:
    if 'date' in col.lower() or 'time' in col.lower():
        print(f"  - {col}: {df_trader[col].dtype}")

print("\nSentiment Data columns that might be dates:")
for col in df_sentiment.columns:
    if 'date' in col.lower() or 'time' in col.lower():
        print(f"  - {col}: {df_sentiment[col].dtype}")


[6] DATA QUALITY CHECKS

Duplicate rows in Trader Data: 0
Duplicate rows in Sentiment Data: 0

--- Date/Time Columns Detection ---
Trader Data columns that might be dates:
  - Timestamp IST: object
  - Timestamp: float64

Sentiment Data columns that might be dates:
  - timestamp: int64
  - date: object


## Insights

**Quality Assessment:**
- Zero duplicate rows in both datasets (high data integrity)
- All date/time columns identified for conversion
- Trader data has redundant timestamps (IST string + Unix) - use Unix for consistency

**Merge Strategy Identified:**
- Convert Unix timestamps to dates
- Aggregate trader metrics by day
- Join on date column

In [9]:
# ============================================================================
# 7. INITIAL INSIGHTS
# ============================================================================
print("\n" + "="*80)
print("[7] INITIAL OBSERVATIONS")
print("="*80)

print("\n‚úì Data loaded successfully!")
print(f"\nüìä Dataset Sizes:")
print(f"   - Trader transactions: {len(df_trader):,} records")
print(f"   - Sentiment records: {len(df_sentiment):,} records")

print("\nüîë Key Columns Identified:")
print("   Trader Data:", list(df_trader.columns))
print("   Sentiment Data:", list(df_sentiment.columns))

print("\n" + "="*80)

# Save column names for reference
print("\nüíæ Saving column information...")
with open('column_info.txt', 'w') as f:
    f.write("TRADER DATA COLUMNS:\n")
    f.write("="*50 + "\n")
    for col in df_trader.columns:
        f.write(f"{col}\n")
    f.write("\n\nSENTIMENT DATA COLUMNS:\n")
    f.write("="*50 + "\n")
    for col in df_sentiment.columns:
        f.write(f"{col}\n")
print("‚úì Column information saved to 'column_info.txt'")

print("\n‚úÖ Initial exploration complete!")


[7] INITIAL OBSERVATIONS

‚úì Data loaded successfully!

üìä Dataset Sizes:
   - Trader transactions: 211,224 records
   - Sentiment records: 2,644 records

üîë Key Columns Identified:
   Trader Data: ['Account', 'Coin', 'Execution Price', 'Size Tokens', 'Size USD', 'Side', 'Timestamp IST', 'Start Position', 'Direction', 'Closed PnL', 'Transaction Hash', 'Order ID', 'Crossed', 'Fee', 'Trade ID', 'Timestamp']
   Sentiment Data: ['timestamp', 'value', 'classification', 'date']


üíæ Saving column information...
‚úì Column information saved to 'column_info.txt'

‚úÖ Initial exploration complete!


## Insights

**Scale Comparison:**
- ~80 trader transactions per sentiment reading
- Need daily aggregation: sum PnL, count trades, calculate win rate, etc.

**Analysis Readiness:**
- Datasets are clean and ready for merging
- Next steps clearly defined: date conversion ‚Üí merge ‚Üí feature engineering ‚Üí analysis
- No data cleaning required (rare in real-world scenarios)
