# InvestLLM - Data Exploration (EDA)

This notebook explores the datasets collected in Phase 1:
1. Price Data (3000+ stocks, 20 years)
2. Fundamental Data
3. News Corpus (FinGPT + Hugging Face datasets)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Notebook-wide configuration
DATA_DIR = Path("../data")  # relative path; resolved against the current working directory
pd.options.display.max_columns = None  # None lifts the cap on displayed columns
plt.style.use("fivethirtyeight")

: 

## 1. Load Price Data

In [None]:
# Read the price file of one ticker (RELIANCE) as a sample
reliance_prices_path = DATA_DIR / "raw/prices/RELIANCE.parquet"
df_rel = pd.read_parquet(reliance_prices_path)
print(f"Shape: {df_rel.shape}")
df_rel.head()  # trailing expression -> rendered as the cell output

: 

In [None]:
# Line chart of the 'close' column against the 'timestamp' column for RELIANCE
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(df_rel['timestamp'], df_rel['close'], label='RELIANCE')
ax.set(
    title='Reliance Industries - 20 Year History',
    xlabel='Date',
    ylabel='Price (INR)',
)
ax.legend()
plt.show()

## 2. Load Fundamental Data

In [None]:
# Load the fundamentals CSV (file name is date-stamped 2026-01-26; update it for a different file)
fundamentals_path = DATA_DIR / "fundamentals/fundamentals_2026-01-26.csv"
df_fund = pd.read_csv(fundamentals_path)
df_fund.head()

In [None]:
# Ten rows with the largest 'market_cap', restricted to a few descriptive columns
top_cap_columns = ['symbol', 'name', 'market_cap', 'pe_trailing']
df_fund.nlargest(n=10, columns='market_cap')[top_cap_columns]

## 3. Load News Data (Sentiment)

In [None]:
# Load the FinGPT sentiment training split and report how many rows it has
news_path = DATA_DIR / "fingpt/datasets/fingpt-sentiment-train.parquet"
df_news = pd.read_parquet(news_path)
print(f"News Samples: {len(df_news)}")
df_news.head()

In [None]:
# Bar chart of value frequencies in the 'output' column; nothing is drawn when that column is absent
has_labels = 'output' in df_news.columns
if has_labels:
    label_counts = df_news['output'].value_counts()
    ax = label_counts.plot.bar(figsize=(10, 5))
    ax.set_title('Sentiment Distribution')
    plt.show()