In [16]:
pip install duckdb

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd
import torch
import duckdb


In [1]:
import duckdb

file_path = r"P:\IIM Ranchi\nasdaq_exteral_data.csv"

duckdb_conn = duckdb.connect()

query = f"""
WITH raw AS (
    SELECT *
    FROM read_csv(
        'P:/IIM Ranchi/nasdaq_exteral_data.csv',
        AUTO_DETECT=TRUE,
        IGNORE_ERRORS=TRUE
    )
    WHERE Article IS NOT NULL AND TRIM(Article) <> ''
),
daily_stats AS (
    SELECT 
        Stock_symbol, 
        Date, 
        COUNT(*) as articles_per_day
    FROM raw
    GROUP BY 1, 2
),
qualified_stocks AS (
    SELECT 
        Stock_symbol,
        COUNT(DISTINCT Date) as active_days,
        AVG(articles_per_day) as avg_density
    FROM daily_stats
    GROUP BY 1
    HAVING active_days > 50 
       AND avg_density BETWEEN 1 AND 15
),
sampled_firms AS (
    SELECT Stock_symbol 
    FROM qualified_stocks
    USING SAMPLE reservoir(600)
)
SELECT r.*
FROM raw r
JOIN sampled_firms s 
ON r.Stock_symbol = s.Stock_symbol;
"""

sampled_df = duckdb_conn.execute(query).df()
sampled_df.to_parquet("sampled_35k_news.parquet")

print("Final sampled rows:", len(sampled_df))


Final sampled rows: 358993


In [2]:
import pandas as pd

df = pd.read_parquet("sampled_35k_news.parquet")

print(len(df))


358993


In [3]:
import pandas as pd

df = pd.read_parquet("sampled_35k_news.parquet")

# Keep essential columns — adjust names as needed
expected_cols = ["Date", "Stock_symbol", "Article"]
df = df[[c for c in expected_cols if c in df.columns]]

# Clean missing/empty articles
df = df.dropna(subset=['Article'])
df = df[df['Article'].str.strip() != ""]

# Parse dates
df['Date'] = pd.to_datetime(df['Date'])

# Sort for consistency
df = df.sort_values(["Stock_symbol", "Date"]).reset_index(drop=True)

df.to_parquet("sampled_35k_clean.parquet")
print(df.head())
print("Cleaned size:", len(df))


        Date Stock_symbol                                            Article
0 2010-04-20          AAP  Altera Corp. ( ALTR ), Hudson City Bancorp Inc...
1 2010-05-20          AAP  Among the biggest winners in Thursday's early ...
2 2010-05-20          AAP  Auto parts retailer Advance Auto Parts, Inc. (...
3 2010-05-20          AAP  The market continued to sell off here and we c...
4 2010-10-10          AAP  Working for StreetAuthority, I do a lot of dif...
Cleaned size: 358993


In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
import pandas as pd
import torch

df = pd.read_parquet("sampled_35k_clean.parquet")

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model on GPU
model = SentenceTransformer(
    "sentence-transformers/all-mpnet-base-v2",
    device=device
)

def embed_batch(texts, batch_size=96):  
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        emb = model.encode(
            batch,
            batch_size=batch_size,
            device=device,
            show_progress_bar=False
        )
        all_embs.append(emb)
    return np.vstack(all_embs)

df["embedding"] = list(embed_batch(df["Article"].tolist()))
df.to_parquet("sampled_35k_embedded.parquet")

print("Done embedding!")


  from .autonotebook import tqdm as notebook_tqdm





100%|██████████| 3740/3740 [5:31:09<00:00,  5.31s/it]      


Done embedding!


In [1]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("Current device:", torch.cuda.current_device() if torch.cuda.is_available() else "None")
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")


CUDA available: True
Current device: 0
Device name: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [3]:
import numpy as np
import pandas as pd
import pyarrow.dataset as ds

# ============================================================
# 0. Config
# ============================================================

PARQUET_PATH = "sampled_35k_embedded.parquet"
OUT_PATH = "daily_embeddings.parquet"

BATCH_SIZE = 2000        # Arrow batch size (safe on Windows)
EMB_DTYPE = np.float32  # halve memory

# ============================================================
# 1. Novelty-weighted aggregation (OOM-safe)
# ============================================================

def novelty_weighted_mean_fast(embs: np.ndarray) -> np.ndarray:
    """
    embs: (n_docs, dim) float32
    returns: (dim,) float32, L2-normalized
    """

    # Single document
    if embs.shape[0] == 1:
        v = embs[0]
        return v / np.linalg.norm(v)

    # Normalize per-document
    embs = embs / np.linalg.norm(embs, axis=1, keepdims=True)

    # Semantic center
    center = embs.mean(axis=0)
    center /= np.linalg.norm(center)

    # Novelty weights
    weights = 1.0 - (embs @ center)
    weights[weights < 0.0] = 0.0

    wsum = weights.sum()
    if wsum == 0.0:
        return center

    out = (embs * weights[:, None]).sum(axis=0) / wsum
    return out / np.linalg.norm(out)


# ============================================================
# 2. Stream parquet safely (NO large allocations)
# ============================================================

dataset = ds.dataset(PARQUET_PATH, format="parquet")

daily_groups = {}  # (stock, date) -> list of embeddings

for batch in dataset.to_batches(batch_size=BATCH_SIZE):
    df = batch.to_pandas()

    # Convert embeddings safely
    df["embedding"] = df["embedding"].apply(
        lambda x: np.asarray(x, dtype=EMB_DTYPE)
    )

    # Accumulate by (Stock_symbol, Date)
    for (stock, date), g in df.groupby(["Stock_symbol", "Date"], sort=False):
        key = (stock, date)
        if key not in daily_groups:
            daily_groups[key] = []
        daily_groups[key].extend(g["embedding"].values)

    # Free batch explicitly (important on Windows)
    del df, batch


# ============================================================
# 3. Compute daily embeddings
# ============================================================

records = []

for (stock, date), embs in daily_groups.items():
    E_t = novelty_weighted_mean_fast(np.vstack(embs))
    records.append((stock, date, E_t))

# Cleanup
del daily_groups

# ============================================================
# 4. Final dataframe
# ============================================================

out_df = pd.DataFrame(
    records,
    columns=["Stock_symbol", "Date", "E_t"]
)

# ============================================================
# 5. Save novelty-weighted daily embeddings
# ============================================================

out_df.to_parquet(
    OUT_PATH,
    engine="pyarrow",
    compression="zstd"
)

print(f"✅ Daily embeddings saved: {OUT_PATH}")



✅ Daily embeddings saved: daily_embeddings.parquet


In [7]:
import numpy as np
import pandas as pd
from numpy.linalg import norm

# Load precomputed daily embeddings
daily_embeddings = pd.read_parquet("daily_embeddings.parquet")

# ----------------------------
# Distance functions
# ----------------------------
def cosine_distance(a, b):
    return 1 - np.dot(a, b) / (norm(a) * norm(b))

def l2_distance(a, b):
    return norm(a - b)

def angular_distance(a, b):
    cos_sim = np.dot(a, b) / (norm(a) * norm(b))
    return np.arccos(np.clip(cos_sim, -1.0, 1.0)) / np.pi  # normalized [0,1]

# ----------------------------
# EMA function
# ----------------------------
def compute_ema(arr, alpha):
    """arr: np.array(T, dim)"""
    ema = arr[0]
    out = [ema]
    for v in arr[1:]:
        ema = alpha * v + (1 - alpha) * ema
        out.append(ema)
    return np.vstack(out)

# ----------------------------
# α variants
# ----------------------------
alphas = [0.1, 0.2, 0.3]
records = []

for stock, g in daily_embeddings.groupby("Stock_symbol"):
    g = g.sort_values("Date")
    E_arr = np.vstack(g["E_t"].values).astype(np.float32)
    
    # Compute μ_{t−1} for all α
    mu_dict = {}
    for alpha in alphas:
        mu_arr = compute_ema(E_arr, alpha)
        mu_arr = np.vstack([mu_arr[0], mu_arr[:-1]])  # enforce t-1
        mu_dict[alpha] = mu_arr
    
    # Compute SSD for all distances and α
    for i, row in g.iterrows():
        e = row["E_t"]
        for alpha, mu_arr in mu_dict.items():
            mu = mu_arr[i - g.index[0]]
            record = {
                "Stock_symbol": row["Stock_symbol"],
                "Date": row["Date"],
                "E_t": e,
                "EMA_alpha": alpha,
                "mu_t_minus_1": mu,
                "SSD_cosine": cosine_distance(e, mu),
                "SSD_l2": l2_distance(e, mu),
                "SSD_angular": angular_distance(e, mu)
            }
            records.append(record)

# Final DataFrame
ssd_variants = pd.DataFrame(records, columns=[
    "Stock_symbol", "Date", "E_t", "EMA_alpha", "mu_t_minus_1",
    "SSD_cosine", "SSD_l2", "SSD_angular"
])

# Save final variants only
ssd_variants.to_parquet("ssd_final_variants.parquet", engine="pyarrow", compression="zstd")
print(f"✅ SSD variants computed: {ssd_variants.shape}")


✅ SSD variants computed: (664404, 8)


In [9]:
import pyarrow.parquet as pq

# Open the Parquet file (does NOT load all data)
table = pq.ParquetFile("ssd_final_variants.parquet")

# Get column names
print(table.schema.names)


['Stock_symbol', 'Date', 'element', 'EMA_alpha', 'element', 'SSD_cosine', 'SSD_l2', 'SSD_angular']


In [None]:
print(df.columns.tolist())
