In [None]:
# from telegram_toolchain.data.database import get_conn
from telegram_data_models import Message, Chat, MessageTextContent, Queue
from dotenv import load_dotenv

load_dotenv()  # loads .env from cwd (or parents)
load_dotenv("credentials.env")
from sqlalchemy import select, func, case, create_engine
from tqdm.auto import tqdm  # works in both notebooks & terminals
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pandas as pd
import os
import time
from pathlib import Path
import json
import numpy as np

In [None]:
# Database setup
# Connection settings are read from the process environment (the .env files are
# loaded in the imports cell). Fail here, naming the missing variables, instead
# of formatting None into the URL and failing later with a connection error
# that does not point at the real cause.
_required_db_vars = ("DB_USER", "DB_PASSWORD", "DB_HOST", "DB_PORT", "DB_NAME")
_missing_db_vars = [name for name in _required_db_vars if os.environ.get(name) is None]
if _missing_db_vars:
    raise RuntimeError(
        "Missing database environment variables: " + ", ".join(_missing_db_vars)
    )

db_user = os.environ["DB_USER"]
db_pass = os.environ["DB_PASSWORD"]
db_host = os.environ["DB_HOST"]
db_port = os.environ["DB_PORT"]
db_name = os.environ["DB_NAME"]

db_url = f'postgresql+psycopg2://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}'

# Dask can't work with ORM models, so keep references to the mapped tables
message_table = Message.__table__
chat_table = Chat.__table__
queue_table = Queue.__table__

In [None]:
# Build the connection URL used by create_engine. The user name and password
# are percent-encoded because characters such as "@", ":" or "/" in a
# credential would otherwise be read as URL delimiters and corrupt the
# host/port/database parts.
from urllib.parse import quote

# str() mirrors what f-string interpolation did before, so credentials without
# special characters produce exactly the same URL as the unescaped version.
_db_user_enc = quote(str(db_user), safe="")
_db_pass_enc = quote(str(db_pass), safe="")
db_url = f'postgresql+psycopg2://{_db_user_enc}:{_db_pass_enc}@{db_host}:{db_port}/{db_name}'

# Dask can't work with ORM models
message_table = Message.__table__
chat_table = Chat.__table__
queue_table = Queue.__table__

In [None]:
# SQLAlchemy engine used by the pd.read_sql calls in this notebook.
# pool_pre_ping=True: good for long streaming jobs.
engine = create_engine(db_url, pool_pre_ping=True, future=True)

In [None]:
# Load the per-chat language table.
df_lang = pd.read_parquet("../../data/chat_languages.parquet")

# If chat_id is only available as the index, move it into a regular column so
# it can be selected by name.
chat_id_only_in_index = (
    df_lang.index.name == "chat_id" and "chat_id" not in df_lang.columns
)
if chat_id_only_in_index:
    df_lang = df_lang.reset_index()

In [None]:
# Base dataframe: one row per chat_id.
# Only the chat_id column is kept; the explicit copy makes df_rank independent
# of df_lang.
chat_id_frame = df_lang[["chat_id"]]
df_rank = chat_id_frame.drop_duplicates().copy()

In [None]:
# Fetch every queue row with the time it was started.
stmt_rank = select(
    queue_table.c.chat_id,
    queue_table.c.started,
)

df_queue = pd.read_sql(stmt_rank, engine)

# The query has no ORDER BY, so the ordering is done here: earliest `started`
# first, rows without a start time last. A stable sort keeps tied timestamps
# in the order the rows arrived instead of shuffling them arbitrarily.
df_queue = df_queue.sort_values(
    by="started", ascending=True, na_position="last", kind="mergesort"
)

# Rank is the 1-based position in start order; rows that never started get NaN.
# Because those rows are sorted last, the started rows receive ranks 1..k.
df_queue["rank"] = np.where(
    df_queue["started"].isna(), np.nan, np.arange(1, len(df_queue) + 1)
)

# Drop any "rank" column left by a previous run of this cell; otherwise a
# re-run would produce rank_x / rank_y instead of a single rank column.
# NOTE(review): if the queue can hold several rows for one chat_id, this left
# merge repeats that chat in df_rank — confirm chat_id is unique in the queue.
df_rank = df_rank.drop(columns="rank", errors="ignore").merge(
    df_queue[["chat_id", "rank"]], on="chat_id", how="left"
)

In [None]:
# Aggregate true_out_deg per chat: the sum of `forwards` over the chat's
# messages whose forward_from_chat_id is NULL (other messages contribute 0).
# The chat ids are sent in chunks so that each IN (...) list stays bounded.
chat_ids = df_rank["chat_id"].unique().tolist()
chunk_size = 50_000

agg_dfs = []

for start in tqdm(range(0, len(chat_ids), chunk_size), desc="Aggregating true_out_deg"):
    chunk = chat_ids[start : start + chunk_size]

    stmt = (
        select(
            message_table.c.chat_id.label("chat_id"),
            func.sum(
                case(
                    (message_table.c.forward_from_chat_id.is_(None), message_table.c.forwards),
                    else_=0,
                )
            ).label("true_out_deg"),
        )
        .where(message_table.c.chat_id.in_(chunk))
        .group_by(message_table.c.chat_id)
    )

    df_chunk = pd.read_sql(stmt, engine)
    agg_dfs.append(df_chunk)

if agg_dfs:
    df_deg = pd.concat(agg_dfs, ignore_index=True)
else:
    # No chat ids means no chunks, and pd.concat raises on an empty list.
    # Build an empty result with the same columns instead; chat_id keeps the
    # dtype of df_rank's key so a later merge on chat_id stays compatible.
    df_deg = pd.DataFrame(
        {
            "chat_id": pd.Series(dtype=df_rank["chat_id"].dtype),
            "true_out_deg": pd.Series(dtype="int64"),
        }
    )

In [None]:
# Attach the aggregated degree to the per-chat frame. A true_out_deg column
# left by a previous run of this cell is dropped first; otherwise the merge
# would create true_out_deg_x / true_out_deg_y and the next line would raise
# a KeyError.
df_rank = df_rank.drop(columns="true_out_deg", errors="ignore").merge(
    df_deg, on="chat_id", how="left"
)

# Chats without a row in df_deg are NaN after the left merge; count them as 0
# and store the column as integers.
df_rank["true_out_deg"] = df_rank["true_out_deg"].fillna(0).astype("int64")

In [None]:
# Write the per-chat frame to parquet without the index and report the row count.
output_path = Path("../../data") / "rank_degree.parquet"
df_rank.to_parquet(output_path, index=False)

print(f"Saved {len(df_rank)} rows to {output_path}")