In [None]:
#from telegram_toolchain.data.database import get_conn
from telegram_data_models import Message, Chat, MessageTextContent, Queue
from dotenv import load_dotenv
load_dotenv()   # loads .env from cwd (or parents)
# Also load the project credentials file; the path is relative to the
# notebook's working directory.
# NOTE(review): presumably this file supplies the DB_* variables read in the
# database setup cell — confirm.
load_dotenv("../../credentials/credentials.env")
from sqlalchemy import select, func, case, create_engine
from tqdm.auto import tqdm  # works in both notebooks & terminals
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pandas as pd
import os
import time
from pathlib import Path
import json
%pip install duckdb
import duckdb
# Shared DuckDB connection used by the parquet queries below; no database
# file is passed, so nothing is persisted by the connection itself.
con = duckdb.connect()
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib.ticker import LogLocator, LogFormatterMathtext, NullFormatter

from cmcrameri import cm
# Apply the project's matplotlib style sheet (path relative to the notebook's
# working directory) to all figures created after this point.
plt.style.use('../../resources/mpl_styles/default.mplstyle')

In [None]:
# Inspect the columns of the sorted edge list: DESCRIBE reports the column
# names and types of the SELECT instead of returning its rows.
describe_edges_sql = """
    DESCRIBE SELECT *
    FROM read_parquet('../../data/edges_sorted.parquet')
    """
edges_schema = con.execute(describe_edges_sql).fetchdf()

# Header line followed by the schema table, one print call for both.
print("edges_sorted.parquet schema:", edges_schema, sep="\n")

## Check whether "primary" and "lang" are uniquely determined by each (src, dst, src_is_chat) key, with no exceptions


In [None]:
# Sanity check before collapsing edges: every (src, dst, src_is_chat) key must
# carry exactly one `lang` and one `primary` value, otherwise picking a single
# representative value per key would silently drop information.
#
# This cell reuses the DuckDB connection `con` opened in the setup cell rather
# than re-importing duckdb and opening a second connection.
path = "../../data/edges_sorted.parquet"

conflicts = con.execute(
    f"""
WITH per_key AS (
  SELECT
    src,
    dst,
    src_is_chat,
    -- COUNT(DISTINCT x) skips NULLs, so a key mixing NULL with a real value
    -- would look consistent; count NULL as one extra distinct value.
    COUNT(DISTINCT lang)
      + MAX(CASE WHEN lang IS NULL THEN 1 ELSE 0 END)      AS n_lang,
    COUNT(DISTINCT "primary")
      + MAX(CASE WHEN "primary" IS NULL THEN 1 ELSE 0 END) AS n_primary
  FROM read_parquet('{path}')
  GROUP BY src, dst, src_is_chat
)
SELECT
  -- COUNT(*) FILTER yields a BIGINT and 0 (not NULL) when no key qualifies.
  COUNT(*) FILTER (WHERE n_lang    > 1) AS keys_with_lang_conflict,
  COUNT(*) FILTER (WHERE n_primary > 1) AS keys_with_primary_conflict
FROM per_key;
"""
).fetchdf()

# Both counts should be 0 if lang/primary are functions of the key.
print(conflicts)

In [None]:
# Destination of the collapsed edge list.
out_path = "../../data/weighted_edges.parquet"

# Collapse the raw edge rows to one row per (src, dst, src_is_chat) key:
#   * lang / primary - one representative value per key (ANY_VALUE)
#   * weight         - number of raw rows that shared the key
# The result is written straight to a ZSTD-compressed parquet file by DuckDB;
# `path` is the input parquet defined in the conflict-check cell.
weighted_edges_sql = f"""
COPY (
  SELECT
    src,
    dst,
    src_is_chat,
    ANY_VALUE(lang)        AS lang,
    ANY_VALUE("primary")   AS "primary",
    COUNT(*)::BIGINT       AS weight
  FROM read_parquet('{path}')
  GROUP BY src, dst, src_is_chat
)
TO '{out_path}'
(FORMAT PARQUET, COMPRESSION ZSTD);
"""
con.execute(weighted_edges_sql)

print("Wrote:", out_path)

In [None]:
# Database setup
#
# Builds the PostgreSQL connection URL from DB_* environment variables and
# exposes the Core Table objects behind the ORM models.
from urllib.parse import quote  # stdlib; escapes credentials for the URL

# Fail early with a clear message: a missing variable would otherwise be
# formatted into the URL as the literal text "None".
_DB_ENV_VARS = ("DB_USER", "DB_PASSWORD", "DB_HOST", "DB_PORT", "DB_NAME")
_missing_db_vars = [name for name in _DB_ENV_VARS if os.environ.get(name) is None]
if _missing_db_vars:
    raise RuntimeError(
        "Missing database environment variables: " + ", ".join(_missing_db_vars)
    )

db_user = os.environ["DB_USER"]
db_pass = os.environ["DB_PASSWORD"]
db_host = os.environ["DB_HOST"]
db_port = os.environ["DB_PORT"]
db_name = os.environ["DB_NAME"]

# Percent-encode user and password so characters such as '@', ':' or '/'
# cannot be mistaken for URL delimiters. safe="" also encodes '/', and spaces
# become %20 rather than '+'. db_url stays a plain string.
db_url = (
    f'postgresql+psycopg2://{quote(db_user, safe="")}:{quote(db_pass, safe="")}'
    f'@{db_host}:{db_port}/{db_name}'
)

# Dask can't work with ORM models
message_table = Message.__table__
chat_table = Chat.__table__
queue_table = Queue.__table__

In [None]:
# Options for the SQLAlchemy engine built from `db_url`:
#   pool_pre_ping - good for long streaming jobs
#   future        - passed through unchanged
engine_options = {
    "pool_pre_ping": True,
    "future": True,
}
engine = create_engine(db_url, **engine_options)

In [None]:
# Load the per-chat language table and make sure `chat_id` is available as a
# regular column: if it only exists as the index, move it into the columns.
chat_languages_loaded = pd.read_parquet("../../data/chat_languages.parquet")
chat_id_only_in_index = (
    chat_languages_loaded.index.name == "chat_id"
    and "chat_id" not in chat_languages_loaded.columns
)
df_lang = (
    chat_languages_loaded.reset_index()
    if chat_id_only_in_index
    else chat_languages_loaded
)

In [None]:
# Label of the gap threshold used by the out-degree query below; must be one
# of the keys of TIME_WINDOW_SECONDS.
time_window = "0"

# Window label -> threshold in whole seconds.
# All values are ints so that the number formatted into the SQL INTERVAL
# literal is a plain integer (a float would render as e.g. "1000000000.0").
# "inf" is a very large finite window (10**9 seconds), not a true infinity.
TIME_WINDOW_SECONDS = {
    "1 hour": 60 * 60,
    "1 day": 60 * 60 * 24,
    "1 week": 60 * 60 * 24 * 7,
    "0": 0,
    "inf": 10**9,
}

try:
    delta_seconds = TIME_WINDOW_SECONDS[time_window]
except KeyError:
    # Same error as before for an unsupported label.
    raise ValueError("Unknown time window") from None

# Out-degree per source chat, counting bursts rather than raw rows.
#
#   base      - edge rows that have a timestamp
#   with_prev - each row paired with the previous timestamp of the same
#               (src, sender), ordered by ts
#   outdeg    - per src, counts a row when it is the first one for its
#               (src, sender) or when its gap to the previous row exceeds
#               `delta_seconds`
# The result is left-joined to rank_degree.parquet on chat_id (true_out_deg
# and rank are NULL where no match exists) and sorted by out_degree, largest
# first.
out_degree_sql = f"""
WITH base AS (
  SELECT src, sender, ts
  FROM read_parquet('../../data/edges_sorted.parquet')
  WHERE ts IS NOT NULL
),
with_prev AS (
  SELECT
    src,
    sender,
    ts,
    lag(ts) OVER (PARTITION BY src, sender ORDER BY ts) AS prev_ts
  FROM base
),
outdeg AS (
  SELECT
    src,
    CAST(SUM(
      CASE
        WHEN prev_ts IS NULL THEN 1
        WHEN ts - prev_ts > INTERVAL '{delta_seconds} seconds' THEN 1
        ELSE 0
      END
    ) AS BIGINT) AS out_degree
  FROM with_prev
  GROUP BY src
)
SELECT
  o.src,
  o.out_degree,
  c.true_out_deg,
  c.rank
FROM outdeg o
LEFT JOIN read_parquet('../../data/rank_degree.parquet') c
  ON o.src = c.chat_id
ORDER BY o.out_degree DESC
"""

out_degree_result = con.execute(out_degree_sql)
df = out_degree_result.df()