In [None]:
import sys
from pathlib import Path
import duckdb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import LogLocator, LogFormatterMathtext, NullLocator, NullFormatter

# Remaining imports (project plotting helper and colour maps), grouped ahead of
# the configuration statements below.
from telegram_quality_control.visualization import single_col_figure
from cmcrameri import cm

# --- Notebook configuration -------------------------------------------------
# Parquet file holding the weighted edge list read by the query cells.
path = "data/weighted_edges.parquet"
figure_style = "print"

# DuckDB connection (default arguments) shared by all queries in this notebook.
con = duckdb.connect()

# Apply the project's matplotlib style sheet to every figure created afterwards.
plt.style.use('resources/mpl_styles/default.mplstyle')

In [3]:
# Weighted out-degree per source over every edge in the parquet file:
# one row per (src, src_is_chat) pair, `degree` = summed edge weight cast to BIGINT.
sql_deg_all = f"""
SELECT
  src,
  src_is_chat,
  SUM(weight)::BIGINT AS degree
FROM read_parquet('{path}')
GROUP BY src, src_is_chat
"""

# Run the aggregate and materialise the result as a pandas DataFrame.
deg_all = con.execute(sql_deg_all).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [4]:
# Same per-source aggregate, but restricted to edges whose "primary" column is TRUE:
# one row per (src, src_is_chat) pair, `degree` = summed edge weight cast to BIGINT.
sql_deg_primary = f"""
SELECT
  src,
  src_is_chat,
  SUM(weight)::BIGINT AS degree
FROM read_parquet('{path}')
WHERE "primary" = TRUE
GROUP BY src, src_is_chat
"""

# Run the aggregate and materialise the result as a pandas DataFrame.
deg_primary = con.execute(sql_deg_primary).df()

In [None]:
# 100 log-spaced thresholds k, from 1 up to the largest degree in deg_all.
logbins = np.logspace(0, np.log10(deg_all["degree"].max()), num=100, base=10)


def _ccdf_counts(degrees, thresholds):
    """Count, for every threshold, how many degrees are strictly greater than it.

    Parameters
    ----------
    degrees : pandas.Series
        Degree values; missing entries are ignored (they never satisfy ``> k``).
    thresholds : numpy.ndarray
        Threshold values ``k`` at which to evaluate the count.

    Returns
    -------
    numpy.ndarray
        Integer array with one count per threshold (an unnormalised CCDF).
    """
    ordered = np.sort(degrees.dropna().to_numpy())
    # side="right" yields the number of entries <= k, so the remainder is the
    # number of entries strictly greater than k. One sort replaces a full scan
    # of the series per threshold.
    return ordered.size - np.searchsorted(ordered, thresholds, side="right")


# Unnormalised CCDF: number of chat sources (src_is_chat == 1) with degree > k.
# The chat filter is applied once per frame rather than once per threshold.
y_all = _ccdf_counts(deg_all.loc[deg_all["src_is_chat"] == 1, "degree"], logbins)
y_pri = _ccdf_counts(deg_primary.loc[deg_primary["src_is_chat"] == 1, "degree"], logbins)

In [None]:
# Figure: CCDF of the weighted out-degree for chat sources, on log-log axes.
fig = single_col_figure(height_frac=0.8)
ax = fig.add_subplot(111)

# Both series are drawn as markers only ('x' format, no connecting line), so
# they use the same plotting call; a step draw style has no effect on markers.
ax.plot(logbins, y_all, 'x', label="Discovered chats", markersize=2)
ax.plot(logbins, y_pri, 'x', label="Downloaded chats", markersize=2)
x_guess = np.array([1e5, 3e6])  # NOTE(review): not used in this cell; remove if no later cell reads it

ax.set_xscale("log")
ax.set_yscale("log")

ax.set_xlabel("Weighted out-degree $k$")
ax.set_ylabel("Chats with out-degree $> k$")

ax.legend()
ax.grid(True, which="both", linestyle='--', alpha=0.5)
ax.set_xlim(1, 10**7)
ax.set_ylim(0.8, 10**7)

# Identical tick layout on both axes: a few labelled decade ticks plus
# unlabelled minor ticks. Each axis gets its own locator/formatter instances.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_locator(LogLocator(base=10, numticks=4))
    axis.set_minor_locator(LogLocator(base=10, numticks=10))
    axis.set_minor_formatter(NullFormatter())

fig.tight_layout()
plt.show()

# Make sure the output directory exists so savefig does not fail on a fresh checkout.
Path("./figures").mkdir(parents=True, exist_ok=True)
fig.savefig("./figures/degree_ccdf.png", dpi=300)
fig.savefig("./figures/degree_ccdf.pdf", dpi=300)