In [None]:
import duckdb

# Connection opened without a database path (DuckDB's default in-memory database).
con = duckdb.connect()

# Show the column names and types of the edge table.
# The path is the same one the analysis cell below assigns to EDGES
# ('../../data/...'), so the schema printed here belongs to the file that is
# actually queried rather than to a differently-located copy.
edges_schema = con.execute(
    """
    DESCRIBE SELECT *
    FROM read_parquet('../../data/edges_sorted.parquet')
    """
).fetchdf()

print("edges_sorted.parquet schema:")
print(edges_schema)

edges_sorted.parquet schema:
      column_name column_type null   key default extra
0          msg_id     UBIGINT  YES  None    None  None
1             src    UINTEGER  YES  None    None  None
2             dst    UINTEGER  YES  None    None  None
3          sender    UINTEGER  YES  None    None  None
4              ts   TIMESTAMP  YES  None    None  None
5     src_is_chat    UTINYINT  YES  None    None  None
6  sender_is_chat    UTINYINT  YES  None    None  None
7         primary    UTINYINT  YES  None    None  None
8            lang     VARCHAR  YES  None    None  None


In [None]:
# Column names and types of the chat-language lookup parquet, obtained by
# DESCRIBE-ing a SELECT * over the file.
lookup_schema_query = """
    DESCRIBE SELECT *
    FROM read_parquet('../../data/language_downloaded_chats.parquet')
    """
lookup_schema = con.execute(lookup_schema_query).fetchdf()

# One print call; sep="\n" puts the heading and the frame on separate lines.
print("language_downloaded_chats schema:", lookup_schema, sep="\n")

language_downloaded_chats schema:
         column_name column_type null   key default extra
0            chat_id      BIGINT  YES  None    None  None
1               lang     VARCHAR  YES  None    None  None
2              score      DOUBLE  YES  None    None  None
3             status     VARCHAR  YES  None    None  None
4       columns_used     VARCHAR  YES  None    None  None
5  __index_level_0__      BIGINT  YES  None    None  None


In [None]:
import duckdb

# Input files, both under ../../data relative to the working directory.
EDGES = "../../data/edges_sorted.parquet"
LOOKUP = "../../data/language_downloaded_chats.parquet"  # per-chat lookup: chat_id, lang, ...

con = duckdb.connect()

# Guard against join fan-out. Query 2 LEFT JOINs the lookup twice on chat_id;
# if any chat_id occurred more than once, matching edges would be duplicated,
# inflating n_edges_lang_NA and skewing every AVG below. NULL chat_ids are
# ignored here because they can never satisfy the join condition.
n_dup_chat_ids = con.execute(
    """
    SELECT COUNT(chat_id) - COUNT(DISTINCT chat_id)
    FROM read_parquet(?)
    """,
    [LOOKUP],
).fetchone()[0]
assert n_dup_chat_ids == 0, (
    f"{LOOKUP} has {n_dup_chat_ids:,} duplicated chat_id rows; "
    "joining edges to it would double-count edges"
)

# 1) Among rows with primary=1, fraction where lang = 'NA'.
#    Result: one row with columns frac_lang_NA and n_primary_rows.
#    Rows whose lang is NULL fall into the ELSE branch and count as "not NA".
na_frac = con.execute(
    """
    WITH base AS (
        SELECT lang
        FROM read_parquet(?)
        WHERE "primary" = 1
    )
    SELECT
        AVG(CASE WHEN lang = 'NA' THEN 1.0 ELSE 0.0 END) AS frac_lang_NA,
        COUNT(*) AS n_primary_rows
    FROM base
    """,
    [EDGES],
).fetchdf()

# 2) Within primary=1 AND lang='NA', fraction where dst language is 'en' and src language is 'en'.
#    dst/src IDs are joined to the lookup table on chat_id (LEFT JOIN, so edges
#    without a lookup row are kept with a NULL language).
#    Result: one row with the edge count, the two 'en' fractions over all NA
#    edges, the same fractions restricted to edges with a non-NULL lookup
#    language, and the share of edges that have a non-NULL lookup language.
# NOTE(review): "known" below only means the lookup returned a non-NULL lang;
# the lookup's status/score columns are not consulted — confirm that is intended.
fractions = con.execute(
    """
    WITH edges_na AS (
        SELECT src, dst
        FROM read_parquet(?)
        WHERE "primary" = 1 AND lang = 'NA'
    ),
    chats AS (
        SELECT chat_id, lang AS chat_lang
        FROM read_parquet(?)
    ),
    joined AS (
        SELECT
            e.src,
            e.dst,
            s.chat_lang AS src_lang,
            d.chat_lang AS dst_lang
        FROM edges_na e
        LEFT JOIN chats s ON s.chat_id = e.src
        LEFT JOIN chats d ON d.chat_id = e.dst
    )
    SELECT
        -- denominators:
        COUNT(*) AS n_edges_lang_NA,

        -- fractions over ALL NA edges (missing lookup => not 'en'):
        AVG(CASE WHEN dst_lang = 'en' THEN 1.0 ELSE 0.0 END) AS frac_dst_en_over_all,
        AVG(CASE WHEN src_lang = 'en' THEN 1.0 ELSE 0.0 END) AS frac_src_en_over_all,

        -- optional: fractions only among edges where dst/src language is known in lookup
        AVG(CASE WHEN dst_lang IS NOT NULL THEN (dst_lang = 'en')::DOUBLE ELSE NULL END) AS frac_dst_en_over_known,
        AVG(CASE WHEN src_lang IS NOT NULL THEN (src_lang = 'en')::DOUBLE ELSE NULL END) AS frac_src_en_over_known,

        -- optional: how much coverage you have
        AVG(CASE WHEN dst_lang IS NOT NULL THEN 1.0 ELSE 0.0 END) AS dst_lang_coverage,
        AVG(CASE WHEN src_lang IS NOT NULL THEN 1.0 ELSE 0.0 END) AS src_lang_coverage
    FROM joined
    """,
    [EDGES, LOOKUP],
).fetchdf()

In [None]:
# ---- pull the scalars out of the two result frames ----
# Each value is read from the row labelled 0 of its frame; .at returns the
# single cell directly.
n_primary = int(na_frac.at[0, "n_primary_rows"])
frac_na = na_frac.at[0, "frac_lang_NA"]

n_na = int(fractions.at[0, "n_edges_lang_NA"])

frac_dst_en_all, frac_src_en_all = (
    fractions.at[0, column]
    for column in ("frac_dst_en_over_all", "frac_src_en_over_all")
)
frac_dst_en_known, frac_src_en_known = (
    fractions.at[0, column]
    for column in ("frac_dst_en_over_known", "frac_src_en_over_known")
)
dst_cov, src_cov = (
    fractions.at[0, column]
    for column in ("dst_lang_coverage", "src_lang_coverage")
)

# ---- human-readable report ----
# Entries that start with "\n" produce a blank line before that section once
# the list is joined with newlines.
report_lines = [
    f"Among {n_primary:,} edges with primary = 1, "
    f"{frac_na:.2%} have lang = 'NA' (source and destination are classified as different languages).",
    f"\nRestricting to those edges where lang = 'NA' " f"({n_na:,} edges):",
    f"- {frac_dst_en_all:.2%} have destination language = 'en' "
    f"(lookup coverage: {dst_cov:.2%}).",
    f"- {frac_src_en_all:.2%} have source language = 'en' " f"(lookup coverage: {src_cov:.2%}).",
    "\nAmong edges with a known language in the lookup table:",
    f"- {frac_dst_en_known:.2%} have destination language = 'en'.",
    f"- {frac_src_en_known:.2%} have source language = 'en'.",
]
print("\n".join(report_lines))

Among 652,371,564 edges with primary = 1, 16.68% have lang = 'NA' (source and destination are classified as different languages).

Restricting to those edges where lang = 'NA' (108,786,623 edges):
- 19.66% have destination language = 'en' (lookup coverage: 100.00%).
- 25.50% have source language = 'en' (lookup coverage: 100.00%).

Among edges with a known language in the lookup table:
- 19.66% have destination language = 'en'.
- 25.50% have source language = 'en'.
