In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator, MultipleLocator
from pathlib import Path
import os
import pandas as pd
import numpy as np

from dotenv import load_dotenv

from telegram_quality_control.kde import kde_fit
from telegram_quality_control.bootstrap import cluster_bootstrap_CI
from telegram_quality_control.visualization import single_col_figure, get_color_cycle

from cmcrameri import cm

# Notebook-wide configuration: figure style, matplotlib style sheet and the
# folder that holds the input/output tables.
figure_style = "print"

plt.style.use('./resources/mpl_styles/default.mplstyle')

# Pull settings (notably OUTPUT_FOLDER) from the local .env file into os.environ.
load_dotenv(".env")

# Fail with an actionable message instead of the opaque TypeError that
# Path(None) raises when the variable is missing.
try:
    data_folder = Path(os.environ["OUTPUT_FOLDER"])
except KeyError as err:
    raise RuntimeError(
        "Environment variable OUTPUT_FOLDER is not set; "
        "define it in .env or in the environment before running this notebook."
    ) from err

In [None]:
# Location of the matched-URL table for each platform, relative to data_folder.
data_paths = dict(
    telegram=data_folder / "matched_urls.parquet",
    twitter=None,  # TODO: copy the table
)

# Only the Telegram table is available, so that is what url_df holds.
url_df = pd.read_parquet(data_paths["telegram"])
url_df

In [None]:
# Share of rated URLs (non-null reliability) whose score is below the 0.6 cut-off.
reliability = url_df["reliability_updated"]

# Count through boolean sums rather than materialising filtered frames.
num_reliable = int((reliability >= 0.6).sum())
total_urls = int(reliability.notnull().sum())

print(f"Fraction of unreliable URLs: {1 - num_reliable / total_urls}")

In [None]:
# Keep a 1% random subsample of the URLs for the cells below.
# The seed makes the draw reproducible across kernel restarts.
# NOTE: this rebinds url_df, so re-running the cell without reloading the data
# shrinks the frame again.
url_df = url_df.sample(frac=0.01, random_state=0)

In [None]:
# Relative frequency of each domain among the rows currently in url_df.
domain_shares = url_df["domain"].value_counts(normalize=True)
domain_shares

In [None]:
# Reliability density estimate per platform, with a cluster-bootstrap band.
# Each platform's result is cached as a 4-row array in
# data_folder / "<platform>_url_reliability.npy":
# rows are q_points, prob, lower, upper.

# Bandwidth shared by the point estimate and every bootstrap refit.
KDE_BANDWIDTH = 0.05


def wrapper(df):
    """Return only the density values (second item of kde_fit's result) for `df`."""
    return kde_fit(df["reliability_updated"], bandwidth=KDE_BANDWIDTH)[1]


kde = {}

for platform in ["telegram", "twitter"]:
    save_path = data_folder / f"{platform}_url_reliability.npy"

    if save_path.exists():
        data = np.load(save_path)
    elif platform == "telegram":
        # url_df holds the Telegram URLs; only rows with a reliability score are used.
        mask = url_df["reliability_updated"].notna()
        q_points, prob, _ = kde_fit(url_df[mask]["reliability_updated"], bandwidth=KDE_BANDWIDTH)
        # Resampling is clustered by domain; quantile=0.95 presumably yields the
        # 95% band that the figure labels as "95% CI" -- confirm against
        # cluster_bootstrap_CI.
        lower, upper = cluster_bootstrap_CI(
            url_df[mask],
            wrapper,
            cluster_col="domain",
            num_bootstrap=1000,
            num_workers=24,
            quantile=0.95,
        )
        data = np.array([q_points, prob, lower, upper])
        np.save(save_path, data)
    else:
        # No URL table is loaded for this platform. Fitting on url_df here would
        # store Telegram results under another platform's name, so stop instead.
        raise FileNotFoundError(
            f"No cached KDE at {save_path} and no URL table loaded for platform "
            f"{platform!r}; copy the precomputed file or provide the data first."
        )

    kde[platform] = {
        name: data[row, :]
        for row, name in enumerate(("q_points", "prob", "lower", "upper"))
    }

In [None]:
# Plot the reliability density of both platforms with their bootstrap bands
# and save the figure as PDF and PNG.
fig = single_col_figure(0.8)
ax = fig.add_subplot(111)

colors = {"telegram": get_color_cycle()[0], "twitter": "gray"}
lines = {}
CI_bands = {}

# Twitter is plotted first, so the Telegram artists are added after it.
for platform in ["twitter", "telegram"]:
    lines[platform] = ax.plot(
        kde[platform]["q_points"],
        kde[platform]["prob"],
        color=colors[platform],
        label=platform.capitalize(),
    )[0]

    CI_bands[platform] = ax.fill_between(
        kde[platform]["q_points"],
        kde[platform]["lower"],
        kde[platform]["upper"],
        alpha=0.2,
        label="95% CI",
        color=colors[platform],
        edgecolor=None,
    )

ax.set_xlim(0, 1)
ax.set_ylim(bottom=0)

# ax.set_title("Distribution of URLs by reliability")
ax.set_xlabel("Reliability")
ax.set_ylabel("P(reliability)")

ax.xaxis.set_minor_locator(MultipleLocator(0.1))
ax.set_yticks([0, 0.01, 0.02, 0.03, 0.04])
ax.yaxis.set_minor_locator(MultipleLocator(0.005))

# A single "95% CI" legend entry (the Twitter band) stands in for both bands.
ax.legend(handles=[lines["telegram"], lines["twitter"], CI_bands["twitter"]], framealpha=0.6)

fig.tight_layout()

# Create the output directory on demand so savefig does not fail on a fresh checkout.
figures_dir = Path("./figures")
figures_dir.mkdir(parents=True, exist_ok=True)

for extension in ("pdf", "png"):
    fig.savefig(figures_dir / f"url_reliability.{extension}", bbox_inches='tight')