# Visualizations for the "What's in my big data" QC workflow.


In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator, MultipleLocator
from pathlib import Path
import os
import pandas as pd
import numpy as np

from dotenv import load_dotenv

from telegram_quality_control.visualization import (
    single_col_figure,
    double_col_figure,
    get_color_cycle,
)

from cmcrameri import cm

# Target medium for the figures; not read by any of the code visible in this notebook.
figure_style = "print"

# Apply the project's matplotlib style sheet (path is relative to the working directory).
plt.style.use('./resources/mpl_styles/default.mplstyle')

# Read settings such as OUTPUT_FOLDER from a local .env file into the process environment.
load_dotenv(".env")

# Index os.environ directly so that a missing OUTPUT_FOLDER fails with a KeyError
# naming the variable, rather than the opaque TypeError raised by Path(None).
data_folder = Path(os.environ["OUTPUT_FOLDER"])

## Length distribution


In [None]:
# Plot the message-length distribution and flag conspicuous lengths.
# The CSV is read for its "length" and "count" columns.
length_distribution = pd.read_csv(data_folder / "message_length.csv")

fig = single_col_figure(0.6)
ax = fig.add_subplot(1, 1, 1)

# Message lengths that receive an arrow and a marker in the figure.
suspicious_length = [28, 80, 288, 1024]

# Reference notes on what each conspicuous length corresponds to. These are not
# drawn on the figure; note that 63 is listed here but is not in suspicious_length.
explanations = {
    28: "YouTube links (like https://youtu.be/abcdefghijk)",
    63: "System message",
    80: "System message",
    288: "Ad message in Arabic",
    1024: "Caption length limit",
}

# Draw the (length, count) pairs as a step curve
ax.step(length_distribution["length"], length_distribution["count"], where="mid", linewidth=1)

ax.set_xlim(1, 4096)
ax.set_ylim(0, 6e7)
ax.set_xscale("log")

# One distinct marker per suspicious length
markers = ['*', 's', 'D', '^', 'P', 'H']  # star, square, diamond, triangle, plus, hexagon

colors = get_color_cycle()

# Add an arrow and a marker for each suspicious length
for i, length in enumerate(suspicious_length):
    # Look up the count recorded for this exact length; skip lengths absent from the data
    matching_row = length_distribution[length_distribution['length'] == length]
    if matching_row.empty:
        continue

    # Read from the column (not the row) so the value keeps the column's dtype
    count = matching_row["count"].iloc[0]

    # Index i + 1 skips the first color of the cycle
    # (presumably the one used by the step curve -- confirm).
    color = colors[i + 1]

    # Downward-pointing arrow: tail 1e7 above the curve, head on the curve
    ax.annotate(
        '',
        xy=(length, count),
        xytext=(length, count + 1e7),
        arrowprops=dict(arrowstyle='->', lw=1, color=color),
    )

    # Marker slightly above the arrow tail
    ax.plot(
        length,
        count + 1.3e7,
        color=color,
        marker=markers[i],
        markersize=5,
    )

ax.set_xlabel("Message length (characters)")
ax.set_ylabel("Number of messages")

# Margins are set by hand (instead of fig.tight_layout()). Adjust the figure
# object itself rather than whatever pyplot considers the current figure.
fig.subplots_adjust(left=0.15, bottom=0.25, top=0.9, right=0.98)

fig.savefig(data_folder / "message_length_dist.pdf")
fig.savefig(data_folder / "message_length_dist.png")

In [None]:
# System message texts; their character counts are printed below, one per line.
system_messages = [
    "This message couldn't be displayed on your device due to copyright infringement.",
    "This channel can’t be displayed because it violated Telegram's Terms of Service.",
    "This channel can’t be displayed because it violated local laws.",
]

print("\n".join(str(len(text)) for text in system_messages))

In [None]:
from telegram_quality_control.db import get_conn_string

# Connection string for the project database.
db_url = get_conn_string()

# Fetch up to 1000 texts of exactly 288 characters from a 1% Bernoulli sample
# of the message_content table.
sql = """
SELECT text
FROM message_content TABLESAMPLE BERNOULLI (1)
WHERE LENGTH(text) = 288
limit 1000;
"""

messages = pd.read_sql_query(sql=sql, con=db_url)

In [None]:
# Tally how often each distinct text occurs among the sampled messages.
text_counts = messages["text"].value_counts()
text_counts

In [None]:
# Prefix used below to find messages via a SQL LIKE match.
# NOTE(review): presumably the opening of the recurring 288-character message
# found in the sample above; roughly "Sorry, you must join this channel first," -- confirm.
spam_text = "⚠️︙عذراً، عليك الانضمام الى هذهِ القناة أولاً،"

In [None]:
# Collect the chat id for up to 100 messages whose text starts with spam_text.
# NOTE(review): spam_text is interpolated straight into the SQL text; this is only
# safe while it remains a trusted constant without quote characters.
# NOTE(review): the doubled %% presumably escapes a literal % wildcard for the DB
# driver's parameter formatting -- confirm for the driver in use.
sql = f"""
SELECT c.id
from messages m
JOIN message_content mc ON m.id = mc.message_id
JOIN chats c ON m.chat_id = c.id
WHERE mc.text LIKE '{spam_text}%%'
LIMIT 100;
"""

# db_url is the connection string created in an earlier cell.
chat_ids = pd.read_sql_query(sql, db_url)

In [None]:
# Count how many of the returned rows belong to each chat id.
chat_counts = chat_ids["id"].value_counts()
chat_counts