# Telegram Bot — Data Preprocessing
This notebook loads a Telegram JSON export, flattens messages, filters junk, cleans text, exports a clean dataset, and optionally creates a balanced sample for manual labeling.

**Important:** Do **NOT** commit your raw Telegram data to GitHub. Use `.gitignore` to exclude `data/` and any private folders.

## Cell 1 — Imports + file path

In [None]:
import json
import re
from pathlib import Path

import pandas as pd

# ✅ change filename to your real export json name
JSON_PATH = Path("../data/raw/result.json")

# Fail fast with a real exception instead of `assert`: assertions are removed
# when Python runs with -O, which would let a missing file slip through to
# the load cell.
if not JSON_PATH.exists():
    raise FileNotFoundError(f"File not found: {JSON_PATH.resolve()}")
print("Using:", JSON_PATH.resolve())

## Cell 2 — Load JSON + inspect structure

In [None]:
# Read the whole export into memory as plain Python objects.
with JSON_PATH.open("r", encoding="utf-8") as fh:
    data = json.load(fh)

print("Top-level type:", type(data))

# Show just enough of the top-level shape to tell which export layout this is.
if isinstance(data, dict):
    top_keys = list(data.keys())
    print("Top-level keys (first 30):", top_keys[:30])
elif isinstance(data, list):
    first_type = type(data[0]) if len(data) else None
    print("List length:", len(data))
    print("First element type:", first_type)

## Cell 3 — Helper: Telegram `text` to string

In [None]:
def tg_text_to_str(text_field):
    """
    Flatten a Telegram JSON export 'text' value into a single plain string.

    Accepted inputs:
      - None            -> ""
      - str             -> returned unchanged
      - list of parts   -> each part is either a str or a dict carrying the
                           fragment under its "text" key; fragments are
                           concatenated in order. List items of any other
                           type, and dicts with a missing/null "text", are
                           skipped.
      - anything else   -> str(value)
    """
    if text_field is None:
        return ""
    if isinstance(text_field, str):
        return text_field
    if isinstance(text_field, list):
        pieces = []
        for part in text_field:
            if isinstance(part, dict):
                fragment = part.get("text")
            elif isinstance(part, str):
                fragment = part
            else:
                continue
            if fragment is None:
                continue
            # "".join only accepts strings; coerce odd fragment values
            # instead of letting the join raise TypeError.
            pieces.append(fragment if isinstance(fragment, str) else str(fragment))
        return "".join(pieces)
    return str(text_field)

## Cell 4 — Extract chats/messages blocks (supports common Telegram formats)
If you get `Chats found: 0`, run Cell 2 and share the keys you see; Telegram exports sometimes differ.

In [None]:
def extract_message_blocks(data):
    """
    Collect the message lists found in a parsed Telegram JSON export.

    Returns a list of blocks: [{"chat_name": ..., "messages": [...]}, ...]
    Supports:
      1) {"name": ..., "messages": [...]}              (single chat; a bare
         {"messages": [...]} without a name is handled by the same branch)
      2) {"chats": {"list": [{name, messages}, ...]}}  (full export)
    Any other shape (including a non-dict top level) yields an empty list.

    The chat name is read from "name", then "title". If both are missing,
    null or empty, "unknown_chat" is used.
    """
    def _block(chat):
        # `or` instead of a .get() default: a key that is present but null
        # would otherwise bypass the default and give chat_name=None.
        name = chat.get("name") or chat.get("title") or "unknown_chat"
        return {"chat_name": name, "messages": chat["messages"]}

    if not isinstance(data, dict):
        return []

    # case 1: single chat export
    if isinstance(data.get("messages"), list):
        return [_block(data)]

    # case 2: full export with chats.list
    chats = data.get("chats")
    chat_list = chats.get("list") if isinstance(chats, dict) else None
    if not isinstance(chat_list, list):
        return []

    # Entries that are not dicts or have no message list are skipped.
    return [
        _block(chat)
        for chat in chat_list
        if isinstance(chat, dict) and isinstance(chat.get("messages"), list)
    ]

# Run the extraction on the loaded export and report what was found.
blocks = extract_message_blocks(data)
n_chats = len(blocks)
print("Chats found:", n_chats)

if blocks:
    example_chat = blocks[0]["chat_name"]
else:
    example_chat = "NONE"
print("Example chat:", example_chat)

## Cell 5 — Flatten messages into DataFrame

In [None]:
# Fixed column set for the flattened table. Passing it to the DataFrame
# constructor guarantees the columns exist even when no messages were found,
# so later cells that index df["date"], df["type"] or df["text_raw"] do not
# fail with KeyError on an empty export.
MESSAGE_COLUMNS = [
    "chat_name",
    "msg_id",
    "date",
    "from_name",
    "from_id",
    "type",
    "text_raw",
    "reply_to_msg_id",
]

rows = []

for block in blocks:
    chat_name = block["chat_name"]
    for m in block["messages"]:
        # Skip anything in the message list that is not a JSON object.
        if not isinstance(m, dict):
            continue

        # 'text' may be a string or a list of parts; flatten to plain text.
        text_raw = tg_text_to_str(m.get("text"))

        rows.append({
            "chat_name": chat_name,
            "msg_id": m.get("id"),
            "date": m.get("date"),
            "from_name": m.get("from"),
            "from_id": m.get("from_id"),
            "type": m.get("type"),
            "text_raw": text_raw,
            "reply_to_msg_id": m.get("reply_to_message_id"),
        })

df = pd.DataFrame(rows, columns=MESSAGE_COLUMNS)
print("Rows:", len(df))
df.head(5)

## Cell 6 — Parse date + basic checks

In [None]:
# Convert the raw date values to datetimes; errors="coerce" turns anything
# unparseable into NaT instead of raising.
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Quick sanity overview of what was loaded.
column_names = df.columns.tolist()
type_counts = df["type"].value_counts(dropna=False).head(10)
sample_texts = df["text_raw"].dropna().astype(str).head(5).to_list()

print("Columns:", column_names)
print("\nType counts:\n", type_counts)
print("\nSample texts:\n", sample_texts)

# Step 3 — Filter junk (service + empty + too-short noise)
Keeps messages that are **not** service messages, have non-empty text, and have at least 2 whitespace-separated words (a shorter message is kept only if it contains `?`).

## Cell 7 — Filter

In [None]:
# Make sure every text value is a real string before using .str accessors.
df["text_raw"] = df["text_raw"].fillna("").astype(str)

# Flag rows that carry no usable text or are Telegram service events.
df["is_empty_text"] = df["text_raw"].str.strip().eq("")
df["is_service"] = df["type"].astype(str).str.lower().eq("service")

# Drop both kinds of junk in one pass.
is_junk = df["is_service"] | df["is_empty_text"]
df_keep = df[~is_junk].copy()

# Basic length features on the raw text.
df_keep["len_chars"] = df_keep["text_raw"].str.len()
df_keep["len_words"] = df_keep["text_raw"].str.split().apply(len)

# Stronger filter: a message survives with two or more words, or when it
# contains a question mark regardless of length.
enough_words = df_keep["len_words"] >= 2
has_question_mark = df_keep["text_raw"].str.contains(r"\?", na=False)
df_keep = df_keep[enough_words | has_question_mark].copy()

print("Before:", len(df), "| After filter:", len(df_keep))
df_keep[["chat_name","from_name","text_raw"]].head(10)

# Step 4 — Clean text + features

## Cell 8 — Clean function

In [None]:
# Patterns behind the placeholders inserted by clean_text().
#
# URL: starts at http(s):// or www. and ends at the last word character, so
# trailing punctuation such as "." or ")" stays outside the match. A single
# "/" directly after that point is consumed too; otherwise
# "https://example.com/" would leave a stray "/" token that inflates the
# word count of URL-only messages.
URL_RE = re.compile(r"""(?i)\b((?:https?://|www\.)\S+\b/?)""")
MENTION_RE = re.compile(r"(?<!\w)@\w+")
EMAIL_RE = re.compile(r"(?i)\b[\w\.-]+@[\w\.-]+\.\w+\b")
PHONE_RE = re.compile(r"(?<!\w)(?:\+?\d[\d\s\-().]{6,}\d)(?!\w)")
NUM_RE = re.compile(r"\b\d+\b")
MULTISPACE_RE = re.compile(r"\s+")

def clean_text(s: str) -> str:
    """
    Normalize one message for modeling.

    Steps, in order: coerce to str, turn newlines/tabs into spaces, lowercase,
    replace URLs, e-mail addresses, @mentions and phone-like digit runs with
    the placeholders <URL>, <EMAIL>, <USER>, <PHONE>, replace remaining
    standalone integers with <NUM>, then collapse whitespace and strip.

    Returns the cleaned string (possibly empty).
    """
    s = str(s)
    s = s.replace("\n", " ").replace("\t", " ")
    s = s.lower()

    # E-mails are replaced before mentions so the "@domain" part of an
    # address is never taken for a user mention.
    s = URL_RE.sub(" <URL> ", s)
    s = EMAIL_RE.sub(" <EMAIL> ", s)
    s = MENTION_RE.sub(" <USER> ", s)
    s = PHONE_RE.sub(" <PHONE> ", s)

    # optional: normalize standalone numbers (after phones, so a phone number
    # is not split into several <NUM> tokens first)
    s = NUM_RE.sub("<NUM>", s)

    s = MULTISPACE_RE.sub(" ", s).strip()
    return s

## Cell 9 — Apply cleaning + remove empty after cleaning

In [None]:
# Work on a copy so the filtered frame from the previous step stays intact.
df_work = df_keep.copy()

df_work["text_clean"] = df_work["text_raw"].apply(clean_text)

# Length features measured on the cleaned text.
cleaned = df_work["text_clean"]
df_work["clean_len_chars"] = cleaned.str.len()
df_work["clean_len_words"] = cleaned.str.split().apply(len)

# Same rule as the raw filter, re-applied after cleaning: keep rows with two
# or more words, or any row that still contains a question mark.
still_long_enough = df_work["clean_len_words"] >= 2
still_has_question = df_work["text_clean"].str.contains(r"\?", na=False)
df_work = df_work[still_long_enough | still_has_question].copy()

print("Rows after cleaning:", len(df_work))
df_work[["text_raw","text_clean"]].head(10)

## Cell 10 — Add simple features (useful later)

In [None]:
# Placeholder-based flags: the cleaning step inserts these tokens.
clean_col = df_work["text_clean"]
df_work["has_url"] = clean_col.str.contains(r"<URL>", regex=True)
df_work["has_mention"] = clean_col.str.contains(r"<USER>", regex=True)

# A message counts as a question if the raw text has a "?" or the cleaned
# text opens with a common question word.
raw_has_qmark = df_work["text_raw"].str.contains(r"\?", na=False)
opens_with_wh = clean_col.str.startswith(("why ", "how ", "what ", "when ", "where "))
df_work["is_question"] = raw_has_qmark | opens_with_wh

# Time-of-day / day-of-week features from the parsed timestamp.
timestamps = df_work["date"].dt
df_work["hour"] = timestamps.hour
df_work["weekday"] = timestamps.weekday

# Optional dedup: same chat + same sender + same cleaned text is kept once.
rows_before = len(df_work)
df_work = df_work.drop_duplicates(subset=["chat_name","from_id","text_clean"])
rows_removed = rows_before - len(df_work)
print("Dedup removed:", rows_removed)

# Step 5 — Export clean dataset

## Cell 11 — Export `clean_messages.csv`

In [None]:
# Output folder for the cleaned dataset; created on demand.
OUT_DIR = Path("../data/final")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Columns to export, in this order.
KEEP_COLS = [
    "chat_name","msg_id","date","from_name","from_id","type",
    "text_raw","text_clean",
    "len_chars","len_words","clean_len_chars","clean_len_words",
    "has_url","has_mention","is_question","hour","weekday"
]

# Only export the columns that actually exist, preserving KEEP_COLS order.
available = set(df_work.columns)
keep_existing = [col for col in KEEP_COLS if col in available]
df_export = df_work.loc[:, keep_existing].copy()

clean_path = OUT_DIR / "clean_messages.csv"
df_export.to_csv(clean_path, index=False, encoding="utf-8")

print("Saved:", clean_path.resolve(), "| rows:", len(df_export))
df_export.head(5)

# Step 5.1 — Priority scoring (optional but recommended)

## Cell 12 — Set VIP/Groups + priority score
Start with empty sets, run Cell 14 to see the top senders/chats, then fill `VIP_PEOPLE`, `IMPORTANT_GROUPS`, `LOW_PRIORITY_GROUPS` with exact names from that output and re-run this cell.

In [None]:
# EDIT THESE LATER (start empty, then fill after you inspect top senders/chats)
VIP_PEOPLE = set()
IMPORTANT_GROUPS = set()
LOW_PRIORITY_GROUPS = set()

df_base = df_export.copy()
df_base["from_name"] = df_base["from_name"].fillna("")
df_base["chat_name"] = df_base["chat_name"].fillna("")
df_base["text_clean"] = df_base["text_clean"].fillna("")

# simple group heuristic (we can improve later)
df_base["is_group"] = df_base["chat_name"].str.contains(r"(group|чат|канал|channel)", case=False, regex=True)

URGENT_RE = re.compile(r"\b(urgent|asap|today|tomorrow|deadline|exam|please|call|help)\b", re.I)
df_base["has_urgent_words"] = df_base["text_clean"].str.contains(URGENT_RE)
df_base["is_question_like"] = df_base["text_clean"].str.contains(r"\?") | df_base["text_clean"].str.startswith(("why ","how ","what ","when ","where "))

df_base["is_vip_person"] = df_base["from_name"].isin(VIP_PEOPLE)
df_base["is_important_group"] = df_base["chat_name"].isin(IMPORTANT_GROUPS)
df_base["is_lowprio_group"] = df_base["chat_name"].isin(LOW_PRIORITY_GROUPS)

df_base["priority_score"] = (
    5 * df_base["is_vip_person"].astype(int)
    + 3 * df_base["is_important_group"].astype(int)
    - 2 * df_base["is_lowprio_group"].astype(int)
    + 2 * df_base["is_question_like"].astype(int)
    + 2 * df_base["has_urgent_words"].astype(int)
    - 1 * df_base["is_group"].astype(int)
)

df_base[["priority_score","chat_name","from_name","text_clean"]].head(10)

# Step 5.2 — Create balanced labeling file

## Cell 13 — Balanced sample → `to_label_balanced_1000.csv`

In [None]:
def sample_quota(df, mask, n, seed=42):
    """
    Draw up to `n` random rows from the rows of `df` selected by `mask`.

    If fewer than `n` rows match, all matching rows are returned (shuffled).
    If none match, the empty selection is returned as-is. `seed` is passed to
    pandas as `random_state`, so repeated calls give the same sample.
    """
    candidates = df[mask].copy()
    available = len(candidates)
    if available:
        return candidates.sample(min(n, available), random_state=seed)
    return candidates

N = 1000
dedup_keys = ["chat_name", "from_id", "text_clean"]

# Per-category quotas (400 + 250 + 250 + 100 = N when every pool is large enough).
vip_dm      = sample_quota(df_base, df_base["is_vip_person"] & ~df_base["is_group"], 400)
normal_dm   = sample_quota(df_base, ~df_base["is_vip_person"] & ~df_base["is_group"], 250)
imp_groups  = sample_quota(df_base, df_base["is_important_group"], 250)
noise_group = sample_quota(df_base, df_base["is_group"] & ~df_base["is_important_group"], 100)

# Keep df_base's row index (no ignore_index) so rows that were already picked
# can be recognised below. This relies on df_base having a unique index,
# which holds when it comes from the cells above (a RangeIndex that is only
# ever filtered).
to_label = pd.concat([vip_dm, normal_dm, imp_groups, noise_group])

# The quotas are not mutually exclusive: `imp_groups` does not require
# is_group, so a chat listed in IMPORTANT_GROUPS whose name misses the group
# heuristic is also eligible for the DM quotas. Drop rows picked twice.
to_label = to_label[~to_label.index.duplicated(keep="first")]
to_label = to_label.drop_duplicates(subset=dedup_keys)

# fallback if your VIP/GROUP lists are empty (very likely now)
if len(to_label) < N:
    remaining = N - len(to_label)
    # Top up only from rows that are NOT already selected. Drawing from all
    # of df_base re-picks sampled rows, which the de-duplication then
    # removes, leaving fewer than N rows although unused rows were available.
    unused = df_base.drop(index=to_label.index)
    extra = unused.sort_values("priority_score", ascending=False).head(remaining)
    to_label = pd.concat([to_label, extra]).drop_duplicates(subset=dedup_keys)

# Shuffle so the labeler does not see the categories in blocks.
to_label = to_label.sample(min(N, len(to_label)), random_state=42).reset_index(drop=True)

to_label["label"] = ""  # you fill: important / normal / ignore

out_path = OUT_DIR / "to_label_balanced_1000.csv"
to_label.to_csv(out_path, index=False, encoding="utf-8")

print("Saved:", out_path.resolve(), "| rows:", len(to_label))
to_label[["priority_score","chat_name","from_name","text_clean","label"]].head(10)

# Helper — Choose VIPs and groups

## Cell 14 — Show top senders + top chats

In [None]:
# Show the 25 most frequent chats and senders to help fill the VIP/group sets.
for heading, column in (("Top chats:", "chat_name"), ("\nTop senders:", "from_name")):
    print(heading)
    display(df_export[column].value_counts().head(25))

# Next steps
1. Open `data/final/to_label_balanced_1000.csv`
2. Fill `label` with: `important` / `normal` / `ignore`
3. Then continue with Step 6: train/val/test split + baseline TF‑IDF model.