# TG Bot — Preprocessing (Personal Chats Only)
**Goal:** turn a Telegram JSON export into a clean dataset for manual labeling and model training.

**Outputs**:
- `personal_clean_messages.csv` — clean messages from personal chats only
- `to_label_personal_balanced_1000.csv` — balanced sample for manual labeling

## 1) Load JSON

In [None]:
import json
import re
from pathlib import Path

import pandas as pd

# Location of the Telegram JSON export (relative to the working directory).
JSON_PATH = Path("../data/personal_chats.json")

# Fail fast with an explicit exception: unlike `assert`, this check is not
# stripped when Python runs with -O.
if not JSON_PATH.exists():
    raise FileNotFoundError(f"File not found: {JSON_PATH.resolve()}")
print("Using:", JSON_PATH.resolve())

with JSON_PATH.open("r", encoding="utf-8") as f:
    data = json.load(f)

# Quick look at the top-level structure of the parsed export.
print("Top-level type:", type(data))
if isinstance(data, dict):
    print("Top-level keys (first 30):", list(data.keys())[:30])

## 2) Helpers: extract chats + normalize Telegram `text`

In [None]:
def tg_text_to_str(text_field):
    """Normalize a Telegram ``text`` field to a plain string.

    Args:
        text_field: ``None``, a string, or a list whose items are strings
            and/or dicts carrying a ``"text"`` key. Any other value is
            converted with ``str()``.

    Returns:
        ``""`` for ``None``; the string itself for a string; for a list, the
        concatenation of its string items and of each dict item's ``"text"``
        value (list items of any other type are skipped).
    """
    if text_field is None:
        return ""
    if isinstance(text_field, str):
        return text_field
    if isinstance(text_field, list):
        pieces = []
        for part in text_field:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict):
                # Normalize the nested value as well: a missing or null
                # "text" contributes "" and a non-string value is
                # stringified, so the join below never sees a non-str item.
                pieces.append(tg_text_to_str(part.get("text")))
        return "".join(pieces)
    return str(text_field)


def extract_message_blocks(data):
    """Collect per-chat message blocks from a parsed export.

    Two layouts are recognized:

    * a single chat: a dict with a ``"messages"`` list at the top level;
    * a full export: a dict whose ``data["chats"]["list"]`` is a list of
      chat dicts, each with its own ``"messages"`` list.

    Args:
        data: The object returned by ``json.load``.

    Returns:
        A list of dicts with keys ``chat_name`` (``"name"``, else
        ``"title"``, else ``"unknown_chat"``), ``chat_type`` (``"type"`` or
        ``None``) and ``messages``. The list is empty when neither layout
        matches.
    """

    def _to_block(chat):
        # A usable chat is a dict that carries a list under "messages".
        if not isinstance(chat, dict):
            return None
        if not isinstance(chat.get("messages"), list):
            return None
        return {
            "chat_name": chat.get("name", chat.get("title", "unknown_chat")),
            "chat_type": chat.get("type", None),
            "messages": chat["messages"],
        }

    # Layout 1: the export is itself one chat.
    single_chat = _to_block(data)
    if single_chat is not None:
        return [single_chat]

    # Layout 2: chats nested under data["chats"]["list"].
    if not isinstance(data, dict):
        return []
    chats = data.get("chats")
    chat_list = chats.get("list") if isinstance(chats, dict) else None
    if not isinstance(chat_list, list):
        return []

    candidates = (_to_block(chat) for chat in chat_list)
    return [block for block in candidates if block is not None]


# Run the extraction on the loaded export and preview the first chat found.
blocks = extract_message_blocks(data)
first_block = blocks[0] if blocks else None

print("Chats found:", len(blocks))
print("Example chat:", "NONE" if first_block is None else first_block["chat_name"])
print("Example type:", None if first_block is None else first_block.get("chat_type"))

## 3) Flatten to DataFrame (minimal fields)
We keep only the fields we need for cleaning + labeling.

In [None]:
# Columns of the flattened frame. Passing them to the DataFrame constructor
# keeps the schema stable even when no messages were extracted; without it
# an empty `rows` list yields a frame with no "date" column and the date
# conversion below raises KeyError.
MESSAGE_COLUMNS = [
    "chat_name", "chat_type", "msg_id", "date",
    "from_name", "from_id", "type", "text_raw",
]

rows = []

for block in blocks:
    chat_name = block["chat_name"]
    chat_type = block.get("chat_type")

    for m in block["messages"]:
        # Skip anything that is not a message object.
        if not isinstance(m, dict):
            continue

        rows.append({
            "chat_name": chat_name,
            "chat_type": chat_type,
            "msg_id": m.get("id"),
            "date": m.get("date"),
            "from_name": m.get("from"),
            "from_id": m.get("from_id"),
            "type": m.get("type"),
            # "text" may be a string or a list of parts; store it flattened.
            "text_raw": tg_text_to_str(m.get("text")),
        })

df = pd.DataFrame(rows, columns=MESSAGE_COLUMNS)
# Missing or unparseable dates become NaT instead of raising.
df["date"] = pd.to_datetime(df["date"], errors="coerce")

print("Rows:", len(df))
print("Columns:", df.columns.tolist())
df.head(5)

## 4) Filter to personal chats only

Telegram exports differ, so we use a **robust heuristic**:
1) If `chat_type` exists and looks like a private/personal chat → keep
2) Otherwise, infer personal chats from the **number of unique senders per chat**: ≤ 2 usually means you + one other person
3) Exclude obvious groups/channels by name patterns

This is conservative: it may drop a few true DMs, but it avoids group noise.

In [None]:
df2 = df.copy()

# Basic clean: make the string columns NaN-free so the .str operations
# below behave uniformly.
for col in ("chat_name", "from_id", "from_name", "type", "text_raw"):
    df2[col] = df2[col].fillna("").astype(str)

# Remove service + empty messages early so the sender-count heuristic works
# on real messages only.
df2["is_service"] = df2["type"].str.lower().eq("service")
df2["is_empty_text"] = df2["text_raw"].str.strip().eq("")
df2 = df2[~df2["is_service"] & ~df2["is_empty_text"]].copy()

# Exclude obvious groups/channels by name patterns (edit if needed).
# The alternation uses a non-capturing group: pandas' str.contains emits a
# UserWarning for patterns with capture groups; matching is unchanged.
GROUP_NAME_RE = re.compile(
    r"(?:\||\bgroup\b|\bchannel\b|\bканал\b|\bчат\b|\bобъявлен|announcement|broadcast)",
    re.I,
)
df2["name_looks_group"] = df2["chat_name"].str.contains(GROUP_NAME_RE)

# If chat_type exists, use it as a hint.
# Common values seen in exports: 'personal_chat', 'private', 'public_group',
# 'private_group', 'channel'. A type counts as personal when it mentions
# personal/private but not group/channel (so 'private_group' is rejected).
df2["chat_type_norm"] = df2["chat_type"].fillna("").astype(str).str.lower()
mentions_personal = df2["chat_type_norm"].str.contains(r"personal|private", regex=True)
mentions_group = df2["chat_type_norm"].str.contains(r"group|channel", regex=True)
df2["type_looks_personal"] = mentions_personal & ~mentions_group

# Sender-count heuristic: a personal chat usually has <= 2 distinct from_id.
# NOTE(review): chats are keyed by chat_name, so chats that share a name are
# counted together, and an empty from_id counts as a sender of its own.
sender_counts = df2.groupby("chat_name")["from_id"].nunique().rename("n_unique_senders")
df2 = df2.merge(sender_counts, on="chat_name", how="left")
# fillna(999) is a guard: a chat without a count is treated as not personal.
df2["looks_personal_by_senders"] = df2["n_unique_senders"].fillna(999).astype(int) <= 2

# Final keep rule:
# - keep if chat_type says personal OR the sender heuristic says personal
# - AND chat_name does NOT look like a group
df_personal = df2[
    (df2["type_looks_personal"] | df2["looks_personal_by_senders"])
    & (~df2["name_looks_group"])
].copy()

print("After personal-chat filter:", len(df_personal))
print("Unique chats kept:", df_personal["chat_name"].nunique())

df_personal["chat_name"].value_counts().head(15)

## 5) Clean text (light cleaning) + minimal features
We keep cleaning **simple** so we don't destroy meaning.
Tokens like `<URL>` and `<USER>` help the model learn patterns without memorizing specifics.

In [None]:
# A URL starts with http://, https:// or www. and runs up to the last word
# character or "/" before the next whitespace. Trailing sentence punctuation
# stays outside the match, while a trailing slash is consumed with the URL
# instead of being left behind as a stray "/" token.
URL_RE = re.compile(r"\b(?:https?://|www\.)\S*[\w/]", re.I)
# "@name" not preceded by a word character, so e-mail addresses are left alone.
MENTION_RE = re.compile(r"(?<!\w)@\w+")
MULTISPACE_RE = re.compile(r"\s+")


def clean_text(s: str) -> str:
    """Lightly normalize one message for labeling and modeling.

    Lowercases the text, turns newlines and tabs into spaces, replaces URLs
    with ``<URL>`` and @mentions with ``<USER>``, then collapses runs of
    whitespace.

    Args:
        s: Raw message text; non-string values are converted with ``str()``.

    Returns:
        The cleaned, stripped, single-line string.
    """
    s = str(s).replace("\n", " ").replace("\t", " ").lower()
    # Placeholders are inserted after lowercasing so they stay upper-case.
    s = URL_RE.sub(" <URL> ", s)
    s = MENTION_RE.sub(" <USER> ", s)
    return MULTISPACE_RE.sub(" ", s).strip()

# Apply the cleaner to every raw message.
df_personal["text_clean"] = df_personal["text_raw"].apply(clean_text)

# Minimal features (good for a baseline and for later rules).
cleaned = df_personal["text_clean"]
question_starts = ("why ", "how ", "what ", "when ", "where ")
has_question_mark = df_personal["text_raw"].str.contains(r"\?", na=False)
starts_like_question = cleaned.str.startswith(question_starts)

df_personal["len_words"] = cleaned.str.split().apply(len)
df_personal["has_url"] = cleaned.str.contains(r"<URL>", regex=True)
df_personal["is_question"] = has_question_mark | starts_like_question

# Remove ultra-short noise (tweak the threshold if you want); questions are
# kept regardless of their length.
long_enough = df_personal["len_words"] >= 2
df_personal = df_personal[long_enough | df_personal["is_question"]].copy()

# Deduplicate within the same chat + sender + cleaned text.
n_before = len(df_personal)
df_personal = df_personal.drop_duplicates(subset=["chat_name", "from_id", "text_clean"])
print("Dedup removed:", n_before - len(df_personal))
print("Rows now:", len(df_personal))

df_personal[["chat_name","from_name","text_clean","len_words","is_question"]].head(10)

## 6) Export clean personal dataset

In [None]:
# Output folder for the exported datasets; created if it does not exist.
OUT_DIR = Path("../data/final")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Columns written to the clean dataset, in output order.
keep_cols = [
    "chat_name",
    "msg_id",
    "date",
    "from_name",
    "from_id",
    "text_raw",
    "text_clean",
    "len_words",
    "has_url",
    "is_question",
]

df_clean = df_personal.loc[:, keep_cols].copy()

clean_path = OUT_DIR.joinpath("personal_clean_messages.csv")
df_clean.to_csv(clean_path, index=False, encoding="utf-8")

print("Saved:", clean_path.resolve())
print("Rows :", len(df_clean))
df_clean.head(5)

## 7) Balanced sampling for manual labeling by sender
Goal: your labeling file should not be dominated by 1–2 people.

Strategy:
- Pick top senders (in personal chats) by message count.
- Sample **up to `MAX_PER_SENDER`** messages per sender.
- Ensure enough diversity.

Output: `to_label_personal_balanced_1000.csv`

In [None]:
# Message counts per sender name, most active first; keep the top 30.
sender_freq = df_clean["from_name"].value_counts()
top_senders = sender_freq.head(30)
top_senders

In [None]:
N_TOTAL = 1000
MAX_PER_SENDER = 80      # cap so one person does not dominate
MIN_PER_SENDER = 15      # floor for senders with enough messages (see loop)

# Work on a copy with a fresh, unique index: the row labels are used below
# to tell which rows have already been sampled.
df_lab = df_clean.reset_index(drop=True)
df_lab["from_name"] = df_lab["from_name"].fillna("").astype(str)

# Order senders by frequency (most active first).
sender_order = df_lab["from_name"].value_counts().index.tolist()

samples = []
remaining = N_TOTAL

rng = 42  # fixed seed so the per-sender samples are reproducible

for sender in sender_order:
    if remaining <= 0:
        break
    pool = df_lab[df_lab["from_name"] == sender]

    # Take up to the cap, limited by what the sender has and what is left.
    take = min(MAX_PER_SENDER, len(pool), remaining)
    # The floor can only raise `take` when MAX_PER_SENDER is configured
    # below MIN_PER_SENDER; it never exceeds the pool or the remaining budget.
    if take < MIN_PER_SENDER <= min(len(pool), remaining):
        take = MIN_PER_SENDER

    samples.append(pool.sample(take, random_state=rng))
    remaining -= take

# Keep df_lab's row labels (no ignore_index) so the sampled rows can be
# excluded from the top-up pool. An empty pool yields an empty frame instead
# of pd.concat raising on an empty list.
to_label = pd.concat(samples) if samples else df_lab.iloc[0:0]

# If still not enough (few senders), top up with random rows that were not
# sampled yet.
if len(to_label) < N_TOTAL:
    need = N_TOTAL - len(to_label)
    extra = df_lab.drop(index=to_label.index)
    if len(extra) > 0:
        top_up = extra.sample(min(need, len(extra)), random_state=99)
        to_label = pd.concat([to_label, top_up])

# Final shuffle + empty label column to be filled in by hand.
to_label = to_label.drop_duplicates(subset=["chat_name","from_id","text_clean"])
to_label = to_label.sample(min(N_TOTAL, len(to_label)), random_state=123).reset_index(drop=True)
to_label["label"] = ""

out_path = OUT_DIR / "to_label_personal_balanced_1000.csv"
to_label.to_csv(out_path, index=False, encoding="utf-8")

print("Saved:", out_path.resolve())
print("Rows :", len(to_label))
print("\nPer-sender counts (top 15):")
print(to_label["from_name"].value_counts().head(15))

to_label.head(10)

### 8) Preprocess the manually labeled dataset (`labeled_1000_samples_manually.csv`)

In [None]:
import pandas as pd
import numpy as np

# CSV that holds the manual labels (path relative to the working directory).
PATH = "../data/final/labeled_1000_samples_manually.csv"

df = pd.read_csv(PATH)

# Guarantee a "label" column so the normalization step always has input.
has_label_column = "label" in df.columns
if not has_label_column:
    df["label"] = ""

def normalize_label(x):
    """Collapse a raw label cell to ``"+"`` or ``"-"``.

    Args:
        x: Cell value from the ``label`` column (may be NaN/None or any
            scalar; it is inspected through ``str(x)``).

    Returns:
        ``"+"`` when the cell contains at least one ``+`` character;
        ``"-"`` in every other case, including missing, empty and
        whitespace-only cells.
    """
    if pd.isna(x):
        return "-"
    # Every character other than "+" is irrelevant to the outcome, so a
    # membership test on the raw text is enough.
    return "+" if "+" in str(x) else "-"

# Collapse every raw label to "+" or "-" and show the resulting counts.
df["label"] = df["label"].map(normalize_label)

label_counts = df["label"].value_counts(dropna=False)
print(label_counts)

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the labeled frame.
df.info()

In [None]:
# Columns kept for modeling: the cleaned text, two simple features and the
# target.
FEATURE_COLS = ["text_clean", "len_words", "is_question", "label"]

# Encode on a new frame (df itself is left untouched), then keep only the
# modeling columns.
encoded = df.assign(
    label=df["label"].map({"+": 1, "-": 0}),      # + / - => 1 / 0
    is_question=df["is_question"].astype(int),    # booleans => int
)
df_ml = encoded[FEATURE_COLS]

df_ml.info()
df_ml.head()

In [None]:
from pathlib import Path

# Output folder for the final datasets; created if it does not exist.
OUT_DIR = Path("../data/final")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Write the model-ready frame without the index column.
out_path = OUT_DIR.joinpath("labeled_1000_samples.csv")
df_ml.to_csv(out_path, index=False, encoding="utf-8")

print("Saved:", out_path.resolve())
df_ml.head()

### 9) Inspect the final `labeled_1000_samples` dataset

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the model-ready frame.
df_ml.info()