In [None]:
from bs4 import BeautifulSoup
import jsonlines

# Default number of preceding messages taken as context for each sample.
WINDOW_SIZE = 5 

def parse_html(path, assistant_name="YesImBatMan"):
    """Extract chat messages from an exported HTML chat log.

    Every ``<div class="body">`` containing a non-empty ``<div class="text">``
    becomes one ``{"role": ..., "content": ...}`` dict. A body that has no
    ``from_name`` div of its own is attributed to the most recent sender seen
    earlier in the document.

    Args:
        path: Path of the UTF-8 encoded HTML file.
        assistant_name: Sender name whose messages get the role ``"Me"``.
            Every other sender, including an unknown one, gets ``"user"``.

    Returns:
        List of message dicts in document order.
    """
    with open(path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    messages = []
    # Sender carried over to bodies without a from_name div. Starting from ""
    # (instead of leaving the variable unbound) keeps a name-less first body
    # from crashing; such a message is simply labelled "user".
    last_sender = ""
    for body in soup.find_all("div", class_="body"):
        name_tag = body.find("div", class_="from_name")
        if name_tag is not None:
            last_sender = name_tag.get_text(strip=True)

        text_tag = body.find("div", class_="text")
        if text_tag is None:
            continue

        text = text_tag.get_text(" ", strip=True)
        if not text:
            continue

        role = "Me" if last_sender == assistant_name else "user"
        messages.append({"role": role, "content": text})
    return messages

def build_dataset(messages, window_size=WINDOW_SIZE):
    """Turn a message list into context/reply training samples.

    For every message whose role is ``"Me"``, the up-to-``window_size``
    messages right before it form the context. The sample is kept only when
    that context contains at least one ``"user"`` message.

    Args:
        messages: Sequence of ``{"role", "content"}`` dicts in chat order.
        window_size: Maximum number of preceding messages used as context.

    Returns:
        List of ``{"messages": context + [reply]}`` dicts.
    """
    samples = []
    for pos, reply in enumerate(messages):
        if reply["role"] != "Me":
            continue
        history = messages[max(0, pos - window_size):pos]
        # A reply with no user turn in its window has nothing to answer.
        if "user" in (turn["role"] for turn in history):
            samples.append({"messages": history + [reply]})
    return samples

if __name__ == "__main__":
    # Parse the export, window it into samples and dump them as JSON Lines.
    msgs = parse_html("messages.html")
    dataset = build_dataset(msgs, window_size=WINDOW_SIZE)

    with jsonlines.open("dataset.jsonl", "w") as sink:
        sink.write_all(dataset)

    print("Saved", len(dataset), "samples in dataset.jsonl")


Saved 63 samples in dataset.jsonl


In [52]:
from bs4 import BeautifulSoup
import jsonlines
import re

# Default number of preceding messages taken as context for each sample.
WINDOW_SIZE = 5
ASSISTANT_NAME = "YesImBatMan"  # your display name in Telegram

def _last_int(value):
    """Return the last run of decimal digits in ``value`` as an int, or None."""
    runs = re.findall(r"\d+", value or "")
    return int(runs[-1]) if runs else None


def parse_html(path):
    """Parse an exported HTML chat log into a list of message dicts.

    Each ``<div class="message">`` that is not a service message and has a
    ``body`` div with non-empty ``text`` yields a dict with the keys:

    * ``id``       -- number taken from the element's ``id`` attribute, or None
    * ``role``     -- ``"assistant"`` if the sender equals ``ASSISTANT_NAME``,
                      otherwise ``"user"``
    * ``sender``   -- sender name; inherited from the previous message when the
                      body has no ``from_name`` div ("" if none was seen yet)
    * ``content``  -- message text, pieces joined by single spaces
    * ``reply_to`` -- number taken from the reply link's ``href``, or None

    Args:
        path: Path of the UTF-8 encoded HTML file.

    Returns:
        List of message dicts in document order.
    """
    with open(path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    messages = []
    last_sender = None

    for m in soup.find_all("div", class_="message"):
        if "service" in m.get("class", []):
            continue

        body = m.find("div", class_="body")
        if body is None:
            continue

        # Only the last digit run is used. An id without any digits gives None
        # instead of raising ValueError from int("").
        mid = _last_int(m.get("id", ""))

        # Concatenating every digit of the href would merge unrelated numbers:
        # "messages2.html#go_to_message10" would become 210 instead of 10.
        # NOTE(review): assumes the message number is the last number in the
        # href -- confirm against the export format.
        reply_to = None
        reply_box = body.find("div", class_="reply_to")
        link = reply_box.find("a") if reply_box is not None else None
        if link is not None:
            reply_to = _last_int(link.get("href"))

        # Bodies without a from_name div belong to the previous sender.
        name_tag = body.find("div", class_="from_name")
        if name_tag is not None:
            last_sender = name_tag.get_text(strip=True)
        sender = last_sender or ""

        text_tag = body.find("div", class_="text")
        if text_tag is None:
            continue
        text = text_tag.get_text(" ", strip=True)
        if not text:
            continue

        role = "assistant" if sender == ASSISTANT_NAME else "user"
        messages.append({
            "id": mid,
            "role": role,
            "sender": sender,
            "content": text,
            "reply_to": reply_to,
        })

    return messages

def to_simple(m):
    """Reduce a parsed message to just its ``role`` and ``content`` fields."""
    role, content = m["role"], m["content"]
    return dict(role=role, content=content)

def trim_leading_assistants(context):
    """Return ``context`` without its leading run of assistant messages.

    The result is a slice starting at the first entry whose role is not
    ``"assistant"``; it is empty when every entry is an assistant message.
    """
    for pos, entry in enumerate(context):
        if entry["role"] != "assistant":
            return context[pos:]
    # Nothing but assistant turns (or nothing at all): an empty slice.
    return context[len(context):]

def build_dataset(messages, window_size=WINDOW_SIZE, inject_style="raw", add_marker=True):
    """Build one chat sample per assistant message.

    The context of a sample is the up-to-``window_size`` messages right before
    the assistant message, reduced to ``role``/``content``.

    inject_style:
      - 'raw'   : if the replied-to message lies outside the window, the
                  message itself is inserted at the start of the context
      - 'quote' : if the replied-to message lies outside the window, a quoted
                  version of it is inserted at the start of the context
      - None    : nothing is injected
    add_marker:
      - True  : a ⟪REPLY_TO ...⟫ marker is always placed right before the
                reply (whether the replied-to message is inside the window or
                not)
      - False : no marker is added

    Leading assistant turns are stripped after injection (so a raw-injected
    assistant message is dropped again), and a sample is kept only if a user
    turn remains in the context.

    Returns:
        List of ``{"messages": [...]}`` dicts, the assistant reply last.
    """
    # Position of every message that has an id, for resolving reply targets.
    position_of = {}
    for pos, entry in enumerate(messages):
        if entry["id"] is not None:
            position_of[entry["id"]] = pos

    samples = []
    for here, answer in enumerate(messages):
        if answer["role"] != "assistant":
            continue

        window_start = max(0, here - window_size)
        history = list(map(to_simple, messages[window_start:here]))

        # Resolve the message this answer replies to, if any.
        target = None
        target_id = answer.get("reply_to")
        if target_id and target_id in position_of:
            target_pos = position_of[target_id]
            target = messages[target_pos]

            # A target outside the window is injected according to inject_style.
            outside_window = not (window_start <= target_pos < here)
            if outside_window and inject_style == "raw":
                history.insert(0, to_simple(target))
            elif outside_window and inject_style == "quote":
                if target["role"] == "assistant":
                    quoted_from = "من"
                else:
                    quoted_from = target.get("sender") or "user"
                history.insert(
                    0,
                    {"role": "user", "content": f"> {quoted_from}: {target['content']}"},
                )

        # The REPLY_TO marker goes directly before the answer whenever a
        # target was resolved and markers are enabled.
        if add_marker and target is not None:
            label = "assistant" if target["role"] == "assistant" else "user"
            history.append({
                "role": "user",
                "content": f"⟪REPLY_TO {label}⟫ {target['content']} ⟪/REPLY_TO⟫",
            })

        # Cleanup: the context must start with a user turn and contain one.
        history = trim_leading_assistants(history)
        if any(turn["role"] == "user" for turn in history):
            samples.append({"messages": history + [to_simple(answer)]})

    return samples

if __name__ == "__main__":
    # Parse the export, build reply-aware samples and dump them as JSON Lines.
    msgs = parse_html("messages.html")
    dataset = build_dataset(
        msgs,
        window_size=WINDOW_SIZE,
        inject_style="raw",
        add_marker=True,
    )

    with jsonlines.open("dataset.jsonl", "w") as sink:
        sink.write_all(dataset)

    print("Saved", len(dataset), "samples in dataset.jsonl")


Saved 63 samples in dataset.jsonl
