In [None]:
from bs4 import BeautifulSoup
import jsonlines

WINDOW_SIZE = 5 

def parse_html(path):
    with open(path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    messages = []
    for msg in soup.find_all("div", class_="body"):
       
        if msg.find("div", class_="from_name"):
            name_tag = msg.find("div", class_="from_name")

        text_tag = msg.find("div", class_="text")
        if not text_tag:
            continue

        sender = name_tag.get_text(strip=True)
        text = text_tag.get_text(" ", strip=True)
        if not text: 
            continue

        role = "Me" if sender == "YesImBatMan" else "user"
        messages.append({"role": role, "content": text})
    return messages

def build_dataset(messages, window_size=WINDOW_SIZE):
    dataset = []
    for i in range(len(messages)):
        if messages[i]["role"] == "Me":
            context = messages[max(0, i-window_size):i]
            if not any(m["role"]=="user" for m in context):
                continue
            dataset.append({
                "messages": context + [messages[i]]
            })
    return dataset

if __name__ == "__main__":
    msgs = parse_html("messages.html")
    dataset = build_dataset(msgs, WINDOW_SIZE)

    with jsonlines.open("dataset.jsonl", "w") as writer:
        for d in dataset:
            writer.write(d)

    print("Saved", len(dataset), "samples in dataset.jsonl")


Saved 63 samples in dataset.jsonl


In [52]:
from bs4 import BeautifulSoup
import jsonlines
import re

WINDOW_SIZE = 5
ASSISTANT_NAME = "YesImBatMan"  # اسم تو در تلگرام

def parse_html(path):
    with open(path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    messages = []
    last_sender = None

    for m in soup.find_all("div", class_="message"):
        if "service" in m.get("class", []):
            continue

        body = m.find("div", class_="body")
        if not body:
            continue

        mid_raw = m.get("id", "")
        mid = int(re.sub(r"\D", "", mid_raw)) if mid_raw else None

        r = body.find("div", class_="reply_to")
        reply_to = None
        if r and r.find("a") and r.find("a").get("href"):
            href = r.find("a").get("href")
            ref = re.sub(r"\D", "", href or "")
            reply_to = int(ref) if ref else None

        name_tag = body.find("div", class_="from_name")
        if name_tag:
            last_sender = name_tag.get_text(strip=True)
        sender = last_sender or ""

        text_tag = body.find("div", class_="text")
        if not text_tag:
            continue
        text = text_tag.get_text(" ", strip=True)
        if not text:
            continue

        role = "assistant" if sender == ASSISTANT_NAME else "user"
        messages.append({
            "id": mid,
            "role": role,
            "sender": sender,
            "content": text,
            "reply_to": reply_to
        })

    return messages

def to_simple(m):
    return {"role": m["role"], "content": m["content"]}

def trim_leading_assistants(context):
    k = 0
    while k < len(context) and context[k]["role"] == "assistant":
        k += 1
    return context[k:]

def build_dataset(messages, window_size=WINDOW_SIZE, inject_style="raw", add_marker=True):
    """
    inject_style:
      - 'raw'   : اگر پیام مرجع خارج پنجره باشد، خودش را به ابتدای کانتکست تزریق می‌کند
      - 'quote' : اگر پیام مرجع خارج پنجره باشد، نسخه‌ی نقل‌قول‌شده‌اش را تزریق می‌کند
      - None    : هیچ تزریقی انجام نمی‌دهد
    add_marker:
      - True  : همیشه قبل از پاسخ یک مارکر ⟪REPLY_TO ...⟫ قرار می‌دهد (چه مرجع داخل پنجره باشد چه بیرون)
      - False : مارکر اضافه نمی‌شود
    """
    idx_by_id = {m["id"]: i for i, m in enumerate(messages) if m["id"] is not None}
    dataset = []

    for idx, msg in enumerate(messages):
        if msg["role"] != "assistant":
            continue

        start = max(0, idx - window_size)
        ctx_src = messages[start:idx]
        context = [to_simple(x) for x in ctx_src]

        # اگر پاسخ به پیام خاصی است:
        ref_id = msg.get("reply_to")
        replied = None
        if ref_id and ref_id in idx_by_id:
            j = idx_by_id[ref_id]
            replied = messages[j]

            # اگر مرجع خارج از پنجره است، طبق inject_style تزریقش کن
            if not (start <= j < idx):
                if inject_style == "raw":
                    context = [to_simple(replied)] + context
                elif inject_style == "quote":
                    q_sender = "من" if replied["role"] == "assistant" else (replied.get("sender") or "user")
                    q_text = f"> {q_sender}: {replied['content']}"
                    context = [{"role": "user", "content": q_text}] + context

        # مارکر REPLY_TO را همیشه قبل از پاسخ بگذار (اگر add_marker=True و replied موجود است)
        if add_marker and replied is not None:
            sender_label = "assistant" if replied["role"] == "assistant" else ("user")
            marker = {
                "role": "user",
                "content": f"⟪REPLY_TO {sender_label}⟫ {replied['content']} ⟪/REPLY_TO⟫"
            }
            context = context + [marker]

        # تمیزکاری: با user شروع شود و قبل از پاسخ حداقل یک user باشد
        context = trim_leading_assistants(context)
        if not context or not any(m["role"] == "user" for m in context):
            continue

        sample = {"messages": context + [to_simple(msg)]}
        dataset.append(sample)

    return dataset

if __name__ == "__main__":
    msgs = parse_html("messages.html")
    dataset = build_dataset(msgs, WINDOW_SIZE, inject_style="raw", add_marker=True)

    with jsonlines.open("dataset.jsonl", "w") as w:
        for d in dataset:
            w.write(d)

    print("Saved", len(dataset), "samples in dataset.jsonl")


Saved 63 samples in dataset.jsonl


In [54]:
import re
import json
import jsonlines

IN_PATH  = "dataset.jsonl"
OUT_PATH = "dataset_links_masked.jsonl"

# URLهای با پروتکل (http/https/ftp و هر scheme مثل vmess://, vless://, trojan://, ss:// و ...)
PROTO_URL_RE = re.compile(
    r"(?i)\b(?:[a-z][a-z0-9+.\-]*://)[^\s<>()]+"
)

# آدرس‌های بدون پروتکل که با www شروع می‌شوند
WWW_URL_RE = re.compile(
    r"(?i)\bwww\.[^\s<>()]+\.[a-z]{2,}(?:/[^\s<>()]*)?"
)

def mask_links(text: str) -> str:
    if not isinstance(text, str):
        return text
    t = PROTO_URL_RE.sub("[LINK]", text)
    t = WWW_URL_RE.sub("[LINK]", t)
    # کمی تمیزکاری فاصله‌ها
    t = re.sub(r"\s{2,}", " ", t).strip()
    return t

with jsonlines.open(OUT_PATH, "w") as w, open(IN_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        msgs = obj.get("messages", [])
        for m in msgs:
            if "content" in m:
                m["content"] = mask_links(m["content"])
        w.write({"messages": msgs})

print("Masked links →", OUT_PATH)


Masked links → dataset_links_masked.jsonl


In [1]:
# build_dataset_clean.py
from bs4 import BeautifulSoup
import jsonlines
import re
from typing import List, Dict, Optional

# ------------------------ تنظیمات ------------------------
WINDOW_SIZE = 5                 # چند پیام قبل از پاسخ را بیاور
ASSISTANT_NAME = "YesImBatMan"  # اسم اکانت خودت در تلگرام
INPUT_HTML = "messages.html"    # خروجی HTML تلگرام
OUTPUT_JSONL = "dataset.jsonl"  # خروجی دیتاست

# اگر خواستی لینک‌های تلگرام/URL را جایگزین کنی:
LINK_PLACEHOLDER = "«لینک»"     # None بگذار تا اصلاً جایگزینی نشود
# --------------------------------------------------------


def clean_text(s: str) -> str:
    if not s:
        return ""
    # حذف بلوک‌های REPLY_TO قدیمی اگر در متن آمده باشد
    s = re.sub(r"⟪REPLY_TO.*?⟫", "", s, flags=re.DOTALL)

    # جایگزینی [LINK] یا URLها (دلخواه)
    if LINK_PLACEHOLDER is not None:
        s = s.replace("[LINK]", LINK_PLACEHOLDER)
        # URL ساده:
        s = re.sub(r"https?://\S+", LINK_PLACEHOLDER, s)

    # فاصله‌ها و \n های اضافی
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()


def parse_html(path: str) -> List[Dict]:
    with open(path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    messages = []
    last_sender = None

    for m in soup.find_all("div", class_="message"):
        if "service" in m.get("class", []):
            continue

        body = m.find("div", class_="body")
        if not body:
            continue

        # id پیام
        mid_raw = m.get("id", "")
        mid = int(re.sub(r"\D", "", mid_raw)) if mid_raw else None

        # reply_to
        r = body.find("div", class_="reply_to")
        reply_to = None
        if r and r.find("a") and r.find("a").get("href"):
            href = r.find("a").get("href")
            ref = re.sub(r"\D", "", href or "")
            reply_to = int(ref) if ref else None

        # sender
        name_tag = body.find("div", class_="from_name")
        if name_tag:
            last_sender = name_tag.get_text(strip=True)
        sender = last_sender or ""

        # متن
        text_tag = body.find("div", class_="text")
        if not text_tag:
            continue
        raw = text_tag.get_text("\n", strip=True)  # \n برای پاراگراف‌های چندبخشی
        content = clean_text(raw)
        if not content:
            continue

        role = "assistant" if sender == ASSISTANT_NAME else "user"
        messages.append({
            "id": mid,
            "role": role,
            "sender": sender,
            "content": content,
            "reply_to": reply_to
        })

    return messages


def to_simple(m: Dict) -> Dict:
    return {"role": m["role"], "content": m["content"]}


def merge_same_role(seq: List[Dict]) -> List[Dict]:
    """ادغام پیام‌های پشت‌سرهم با نقش یکسان (content ها با \n چسبانده می‌شوند)."""
    out = []
    for m in seq:
        if not out:
            out.append(dict(m))
        elif out[-1]["role"] == m["role"]:
            out[-1]["content"] += "\n" + m["content"]
        else:
            out.append(dict(m))
    return out


def enforce_alternation(seq: List[Dict]) -> Optional[List[Dict]]:
    """یکی‌درمیون کردن نقش‌ها؛ با user شروع و با assistant پایان.
       اگر ممکن نبود، None برمی‌گرداند."""
    if not seq:
        return None

    # اگر با assistant شروع شده → یک user مینیمال جلویش بگذاریم
    if seq[0]["role"] == "assistant":
        seq = [{"role": "user", "content": "…"}] + seq

    # اگر با system شروع شده، ساده‌ترین: به user تبدیل کنیم (یا می‌شود حذف کرد)
    if seq[0]["role"] == "system":
        seq[0] = {"role": "user", "content": seq[0].get("content", "") or "…"}

    # ادغام مجدد
    seq = merge_same_role(seq)

    # اگر آخرین پیام user بود، حذفش کن (بدون پاسخ مانده)
    if seq and seq[-1]["role"] == "user":
        seq = seq[:-1]

    # حالا باید یکی‌درمیون باشد
    if not seq or len(seq) < 2:
        return None
    for i in range(1, len(seq)):
        if seq[i]["role"] == seq[i-1]["role"]:
            # اگر تکراری شد، ادغام و دوباره چک
            seq = merge_same_role(seq)
            # دوباره بررسی:
            ok = True
            for j in range(1, len(seq)):
                if seq[j]["role"] == seq[j-1]["role"]:
                    ok = False
                    break
            if not ok:
                return None
            break

    # باید با user شروع و با assistant پایان یابد
    if seq[0]["role"] != "user" or seq[-1]["role"] != "assistant":
        return None

    # حذف پیام‌های خالی
    seq = [m for m in seq if m["content"].strip()]
    return seq if len(seq) >= 2 else None


def build_dataset(messages: List[Dict], window_size: int = WINDOW_SIZE) -> List[Dict]:
    """
    برای هر پیام assistant یک نمونه می‌سازیم:
      context (حداکثر window_size پیام قبل) + همان پاسخ assistant
    اگر reply_to به پیامی خارج از پنجره اشاره داشت، یک «نقل‌قول کوتاه»
    به‌عنوان اولین پیام user داخل context اضافه می‌کنیم (بدون مارکر خاص).
    """
    # ایندکس سریع برای دسترسی به پیام‌ها با id
    idx_by_id = {m["id"]: i for i, m in enumerate(messages) if m["id"] is not None}
    out = []

    for idx, msg in enumerate(messages):
        if msg["role"] != "assistant":
            continue

        # کانتکست خام
        start = max(0, idx - window_size)
        context = [to_simple(x) for x in messages[start:idx]]

        # اگر reply_to دارد و بیرون از پنجره است، یک نقل‌قول کوتاه اضافه کن
        ref_id = msg.get("reply_to")
        if ref_id and ref_id in idx_by_id:
            j = idx_by_id[ref_id]
            if not (start <= j < idx):
                ref = messages[j]
                quote = f"> {('من' if ref['role']=='assistant' else ref.get('sender') or 'user')}: {ref['content']}"
                context = [{"role": "user", "content": quote}] + context

        # تمیزکاری: ادغام نقش‌های پشت‌سرهم و enforce alternation
        context = merge_same_role(context)
        sample_seq = context + [to_simple(msg)]
        sample_seq = enforce_alternation(sample_seq)
        if sample_seq is None:
            continue

        out.append({"messages": sample_seq})

    return out


if __name__ == "__main__":
    msgs = parse_html(INPUT_HTML)
    dataset = build_dataset(msgs, WINDOW_SIZE)

    with jsonlines.open(OUTPUT_JSONL, "w") as w:
        for d in dataset:
            w.write(d)

    print(f"Saved {len(dataset)} samples -> {OUTPUT_JSONL}")


Saved 66 samples -> dataset.jsonl


In [1]:
from bs4 import BeautifulSoup
import jsonlines, re, glob
from typing import List, Dict, Optional

# -------- تنظیمات --------
WINDOW_SIZE = 5
ASSISTANT_NAME = "YesImBatMan"
INPUT_HTML = "messages.html"
OUTPUT_JSONL = "dataset.jsonl"
LINK_PLACEHOLDER = "«لینک»"   # اگر نمی‌خوای لینک‌ها عوض بشن = None
# -------------------------

def clean_text(s: str) -> str:
    if not s: return ""
    s = re.sub(r"⟪REPLY_TO.*?⟫", "", s, flags=re.DOTALL)
    if LINK_PLACEHOLDER is not None:
        s = s.replace("[LINK]", LINK_PLACEHOLDER)
        s = re.sub(r"https?://\S+", LINK_PLACEHOLDER, s)
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

def parse_html(path: str) -> List[Dict]:
    with open(path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    messages, last_sender = [], None
    for m in soup.find_all("div", class_="message"):
        if "service" in m.get("class", []): 
            continue
        body = m.find("div", class_="body")
        if not body: 
            continue

        mid_raw = m.get("id", "")
        mid = int(re.sub(r"\D", "", mid_raw)) if mid_raw else None

        r = body.find("div", class_="reply_to")
        reply_to = None
        if r and r.find("a") and r.find("a").get("href"):
            href = r.find("a").get("href") or ""
            ref = re.sub(r"\D", "", href)
            reply_to = int(ref) if ref else None

        name_tag = body.find("div", class_="from_name")
        if name_tag: 
            last_sender = name_tag.get_text(strip=True)
        sender = last_sender or ""

        text_tag = body.find("div", class_="text")
        if not text_tag: 
            continue
        content = clean_text(text_tag.get_text("\n", strip=True))
        if not content: 
            continue

        role = "assistant" if sender == ASSISTANT_NAME else "user"
        messages.append({"id": mid, "role": role, "sender": sender, "content": content, "reply_to": reply_to})
    return messages

def to_simple(m: Dict) -> Dict:
    return {"role": m["role"], "content": m["content"]}

def merge_same_role(seq: List[Dict]) -> List[Dict]:
    out = []
    for m in seq:
        if not out or out[-1]["role"] != m["role"]:
            out.append({"role": m["role"], "content": m["content"]})
        else:
            out[-1]["content"] += "\n" + m["content"]
    return out

def enforce_user_first_and_alternate(seq: List[Dict]) -> Optional[List[Dict]]:
    if not seq: 
        return None
    # نباید با assistant شروع شود و هیچ پیام ساختگی هم اضافه نمی‌کنیم
    if seq[0]["role"] != "user": 
        return None
    seq = merge_same_role(seq)
    # اگر با user تمام شد، آن نمونه را کنار بگذاریم (بدون پاسخ)
    if seq[-1]["role"] != "assistant":
        return None
    # یکی‌درمیان بودن نقش‌ها
    for i in range(1, len(seq)):
        if seq[i]["role"] == seq[i-1]["role"]:
            return None
    # محتوای خالی حذف
    seq = [m for m in seq if m["content"].strip()]
    return seq if len(seq) >= 2 else None

def build_dataset(messages: List[Dict], window_size: int = WINDOW_SIZE) -> List[Dict]:
    idx_by_id = {m["id"]: i for i, m in enumerate(messages) if m["id"] is not None}
    out = []

    for idx, msg in enumerate(messages):
        if msg["role"] != "assistant":
            continue

        # کانتکست پایه
        start = max(0, idx - window_size)

        # اگر کانتکست با assistant شروع می‌شود، کمی عقب‌گرد تا به user برسیم
        back = start
        while back > 0 and messages[back]["role"] == "assistant":
            back -= 1
        start = back

        context = [to_simple(x) for x in messages[start:idx]]

        # اگر reply_to بیرون پنجره است و «متن مرجع» وجود دارد، یک نقل‌قول کوتاهِ user اولِ کانتکست درج کن
        ref_id = msg.get("reply_to")
        if ref_id and ref_id in idx_by_id:
            j = idx_by_id[ref_id]
            if not (start <= j < idx):
                ref = messages[j]
                quote_user = "من" if ref["role"] == "assistant" else (ref.get("sender") or "user")
                quote = f"> {quote_user}: {ref['content']}"
                context = [{"role": "user", "content": quote}] + context

        # ادغام و قوانین
        context = merge_same_role(context)
        seq = context + [to_simple(msg)]
        seq = enforce_user_first_and_alternate(seq)
        if seq is None:
            continue

        out.append({"messages": seq})

    return out

# اجرا برای تمام فایل‌های HTML در این پوشه
html_files = sorted(glob.glob("*.html"))

total = 0
with jsonlines.open(OUTPUT_JSONL, "w") as w:
    for fp in html_files:
        msgs = parse_html(fp)
        dataset = build_dataset(msgs, WINDOW_SIZE)
        for d in dataset:
            w.write(d)
        print(f"{fp}: {len(dataset)} samples")
        total += len(dataset)

print(f"Saved {total} samples -> {OUTPUT_JSONL}")

messages (2).html: 626 samples
messages.html: 66 samples
messages2.html: 568 samples
messages2.html: 568 samples
messages3.html: 578 samples
messages3.html: 578 samples
messages4.html: 564 samples
messages4.html: 564 samples
messages5.html: 374 samples
Saved 2776 samples -> dataset.jsonl
messages5.html: 374 samples
Saved 2776 samples -> dataset.jsonl


In [42]:
from bs4 import BeautifulSoup
import jsonlines, re, glob
from typing import List, Dict, Optional

# -------- تنظیمات --------
WINDOW_SIZE = 15
ASSISTANT_NAME = "Omid"
INPUT_HTML = "messages.html"
OUTPUT_JSONL = "dataset.jsonl"
LINK_PLACEHOLDER = "«لینک»"   # اگر نمی‌خوای لینک‌ها عوض بشن = None
# -------------------------

def clean_text(s: str) -> str:
    if not s: return ""
    s = re.sub(r"⟪REPLY_TO.*?⟫", "", s, flags=re.DOTALL)
    if LINK_PLACEHOLDER is not None:
        s = s.replace("[LINK]", LINK_PLACEHOLDER)
        s = re.sub(r"https?://\S+", LINK_PLACEHOLDER, s)
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

def parse_html(path: str) -> List[Dict]:
    with open(path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    messages, last_sender = [], None
    for m in soup.find_all("div", class_="message"):
        if "service" in m.get("class", []): 
            continue
        body = m.find("div", class_="body")
        if not body: 
            continue

        mid_raw = m.get("id", "")
        mid = int(re.sub(r"\D", "", mid_raw)) if mid_raw else None

        r = body.find("div", class_="reply_to")
        reply_to = None
        if r and r.find("a") and r.find("a").get("href"):
            href = r.find("a").get("href") or ""
            ref = re.sub(r"\D", "", href)
            reply_to = int(ref) if ref else None

        name_tag = body.find("div", class_="from_name")
        if name_tag: 
            last_sender = name_tag.get_text(strip=True)
        sender = last_sender or ""

        text_tag = body.find("div", class_="text")
        if not text_tag: 
            continue
        content = clean_text(text_tag.get_text("\n", strip=True))
        if not content: 
            continue

        role = "assistant" if sender == ASSISTANT_NAME else "user"
        messages.append({"id": mid, "role": role, "sender": sender, "content": content, "reply_to": reply_to})
    return messages

def to_simple(m: Dict) -> Dict:
    return {"role": m["role"], "content": m["content"]}

def merge_same_role(seq: List[Dict]) -> List[Dict]:
    out = []
    for m in seq:
        if not out or out[-1]["role"] != m["role"]:
            out.append({"role": m["role"], "content": m["content"]})
        else:
            out[-1]["content"] += "\n" + m["content"]
    return out

def enforce_user_first_and_alternate(seq: List[Dict]) -> Optional[List[Dict]]:
    if not seq: 
        return None
    # نباید با assistant شروع شود و هیچ پیام ساختگی هم اضافه نمی‌کنیم
    if seq[0]["role"] != "user": 
        return None
    seq = merge_same_role(seq)
    # اگر با user تمام شد، آن نمونه را کنار بگذاریم (بدون پاسخ)
    if seq[-1]["role"] != "assistant":
        return None
    # یکی‌درمیان بودن نقش‌ها
    for i in range(1, len(seq)):
        if seq[i]["role"] == seq[i-1]["role"]:
            return None
    # محتوای خالی حذف
    seq = [m for m in seq if m["content"].strip()]
    return seq if len(seq) >= 2 else None

def get_content(message):
    l = ""
    for i in message:
        if i["sender"] != ASSISTANT_NAME:
            l += (str("دوستم:" + i["content"]) + "\n")
        else:
            l += (str("من:" + i["content"]) + "\n")
    return l[:-1]


def build_dataset(messages: List[Dict], window_size: int = WINDOW_SIZE) -> List[Dict]:
    out = []
    #print(messages)
    for idx, msg in enumerate(messages):
        if msg["role"] != "assistant":
            continue

        
        out.append({"history":get_content(messages[max(0, idx - window_size):idx]), "response": msg["content"]})
        if idx==20:
            break
    return out

# اجرا برای تمام فایل‌های HTML در این پوشه
html_files = sorted(glob.glob("*.html"))

total = 0
with jsonlines.open(OUTPUT_JSONL, "w") as w:
    for fp in html_files:
        msgs = parse_html(fp)
        dataset = build_dataset(msgs, WINDOW_SIZE)
        for d in dataset:
            w.write(d)
        print(f"{fp}: {len(dataset)} samples")
        total += len(dataset)

print(f"Saved {total} samples -> {OUTPUT_JSONL}")

messages.html: 551 samples
messages10.html: 6 samples
messages10.html: 6 samples
messages100.html: 15 samples
messages100.html: 15 samples
messages101.html: 445 samples
messages101.html: 445 samples
messages102.html: 471 samples
messages102.html: 471 samples
messages103.html: 10 samples
messages103.html: 10 samples
messages104.html: 437 samples
messages104.html: 437 samples
messages105.html: 526 samples
messages105.html: 526 samples
messages106.html: 11 samples
messages106.html: 11 samples
messages107.html: 519 samples
messages107.html: 519 samples
messages108.html: 13 samples
messages108.html: 13 samples
messages109.html: 441 samples
messages109.html: 441 samples
messages11.html: 10 samples
messages11.html: 10 samples
messages110.html: 460 samples
messages110.html: 460 samples
messages111.html: 568 samples
messages111.html: 568 samples
messages112.html: 526 samples
messages112.html: 526 samples
messages113.html: 498 samples
messages113.html: 498 samples
messages114.html: 517 samples
m

In [2]:
# Install required packages for standard fine-tuning
%pip install transformers datasets accelerate peft trl torch
%pip install wandb

# Modules for fine-tuning
import torch # Import PyTorch
from transformers import (
	AutoModelForCausalLM, 
	AutoTokenizer, 
	TrainingArguments,
	Trainer,
	DataCollatorForLanguageModeling
)
from trl import SFTTrainer # Trainer for supervised fine-tuning (SFT)
from peft import LoraConfig, get_peft_model, TaskType
# Hugging Face modules
from huggingface_hub import login # Lets you login to API
from datasets import load_dataset # Lets you load fine-tuning datasets
# Import weights and biases
import wandb

# Check if CUDA is available
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
	print(f"GPU: {torch.cuda.get_device_name()}")
else:
	print("Running on CPU - fine-tuning will be slower")

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm
W0831 02:01:41.433000 25248 torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


CUDA available: False
Running on CPU - fine-tuning will be slower
