In [1]:
import os, platform, pathlib, torch, random, numpy as np

# --- Windows-safe env flags ---
# Set in the first cell, i.e. ahead of the later `datasets` / `unsloth` import cells.
if platform.system() == "Windows":
    os.environ["HF_DATASETS_DISABLE_MULTIPROCESSING"] = "1"  # no subprocesses in datasets
    os.environ["TOKENIZERS_PARALLELISM"] = "false"           # no tokenizer parallelism
    os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"              # disable torch.compile in Unsloth
    # Short cache paths for inductor / triton. Each directory is created from the
    # same literal that is exported, so the env var and the folder cannot drift apart.
    for _cache_var, _cache_dir in (
        ("TORCHINDUCTOR_CACHE_DIR", r"C:\ti_cache"),
        ("TRITON_CACHE_DIR", r"C:\triton_cache"),
    ):
        os.environ[_cache_var] = _cache_dir
        pathlib.Path(_cache_dir).mkdir(parents=True, exist_ok=True)

# Paths (we'll pick the first that exists)
CANDIDATE_CSVS = [
    "/mnt/data/train_raw.csv",
    "../datasets/train_raw.csv",
    "train_raw.csv",
]
DATA_CSV = next((p for p in CANDIDATE_CSVS if pathlib.Path(p).exists()), CANDIDATE_CSVS[-1])
if not pathlib.Path(DATA_CSV).exists():
    # Only a warning: the CSV is not needed when the JSONL files were already generated.
    print("WARNING: none of CANDIDATE_CSVS exists; falling back to", DATA_CSV)

# Output
OUTPUT_DIR = "gptoss20b_ner_bio_unsloth"
TRAIN_JSONL = "train.jsonl"
VAL_JSONL   = "val.jsonl"

# Training knobs
VAL_FRAC = 0.10     # fraction of valid rows held out for validation
MAX_LEN  = 1024     # if you hit OOM, lower to 768/512
RANK     = 8        # LoRA rank
PACKING  = False    # on Windows start with False
GRAD_ACC = 16       # gradient accumulation steps
FORCE_REGEN = False # True -> rebuild the JSONL files even if they exist

# Seed every RNG the notebook touches.
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("DATA_CSV =", DATA_CSV)
# Guard the capability query the same way the CUDA seeding above is guarded,
# so this cell also runs on a machine without a usable GPU.
print("bf16 supported:", torch.cuda.is_available() and torch.cuda.is_bf16_supported())



DATA_CSV = ../datasets/train_raw.csv
bf16 supported: True


In [2]:
import ast, json, math
import pandas as pd
from typing import List, Tuple

def parse_ann_cell(cell) -> List[Tuple[int,int,str]]:
    """Turn one `annotation` cell into a list of (start, end, tag) triples.

    A list/tuple is used as-is; anything else is converted to `str` and parsed
    with `ast.literal_eval` (literals only, no code execution).

    Raises:
        ValueError: if an element is not a list/tuple of exactly three items.
        Any exception raised by `ast.literal_eval` or `int()` propagates unchanged.
    """
    raw = cell if isinstance(cell, (list, tuple)) else ast.literal_eval(str(cell))

    def _as_triple(item):
        # Each annotation entry must be a 3-element sequence: start, end, tag.
        if not isinstance(item, (list, tuple)) or len(item) != 3:
            raise ValueError(f"Not a 3-tuple: {item}")
        start, end, label = item
        return (int(start), int(end), str(label))

    return [_as_triple(item) for item in raw]

def word_spans_exact_spaces(text: str) -> List[Tuple[int,int]]:
    """Return half-open [start, end) character spans of the words in `text`.

    Only the plain space character ' ' separates words; tabs, newlines and
    other whitespace stay inside a word. Runs of spaces produce no empty spans.
    """
    found = []
    word_start = None  # index where the current word began, or None between words
    for pos, ch in enumerate(text):
        if ch == ' ':
            if word_start is not None:
                found.append((word_start, pos))
                word_start = None
        elif word_start is None:
            word_start = pos
    # A word that runs to the end of the string is closed here.
    if word_start is not None:
        found.append((word_start, len(text)))
    return found

def make_jsonl_if_needed(csv_path: str, train_path: str, val_path: str, val_frac: float, force: bool=False,
                         labels_path: str="entity_labels.json"):
    """Build the chat-format train/val JSONL files (and the label list) from the raw CSV.

    Args:
        csv_path: CSV with `sample` and `annotation` columns (delimiter is sniffed).
        train_path: output JSONL for the training split.
        val_path: output JSONL for the validation split.
        val_frac: fraction of the valid rows to hold out for validation.
        force: rebuild even when all outputs already exist.
        labels_path: JSON file receiving the sorted entity label names.

    Rows are kept only if the annotation parses and its (start, end) pairs match
    the space-delimited word spans of the sample exactly, one per word.

    Raises:
        KeyError: if a required column is missing from the CSV.
        RuntimeError: if no row survives validation.
    """
    # The inference cell reads the labels file, so it counts as a required output:
    # skipping while it is missing would leave the notebook unable to predict.
    outputs = [pathlib.Path(p) for p in (train_path, val_path, labels_path)]
    if (not force) and all(p.exists() for p in outputs):
        print("JSONL already exist; skipping regen. Set FORCE_REGEN=True to rebuild.")
        return

    print("Reading CSV:", csv_path)
    df = pd.read_csv(csv_path, sep=None, engine="python")
    # Normalise header names BEFORE selecting columns; otherwise a padded header
    # such as " annotation" makes dropna(subset=...) fail.
    df.columns = df.columns.str.strip()
    missing_cols = {"sample", "annotation"} - set(df.columns)
    if missing_cols:
        raise KeyError(f"CSV is missing required column(s): {sorted(missing_cols)}")
    df = df.dropna(subset=["sample","annotation"]).reset_index(drop=True)

    rows = []
    bad_parse = 0
    bad_len_or_span = 0

    for sample, annotation in zip(df["sample"], df["annotation"]):
        text = str(sample)
        try:
            ann = parse_ann_cell(annotation)
        except Exception:
            # Deliberate best-effort: unparsable rows are counted and dropped.
            bad_parse += 1
            continue

        spans = word_spans_exact_spaces(text)
        aligned = len(ann) == len(spans) and all(
            (s == ws and e == we) for (s, e, _), (ws, we) in zip(ann, spans)
        )
        if not aligned:
            bad_len_or_span += 1
            continue

        tags_str = " ".join(tag for _, _, tag in ann)
        rows.append({"sample": text, "tags": tags_str, "ann_tuples": ann})

    df2 = pd.DataFrame(rows)
    print(f"kept {len(df2)} rows; dropped parse_errors={bad_parse}, span/len_mismatches={bad_len_or_span}")
    if len(df2) == 0:
        raise RuntimeError("No valid rows after validation — check indices/spaces.")

    # Entity label names (without 'O'): the part after the first '-' of each tag.
    entity_labels = sorted({
        tag.split("-", 1)[-1]
        for ann in df2["ann_tuples"]
        for (_, _, tag) in ann
        if tag != "O"
    })
    print("entity_labels:", entity_labels)

    # Train/val split (safe): at least one validation row, at least one training row.
    if len(df2) == 1:
        train_df, val_df = df2.copy(), df2.iloc[[]].copy()
    else:
        val_n = max(1, int(round(len(df2) * val_frac)))
        val_n = min(val_n, len(df2) - 1)
        val_df = df2.sample(n=val_n, random_state=SEED)
        train_df = df2.drop(val_df.index).reset_index(drop=True)
        val_df = val_df.reset_index(drop=True)

    print(f"train={len(train_df)}, val={len(val_df)}")

    def to_messages(text: str, tags: str):
        """Wrap one sample as developer/user/assistant chat messages."""
        n = len(tags.split())
        dev = {"role":"developer","content":(
            f"You are an NER tagger. Use BIO with labels: {', '.join(entity_labels)}. "
            f"Return exactly {n} tags separated by a single space — one per whitespace-separated token. "
            "Allowed: O, B-<LABEL>, I-<LABEL>. Do not add or remove tokens."
        )}
        usr = {"role":"user","content": text}
        asst = {"role":"assistant","content": tags}
        return {"messages":[dev, usr, asst]}

    def dump_jsonl(df_in: pd.DataFrame, path: str):
        """Write one JSON object per line, keeping non-ASCII text readable."""
        with open(path, "w", encoding="utf-8") as f:
            for sample, tags in zip(df_in["sample"], df_in["tags"]):
                f.write(json.dumps(to_messages(sample, tags), ensure_ascii=False) + "\n")

    dump_jsonl(train_df, train_path)
    dump_jsonl(val_df,   val_path)

    # Save entity labels for later reuse (inference)
    with open(labels_path, "w", encoding="utf-8") as f:
        json.dump(entity_labels, f, ensure_ascii=False)

    print("Wrote:", train_path, "and", val_path)

# Generate train/val JSONL from the CSV unless they are already on disk.
make_jsonl_if_needed(
    csv_path=DATA_CSV,
    train_path=TRAIN_JSONL,
    val_path=VAL_JSONL,
    val_frac=VAL_FRAC,
    force=FORCE_REGEN,
)


JSONL already exist; skipping regen. Set FORCE_REGEN=True to rebuild.


In [3]:
from datasets import load_dataset, Features, Value
from transformers import AutoTokenizer

# Tokenizer used only by map_row_to_text to render chat messages into text.
# NOTE(review): this is the "openai/gpt-oss-20b" tokenizer, while the model cell
# loads "unsloth/gpt-oss-20b" — confirm both ship the same chat template.
tok_for_render = AutoTokenizer.from_pretrained(
    "openai/gpt-oss-20b",
    use_fast=True,
)

def map_row_to_text(ex):
    """Render one example's `messages` list into a single chat-template string.

    No generation prompt is appended and nothing is tokenized; the result is
    returned under the `text` key, as expected by `Dataset.map`.
    """
    rendered = tok_for_render.apply_chat_template(
        ex["messages"],
        tokenize=False,
        add_generation_prompt=False,
    )
    return dict(text=rendered)

# Output schema of the rendered datasets: a single string column.
features = Features({"text": Value("string")})

# Both JSONL files are loaded as the "train" split of a json dataset.
train_raw, val_raw = (
    load_dataset("json", data_files=jsonl_path, split="train")
    for jsonl_path in (TRAIN_JSONL, VAL_JSONL)
)

# Avoid stale cache; and don't use multiprocessing on Windows
map_kwargs = dict(features=features, load_from_cache_file=False)
train_ds = train_raw.map(map_row_to_text, remove_columns=train_raw.column_names, **map_kwargs)
val_ds   = val_raw.map(map_row_to_text, remove_columns=val_raw.column_names, **map_kwargs)

# Sanity check: schema plus the first 300 characters of one rendered sample.
print(train_ds.features)
print(train_ds[0]["text"][:300])


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 24526/24526 [00:02<00:00, 11869.51 examples/s]
Map: 100%|██████████| 2725/2725 [00:00<00:00, 11109.74 examples/s]

{'text': Value(dtype='string', id=None)}
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-09-27

Reasoning: medium

# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions

Yo





In [4]:
from unsloth import FastLanguageModel

# Use Unsloth's linearized gpt-oss 20B for QLoRA that fits ≈14 GB
load_kwargs = dict(
    model_name="unsloth/gpt-oss-20b",
    load_in_4bit=True,
    max_seq_length=MAX_LEN,
    dtype=None,            # auto (bf16 if supported)
    device_map="auto",
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# Attach LoRA adapters to every module matched by "all-linear".
# NOTE(review): the training log reports only 3,981,312 trainable parameters,
# which suggests the MoE expert weights are NOT adapted by this setting — confirm.
# Unsloth's log also warns that a non-zero dropout causes a performance hit
# (dropout = 0 is the fast-patched path).
lora_kwargs = dict(
    r=RANK,
    lora_alpha=2 * RANK,
    lora_dropout=0.05,
    target_modules="all-linear",
    bias="none",
)
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)

print("Model & tokenizer ready.")



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel
Exception in thread Thread-5 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\lexan\miniconda3\envs\gpt310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\lexan\miniconda3\envs\gpt310\lib\site-packages\ipykernel\ipkernel.py", line 772, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\lexan\miniconda3\envs\gpt310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\lexan\miniconda3\envs\gpt310\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\Users\lexan\miniconda3\envs\gpt310\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xad in position 7: invalid start byte


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


W0927 18:13:33.274000 25896 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.9: Fast Gpt_Oss patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.64s/it]
Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth: Making `model.base_model.model.model` require gradients
Model & tokenizer ready.


In [None]:
from trl import SFTTrainer, SFTConfig
import torch

# Pick the mixed-precision mode from what the GPU supports.
bf16_ok = torch.cuda.is_bf16_supported()

sft_args = SFTConfig(
    output_dir=OUTPUT_DIR,

    # tokenization and batching
    max_seq_length=MAX_LEN,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=GRAD_ACC,
    packing=PACKING,

    # training
    num_train_epochs=2,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",

    # reproducibility: reuse the notebook-wide seed instead of the trainer's own default
    seed=SEED,

    # logging / checkpoints
    logging_steps=25,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    report_to="none",

    # precision: exactly one of bf16 / fp16 is enabled
    bf16=bf16_ok,
    fp16=not bf16_ok,

    # critical on Windows: no dataset multiprocessing
    dataset_num_proc=1,
)

trainer = SFTTrainer(
    model=model,
    args=sft_args,
    train_dataset=train_ds,
    # optionally add validation:
    # eval_dataset=val_ds,  # and then also set sft_args.eval_strategy="steps"
    processing_class=tokenizer,
    dataset_text_field="text",
)

trainer.train()
# Persist the trained adapter together with the tokenizer files.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved adapter to:", OUTPUT_DIR)




The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 199998, 'pad_token_id': 200017}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 24,526 | Num Epochs = 2 | Total steps = 3,066
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 16 x 1) = 16
 "-____-"     Trainable parameters = 3,981,312 of 20,918,738,496 (0.02% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,4.6203
50,1.0812


In [None]:
import json

# Entity label names saved by the data-prep step; used to build the inference prompt.
ENTITY_LABELS = json.loads(pathlib.Path("entity_labels.json").read_text(encoding="utf-8"))

def ner_predict(text: str, labels=None):
    """Tag `text` with the fine-tuned model and return `[(word, tag), ...]`.

    Args:
        text: input string; words are the non-empty pieces between ' ' characters,
            the same rule data prep uses, so the tag count in the prompt matches training.
        labels: entity label names for the prompt; defaults to ENTITY_LABELS.

    Returns:
        One (word, tag) pair per word. Missing tags are padded with "O" and
        surplus tags are dropped. Empty or all-space input returns [].
    """
    labels = labels or ENTITY_LABELS
    # Ignore empty pieces so "a  b" is 2 tokens here, exactly as in the training data.
    words = [w for w in text.split(" ") if w]
    n = len(words)
    if n == 0:
        return []

    dev = {"role":"developer","content":(
        f"You are an NER tagger. Use BIO with labels: {', '.join(labels)}. "
        f"Return exactly {n} tags separated by a single space — one per whitespace-separated token. "
        "Allowed: O, B-<LABEL>, I-<LABEL>. Do not add or remove tokens."
    )}
    usr = {"role":"user","content": text}
    msgs = [dev, usr]

    # return_dict=True is required: without it a bare tensor comes back, which can
    # neither be unpacked with ** nor indexed by "input_ids".
    inp = tokenizer.apply_chat_template(
        msgs,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    prompt_len = inp["input_ids"].shape[1]

    with torch.no_grad():
        out = model.generate(
            **inp,
            # NOTE(review): 3 new tokens per tag may truncate long label names — confirm.
            max_new_tokens=max(8, n*3),
            do_sample=False,  # greedy decoding; no temperature needed
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Keep the special tokens while decoding and cut the message body out by hand.
    # Presumably the reply is framed like "<|channel|>final<|message|>TAGS<|return|>";
    # dropping the markers first would glue the channel name onto the first tag.
    raw = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=False)
    if "<|message|>" in raw:
        raw = raw.rsplit("<|message|>", 1)[1]
    raw = raw.split("<|", 1)[0].strip()

    lines = raw.splitlines()
    resp = lines[0].strip() if lines else ""  # an empty reply becomes all "O"
    tags = resp.split()
    if len(tags) < n: tags += ["O"] * (n - len(tags))
    if len(tags) > n: tags = tags[:n]
    return list(zip(words, tags))

# Quick smoke test:
# ner_predict("Кола Zero 1.5 литра")


In [None]:
import pandas as pd
from seqeval.metrics import f1_score, classification_report

# Rebuild a small DataFrame (sample text + gold tags) straight from the validation JSONL.
import json as _json

with open(VAL_JSONL, "r", encoding="utf-8") as f:
    # Each line holds {"messages": [developer, user, assistant]}.
    _val_messages = [_json.loads(line)["messages"] for line in f]

val_texts = [msgs[1]["content"] for msgs in _val_messages]  # user turn = raw sample
val_tags  = [msgs[2]["content"] for msgs in _val_messages]  # assistant turn = gold tags
val_frame = pd.DataFrame({"sample": val_texts, "tags": val_tags})

def quick_eval(val_df: pd.DataFrame, limit: int = 200):
    """Print seqeval F1 and a classification report for the first `limit` rows.

    Args:
        val_df: frame with a `sample` column (text) and a `tags` column
            (space-separated gold tags).
        limit: maximum number of rows to run through `ner_predict`.

    Each prediction is aligned to the gold length (truncated, or padded with
    "O"), because the metric functions need sequences of equal length.
    """
    y_true, y_pred = [], []
    subset = val_df.head(limit)
    for sample, tag_str in zip(subset["sample"], subset["tags"]):
        gold = tag_str.split()
        pred = [t for _, t in ner_predict(sample)][:len(gold)]
        pred += ["O"] * (len(gold) - len(pred))
        y_true.append(gold); y_pred.append(pred)
    if not y_true:
        # Nothing to score: avoid calling the metrics on empty input.
        print("quick_eval: no rows to evaluate.")
        return
    print("F1:", f1_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))

# quick_eval(val_frame, limit=200)
