In [1]:
# Force-reinstall the newest published unsloth / unsloth_zoo wheels, bypassing pip's cache
# (--no-cache-dir) and leaving their dependencies untouched (--no-deps).
!pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo

Collecting unsloth
  Downloading unsloth-2025.8.10-py3-none-any.whl.metadata (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo
  Downloading unsloth_zoo-2025.8.9-py3-none-any.whl.metadata (9.5 kB)
Downloading unsloth-2025.8.10-py3-none-any.whl (312 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.9/312.9 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading unsloth_zoo-2025.8.9-py3-none-any.whl (196 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.0/196.0 kB[0m [31m336.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unsloth_zoo, unsloth
Successfully installed unsloth-2025.8.10 unsloth_zoo-2025.8.9


In [None]:
%%capture
!pip install --upgrade -qqq uv
try:
    import numpy

    install_numpy = f"numpy=={numpy.__version__}"
except:
    install_numpy = "numpy"
!uv pip install -qqq \
    "torch>=2.8.0" "triton>=3.4.0" {install_numpy} \
    "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo@nightly" \
    "unsloth[base] @ git+https://github.com/unslothai/unsloth@nightly" \
    torchvision bitsandbytes \
    git+https://github.com/huggingface/transformers \
    git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels

In [None]:
from google.colab import drive, userdata

# Mount Google Drive at /content/drive; DRIVE_WORKSPACES_PATH below points inside this mount.
drive.mount("/content/drive")

import os
import io
import zipfile
import requests
import sys
import json
import pandas as pd
import tomlkit
import shutil
from glob import glob
from tomlkit import table

# Workspace layout: the repo is unpacked locally, while datasets and the env
# file are read from the mounted Drive folder.
LOCAL_WORKSPACES_PATH = "/workspaces/GEST"
DRIVE_WORKSPACES_PATH = "/content/drive/MyDrive/Facultate/UPB/ACS/IA/Disertatie"
DATASET_PATH = DRIVE_WORKSPACES_PATH + "/datasets"
ENV_PATH = DRIVE_WORKSPACES_PATH + "/env"
for workspace_dir in (LOCAL_WORKSPACES_PATH, DRIVE_WORKSPACES_PATH):
    os.makedirs(workspace_dir, exist_ok=True)

# Secure token: read from Colab Secrets, never hard-coded in the notebook.
GITHUB_TOKEN = userdata.get("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    raise ValueError("A GITHUB_TOKEN is required. Please add it to Colab Secrets.")

# Download repo zip
# The GitHub zipball endpoint returns an archive whose entries all live under a
# single generated root folder; that folder is stripped so the repo contents
# land directly in LOCAL_WORKSPACES_PATH.
OWNER, REPO, BRANCH = "OctavianJe", "GEST", "main"
api_url = f"https://api.github.com/repos/{OWNER}/{REPO}/zipball/{BRANCH}"
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

print(f"Downloading GEST library from {OWNER}/{REPO}@{BRANCH} …")
# (connect, read) timeouts: without them a stalled connection blocks the cell forever.
r = requests.get(api_url, headers=headers, timeout=(10, 300))
r.raise_for_status()
# The context manager closes the archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    root_dir = z.namelist()[0]
    for member in z.infolist():
        # Strip the root folder only when it is a real prefix of the entry name.
        if member.filename.startswith(root_dir):
            member.filename = member.filename[len(root_dir):]
        # The root folder entry itself becomes empty and is skipped.
        if member.filename:
            z.extract(member, LOCAL_WORKSPACES_PATH)
print("GEST library downloaded.")

# Install GEST Dependencies
# %cd switches the notebook's working directory to the extracted repo so that
# '.' in the editable install below resolves to LOCAL_WORKSPACES_PATH.
# NOTE(review): the recorded output of this cell shows pandas being replaced
# (2.2.2 -> 2.3.2) after `import pandas` already ran above — confirm whether a
# runtime restart is needed for the new version to take effect.
%cd {LOCAL_WORKSPACES_PATH}
!uv pip install -e '.[all]'
print("GEST installed (editable).")

# Add GEST to Python Path & Configure
# Put the repo's src/ folder at the front of sys.path (once) so `import gest` resolves to it.
src_dir = os.path.join(LOCAL_WORKSPACES_PATH, "src")
already_on_path = src_dir in sys.path
if not already_on_path:
    sys.path.insert(0, src_dir)
print("GEST library added to system path.")

# Load config.toml from the extracted repo, redirect caches/data to Drive,
# select the custom inference model, and write the file back.
config_file_path = os.path.join(LOCAL_WORKSPACES_PATH, "config.toml")
with open(config_file_path, "r") as config_in:
    config = tomlkit.load(config_in)

models_cache_root = os.path.join(DRIVE_WORKSPACES_PATH, "fine-tune/models")
config["text_similarity"]["models"]["cache_dir"] = os.path.join(
    models_cache_root, "text_similarity"
)
config["graph_matching"]["embeddings"]["cache_dir"] = os.path.join(
    models_cache_root, "graph_matching"
)

# Both gest paths are set to the same Drive "data" folder.
drive_data_dir = os.path.join(DRIVE_WORKSPACES_PATH, "data")
config["gest"]["data_path"] = drive_data_dir
config["gest"]["blacklist_path"] = drive_data_dir

# For e2e flow: register the fine-tuned model under [llm.custom_inference].
# For the generation-only flow, use the model name "gpt-oss-gest-generation-only"
# instead and additionally set
#   config["gest"]["engine"]["improvement"]["skip_improvement_step"] = True
ci = table()
ci.add("model", "gpt-oss-gest-e2e")
config["llm"]["custom_inference"] = ci

# Save updated config
with open(config_file_path, "w") as config_out:
    tomlkit.dump(config, config_out)

#  Copy dataset folder to LOCAL_WORKSPACES_PATH/miscellaneous/datasets
dst_dataset = os.path.join(LOCAL_WORKSPACES_PATH, "miscellaneous", "datasets")
# copytree creates dst_dataset (and parents) as needed, copies files with copy2
# like the previous manual walk, and merges into an existing destination thanks
# to dirs_exist_ok=True. Unlike os.walk, it raises FileNotFoundError when
# DATASET_PATH is missing instead of silently copying nothing.
shutil.copytree(DATASET_PATH, dst_dataset, dirs_exist_ok=True)
print(f"Copied dataset folder from {DATASET_PATH} to {dst_dataset}")

# Copy env file from ENV_PATH to LOCAL_WORKSPACES_PATH as .env
# dst_env is the destination path; the source is resolved by pick_env() below.
dst_env = os.path.join(LOCAL_WORKSPACES_PATH, ".env")


def pick_env(src: str) -> str:
    """Resolve ``src`` to the path of an env file.

    Args:
        src: Path to an env file, or to a directory that contains one.

    Returns:
        ``src`` itself when it is a file. For a directory, the first hit in
        this order: ``.env``, ``env``, the alphabetically first regular file
        matching ``*.env``, the alphabetically first regular file matching
        ``*env*``.

    Raises:
        FileNotFoundError: If ``src`` does not exist or no candidate file is found.
    """
    if os.path.isfile(src):
        return src
    if os.path.isdir(src):
        # prefer .env, then env, then any *.env / *env*
        for name in (".env", "env"):
            cand = os.path.join(src, name)
            if os.path.isfile(cand):
                return cand
        for pattern in ("*.env", "*env*"):
            # Keep regular files only (a sub-directory such as "old.env/" must not
            # be returned) and sort so the choice does not depend on listing order.
            matches = sorted(
                path
                for path in glob(os.path.join(src, pattern))
                if os.path.isfile(path)
            )
            if matches:
                return matches[0]
    raise FileNotFoundError(f"No env file found at: {src}")


# Resolve the env file on Drive, then copy it next to the repo as ".env".
env_source = pick_env(ENV_PATH)
shutil.copy2(env_source, dst_env)
print(f"Copied env to {dst_env}")

print("Successfully configured GEST to use cache paths in Google Drive.")


Mounted at /content/drive
Downloading GEST library from OctavianJe/GEST@feature/fine-tunning-generation-flow-LLM …
GEST library downloaded.
/workspaces/GEST
[2mUsing Python 3.12.11 environment at: /usr[0m
[2K[2mResolved [1m93 packages[0m [2min 6.95s[0m[0m
[2K[2mPrepared [1m10 packages[0m [2min 804ms[0m[0m
[2mUninstalled [1m2 packages[0m [2min 92ms[0m[0m
[2K[2mInstalled [1m10 packages[0m [2min 30ms[0m[0m
 [32m+[39m [1mappdirs[0m[2m==1.4.4[0m
 [32m+[39m [1masync-timeout[0m[2m==5.0.1[0m
 [32m+[39m [1menvtoml[0m[2m==0.1.2[0m
 [32m+[39m [1mgest[0m[2m==0.1.0 (from file:///workspaces/GEST)[0m
 [32m+[39m [1mollama[0m[2m==0.5.3[0m
 [31m-[39m [1mpandas[0m[2m==2.2.2[0m
 [32m+[39m [1mpandas[0m[2m==2.3.2[0m
 [32m+[39m [1mpy[0m[2m==1.11.0[0m
 [32m+[39m [1mpygmtools[0m[2m==0.5.3 (from git+https://github.com/Thinklab-SJTU/pygmtools.git@80399b1b07722340dd8d6e1bcce2618395fac2db)[0m
 [32m+[39m [1mretry[0m[2m==0.9.2[0

In [None]:
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import torch

from gest.data.gest import GEST

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# --- Training hyperparameters -------------------------------------------------
# MAX_SEQ_LEN is passed as max_seq_length when loading the model, as max_length
# to SFTConfig, and is reused as max_new_tokens by the inference patch.
MAX_SEQ_LEN = 6144
# BATCH_SIZE x GRAD_ACCUMULATION_STEPS = 64 sequences per optimizer step on one GPU.
BATCH_SIZE = 4
GRAD_ACCUMULATION_STEPS = 16
TRAIN_EPOCHS = 3
LEARNING_RATE = 2e-4
# SEED drives both the train/test split and the SFTConfig seed.
SEED = 42

# --- LoRA adapter settings (forwarded to FastLanguageModel.get_peft_model) ----
LORA_R = 32
LORA_ALPHA = 64
LORA_DROPOUT = 0.0
# Names of the projection modules that receive LoRA adapters.
TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

BASE_MODEL_ID = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"
# NOTE(review): DTYPE is not referenced by any cell visible in this notebook — confirm it is still needed.
DTYPE = "float32"

# Checkpoints are written to Drive (used as the trainer's output_dir).
CKPT_DIR = os.path.join(DRIVE_WORKSPACES_PATH, "fine-tune/checkpoints_gptoss20b_gest")
os.makedirs(CKPT_DIR, exist_ok=True)

In [6]:
print("Loading GPT-OSS 20B (Unsloth, 4-bit)…")
# Loader options gathered in one mapping so they can be inspected before the call.
load_kwargs = {
    "model_name": BASE_MODEL_ID,
    "max_seq_length": MAX_SEQ_LEN,
    "load_in_4bit": True,
    "device_map": "auto",
    "use_exact_model_name": True,
    "attn_implementation": "eager",
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# Save VRAM during training: the KV cache is switched off here and re-enabled
# by the inference patch before generation.
model.config.use_cache = False

Loading GPT-OSS 20B (Unsloth, 4-bit)…
==((====))==  Unsloth 2025.8.10: Fast Gpt_Oss patching. Transformers: 4.57.0.dev0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gpt_Oss does not support SDPA - switching to fast eager.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.37G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.16G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/165 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/27.9M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [7]:
print("Adding LoRA…")
# LoRA adapter settings, taken from the hyperparameter cell.
lora_kwargs = {
    "r": LORA_R,
    "lora_alpha": LORA_ALPHA,
    "lora_dropout": LORA_DROPOUT,
    "bias": "none",
    "use_gradient_checkpointing": "unsloth",
    "target_modules": TARGET_MODULES,
    "task_type": "CAUSAL_LM",
}
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)

Adding LoRA…
Unsloth: Making `model.base_model.model.model` require gradients


In [None]:
CSV_PATH = os.path.join(LOCAL_WORKSPACES_PATH, "data/gest.csv")
MANUAL_CSV_PATH = os.path.join(LOCAL_WORKSPACES_PATH, "data/gest_manual.csv")

print("Loading GEST CSVs…")
df_main, df_manual = (pd.read_csv(csv_path) for csv_path in (CSV_PATH, MANUAL_CSV_PATH))

# Sanity check: both files must expose the columns used below.
REQUIRED_COLUMNS = {"dataset", "id", "text", "gest"}
for csv_name, frame in (("gest.csv", df_main), ("gest_manual.csv", df_manual)):
    assert REQUIRED_COLUMNS.issubset(frame.columns), (
        f"{csv_name} needs: dataset,id,text,gest"
    )

# Merge manual uniques into main: rows are keyed by "<dataset>_<id>", and only
# manual rows whose key is absent from the main file are appended.
for frame in (df_main, df_manual):
    frame["unique_id"] = frame["dataset"].astype(str) + "_" + frame["id"].astype(str)
is_new_manual_row = ~df_manual["unique_id"].isin(df_main["unique_id"])
unique_manual_df = df_manual[is_new_manual_row]
combined_df = pd.concat([df_main, unique_manual_df], ignore_index=True)
combined_df = combined_df.drop(columns=["unique_id"])

# Split 75/25 (shuffled, seeded for reproducibility)
train_df, test_df = train_test_split(
    combined_df,
    shuffle=True,
    random_state=SEED,
    test_size=0.25,
)

# Build system prompt using GEST schema
# The JSON schema produced by GEST.model_json_schema() is serialised with a
# 2-space indent and embedded verbatim in the prompt. SYSTEM_PROMPT is used both
# when rendering training examples (build_messages) and at inference time
# (_colab_chat_implementation reads it from globals()).
GEST_JSON_SCHEMA = json.dumps(GEST.model_json_schema(), indent=2)
SYSTEM_PROMPT = f"""You are an expert entity and event extraction system. Your task is to analyze the user's text and output a single raw JSON object that strictly adheres to the provided GEST schema. No explanations or markdown — output only the JSON object.

Here is the required JSON schema:
{GEST_JSON_SCHEMA}
"""


def build_messages(user_text: str, gest_json: str):
    """Return the three chat turns (system, user, assistant) for one example.

    The system turn always carries the notebook-level SYSTEM_PROMPT; the user
    turn is the raw text and the assistant turn is the target GEST JSON string.
    """
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", user_text),
        ("assistant", gest_json),
    )
    return [{"role": role, "content": content} for role, content in turns]


def formatting_prompts_func(batch):
    """Render a batch to a single "text" column with the GPT-OSS chat template.

    Each (text, gest) pair is cast to str, turned into a conversation via
    build_messages, and rendered as an untokenized string with
    reasoning_effort="high" and no generation prompt.
    """

    def _render(source_text, target_gest):
        conversation = build_messages(str(source_text), str(target_gest))
        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
            reasoning_effort="high",
        )

    rendered_texts = [
        _render(source_text, target_gest)
        for source_text, target_gest in zip(batch["text"], batch["gest"])
    ]
    return {"text": rendered_texts}


print("Building HF datasets…")
# One HF Dataset per split, built from the index-reset DataFrames.
split_frames = {"train": train_df, "test": test_df}
ds = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame.reset_index(drop=True))
        for split_name, frame in split_frames.items()
    }
)
print({k: len(v) for k, v in ds.items()})

print("Applying chat template (high reasoning)…")
# Replace every original column with the single rendered "text" column.
ds_formatted = DatasetDict(
    {
        split_name: split.map(
            formatting_prompts_func,
            batched=True,
            remove_columns=split.column_names,
        )
        for split_name, split in ds.items()
    }
)
print("Preview first example:\n", ds_formatted["train"][0]["text"])


Loading GEST CSVs…
Building HF datasets…
{'train': 10341, 'test': 3448}
Applying chat template (high reasoning)…


Map:   0%|          | 0/10341 [00:00<?, ? examples/s]

Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Preview first example:
 <|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-09-03

Reasoning: high

# Valid channels: analysis, commentary, final. Channel must be included for every message.
Calls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>developer<|message|># Instructions

You are an expert entity and event extraction system. Your task is to analyze the user's text and output a single raw JSON object that strictly adheres to the provided GEST schema. No explanations or markdown — output only the JSON object.

Here is the required JSON schema:
{
  "$defs": {
    "Action": {
      "description": "Represents a verb-centric event involving one or more entities.",
      "properties": {
        "action": {
          "description": "Atomic verb performed (non-empty and **not** 'Exists')",
          "title": "Action",
          "type": "string"
        },
        "entities": {
   

In [9]:
# Use bf16 when the GPU supports it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

trainer_args = SFTConfig(
    # Output / checkpointing
    output_dir=CKPT_DIR,
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=5,
    report_to="none",
    # Schedule
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    auto_find_batch_size=False,
    # Optimisation
    learning_rate=LEARNING_RATE,
    optim="adamw_torch_fused",
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.01,
    seed=SEED,
    # Data
    dataset_text_field="text",
    packing=False,
    max_length=MAX_SEQ_LEN,
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=2,
    # Precision / memory
    bf16=use_bf16,
    fp16=not use_bf16,
    gradient_checkpointing=False,
)

# Only the training split is handed to the trainer; no eval dataset is configured.
trainer = SFTTrainer(
    model=model,
    args=trainer_args,
    train_dataset=ds_formatted["train"],
    processing_class=tokenizer,
)

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/10341 [00:00<?, ? examples/s]

In [None]:
# Run the training schedule configured in trainer_args; checkpoints are saved
# once per epoch into CKPT_DIR (save_strategy="epoch", output_dir=CKPT_DIR).
print("Starting fine-tuning…")
trainer.train()
print("Training complete.")

🚂 Starting fine-tuning…


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/10341 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 199998, 'pad_token_id': 200017}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,341 | Num Epochs = 3 | Total steps = 486
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 16 x 1) = 64
 "-____-"     Trainable parameters = 15,925,248 of 20,930,682,432 (0.08% trained)


Step,Training Loss,entropy
5,4.3899,0
10,2.1349,No Log
15,0.6334,No Log
20,0.4527,No Log
25,0.376,No Log
30,0.2885,No Log
35,0.1913,No Log
40,0.104,No Log
45,0.0806,No Log
50,0.0743,No Log


✅ Training complete.


In [10]:
# Continue a previous run from the checkpoints stored in the trainer's output_dir (CKPT_DIR).
# NOTE(review): this looks like an alternative to the previous cell (e.g. after a
# runtime reset) rather than a step to run in addition to it — confirm the intended workflow.
print("Resuming fine-tuning from the latest checkpoint…")
trainer.train(resume_from_checkpoint=True)
print("Training complete.")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 199998, 'pad_token_id': 200017}.


Resuming fine-tuning from the latest checkpoint…


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,341 | Num Epochs = 3 | Total steps = 486
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 16 x 1) = 64
 "-____-"     Trainable parameters = 15,925,248 of 20,930,682,432 (0.08% trained)


Step,Training Loss,entropy


Training complete.


In [11]:
import re
import json
import torch
from typing import Optional

from gest.service.other.llm.provider.custom_inference_llm_provider import (
    CustomInferenceLLMProvider,
)
from gest.service.other.llm.provider.base_llm_provider import LLMExpectedResultType
from gest.service.other.llm.provider.exception.provider_response_exceptions import (
    ProviderJSONDecodeError,
    ProviderValueError,
)


def _colab_chat_implementation(
    self,
    user_prompt: str,
    expected_result_type: LLMExpectedResultType,
    system_prompt: Optional[str] = None,
):
    """Answer a provider chat request with the model loaded in this notebook.

    Replacement for ``CustomInferenceLLMProvider._chat_implementation``. It reads
    ``model``, ``tokenizer``, ``SYSTEM_PROMPT`` and ``MAX_SEQ_LEN`` from the
    notebook globals, renders the conversation with the tokenizer's chat
    template (reasoning effort "high"), generates greedily, and returns the
    content of the assistant's final message.

    Args:
        user_prompt: Text sent as the user turn.
        expected_result_type: ``STRING`` returns the stripped text; ``JSON``
            returns the parsed JSON value.
        system_prompt: Accepted for interface compatibility but not used; the
            notebook-level ``SYSTEM_PROMPT`` is always sent as the system turn.

    Returns:
        ``str`` for ``STRING``, the decoded JSON value for ``JSON``.

    Raises:
        ProviderValueError: A required notebook global is missing, or the
            expected result type is unsupported.
        ProviderJSONDecodeError: The response could not be parsed as JSON.
    """
    for required_name in ("model", "tokenizer", "SYSTEM_PROMPT", "MAX_SEQ_LEN"):
        if required_name not in globals():
            raise ProviderValueError(f"`{required_name}` must be defined.")

    _model = globals()["model"]
    _tokenizer = globals()["tokenizer"]
    _SYSTEM_PROMPT = globals()["SYSTEM_PROMPT"]

    max_new_tokens = int(globals()["MAX_SEQ_LEN"])

    # Switch from the training setup (use_cache=False) to inference.
    _model.eval()
    _model.config.use_cache = True

    convo = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]
    prompt = _tokenizer.apply_chat_template(
        convo,
        tokenize=False,
        add_generation_prompt=True,
        reasoning_effort="high",
    )

    inputs = _tokenizer([prompt], return_tensors="pt").to(_model.device)

    with torch.inference_mode():
        out = _model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=_model.config.eos_token_id,
            pad_token_id=_model.config.pad_token_id,
            return_dict_in_generate=True,
        )

    # Keep only the newly generated tokens; special tokens are preserved because
    # the channel/stop markers are needed to locate the final answer.
    gen_ids = out.sequences[0, inputs["input_ids"].shape[-1] :]
    text = _tokenizer.decode(gen_ids, skip_special_tokens=False)

    # The chat template requires a channel on every assistant message, so the
    # answer is introduced by "<|channel|>final<|message|>" (possibly after an
    # analysis message). Take the content of the last final-channel message so
    # channel markers and reasoning text never leak into the result.
    final_marker = "<|channel|>final<|message|>"
    legacy_marker = "<|start|>assistant<|message|>"
    end_tok = "<|return|>"
    if final_marker in text:
        text = text.rsplit(final_marker, 1)[1]
        # A final message may also be closed by "<|end|>".
        if "<|end|>" in text:
            text = text.split("<|end|>", 1)[0]
    elif legacy_marker in text:
        # Channel-less rendering: same handling as before.
        text = text.split(legacy_marker, 1)[1]
    if end_tok in text:
        text = text.split(end_tok, 1)[0]

    if expected_result_type == LLMExpectedResultType.STRING:
        return text.strip()

    if expected_result_type == LLMExpectedResultType.JSON:
        # Greedy match from the first "{" to the last "}" tolerates stray text
        # around the JSON object.
        m = re.search(r"\{.*\}", text, flags=re.S)
        candidate = text if m is None else m.group(0).strip()
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as exc:
            raise ProviderJSONDecodeError(
                f"The response is not a valid JSON string: {exc}"
            ) from exc

    raise ProviderValueError(
        f"Unsupported expected_result_type: {expected_result_type}"
    )


# Monkey-patch the provider class so its chat calls run the in-notebook model
# through _colab_chat_implementation.
CustomInferenceLLMProvider._chat_implementation = _colab_chat_implementation

In [None]:
from gest.main import main

# Run the GEST entry point.
# NOTE(review): presumably LLM calls are routed through the patched
# CustomInferenceLLMProvider (config.toml sets llm.custom_inference) — confirm.
main()