### Installation

In [1]:
%%capture
import os
# Colab is detected by looking for any environment variable whose name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The --no-deps installs avoid pip re-resolving the packages that are already present.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth


# Packages needed to establish a public tunnel (Flask app exposed through ngrok)
!pip install flask pyngrok
from flask import Flask, request, jsonify, Response
from pyngrok import ngrok
import re
import json

### Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading configuration shared by the tokenizer, the LoRA setup and the trainer cells below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.bfloat16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: nothing in the visible cells reads `fourbit_models`;
# the model actually loaded is the `model_name` passed to from_pretrained below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer; both names are reused by every later cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

### Load Model
Run this cell only if you want to load your previously fine-tuned model (the saved LoRA adapters) instead of the base model.

In [None]:
from unsloth import FastLanguageModel
import torch
# Same loading configuration as the base-model cell; repeated so this cell can run on its own.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.bfloat16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


# "lora_model_a100" is the local directory written by the save cell at the end of this notebook.
# Running this cell rebinds the global `model` and `tokenizer`.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model_a100", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the loaded model with LoRA adapters; the global `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Adapters are attached to the attention projections (q/k/v/o) and the MLP projections (gate/up/down).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    # Gradient checkpointing is disabled here.
    use_gradient_checkpointing = False, # True or "unsloth" for very long context
    random_state = 3407, # same seed value as the trainer configs below
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

<a name="Data"></a>
### Data Prep

In [None]:
from datasets import load_dataset

# 1) Prompt template for LDAP tasks and end-of-sequence token
# The template is filled with str.format and has three positional `{}` slots:
# instruction, input (the request) and response. The `{{}}` in the first
# paragraph is an escaped pair of braces and renders as a literal `{}`.
# The same template is reused at inference time with an empty response slot.
ldap_prompt = """Below is an instruction that describes how to handle an LDAP request in JSON format, paired with an example input. Ensure you preserve the exact JSON structure, key names, and types, and correctly close all opened braces `{{}}`, brackets `[]` and quotation marks.  Write a strictly valid LDAP response in JSON format, maintaining formatting and syntax.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
# Appended to every training example by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token

# 2) Fixed instruction for all examples
ingest_instruction = "Generate a valid LDAP response in JSON format based on the given request, paying special attention to correctly closing every opened brace and bracket so that the output is syntactically complete JSON."

# 3) Formatting function for columns 'input' and 'output'
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into full training prompts.

    Args:
        examples: batch mapping with parallel "input" and "output" columns
            (as passed by `dataset.map(..., batched=True)`).

    Returns:
        A dict with a single "text" key holding one rendered prompt per row:
        the LDAP template filled with the fixed instruction, the row's input
        and output, followed by the tokenizer's EOS token.
    """
    return {
        "text": [
            ldap_prompt.format(ingest_instruction, request_text, response_text) + EOS_TOKEN
            for request_text, response_text in zip(examples["input"], examples["output"])
        ]
    }

# 4) Load the CSV dataset, assuming headers 'input' and 'output'
# The file is semicolon-delimited and read from a fixed Colab path (/content).
dataset = load_dataset(
    "csv",
    data_files={"train": "/content/combined.csv"},
    split="train",
    delimiter=";"
)

# 5) Filter out rows without a response (rows whose 'output' value is None)
dataset = dataset.filter(lambda x: x["output"] is not None)

# 6) Map examples to generate the 'text' field for the trainer
dataset = dataset.map(formatting_prompts_func, batched=True)
print(dataset)

<a name="Train"></a>
### Train the model

# L4
Settings for L4 GPU

In [None]:
from trl import SFTConfig, SFTTrainer
# Trainer configuration intended for an L4 GPU.
# Effective batch per device = 4 sequences x 4 accumulation steps = 16.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    packing=True,           # <- important for speeding up training when examples are short
    max_seq_length = max_seq_length,
    args = SFTConfig(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.12, # warm up the learning rate over the first 12% of training
        num_train_epochs = 6, # 6 full passes over the dataset
        learning_rate = 5e-5, # conservative learning rate
        logging_steps = 5,
        optim = "adamw_torch_fused",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        gradient_checkpointing = False,        # disabled here; enabling it is the usual way to save VRAM on L4
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
        max_grad_norm = 0.5, # gradient clipping helps stability
        bf16 = True,   # True If supports BF16, otherwise False. A100 Supports
        # NOTE(review): confirm that TRL's `dataset_kwargs` honors "train_on_inputs";
        # if the key is ignored, the loss is computed over prompt + response.
        dataset_kwargs = {
        "train_on_inputs": False
        }
    ),
)

# A100
Settings for A100 GPU

In [None]:
from trl import SFTConfig, SFTTrainer
# Trainer configuration intended for an A100 GPU.
# Effective batch per device = 16 sequences x 1 accumulation step = 16.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 16,
        gradient_accumulation_steps = 1,
        warmup_ratio = 0.12, # warm up the learning rate over the first 12% of training
        num_train_epochs = 6, # 6 full passes over the dataset
        learning_rate = 5e-5, # conservative learning rate
        logging_steps = 5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
        max_grad_norm = 0.5, # gradient clipping helps stability
        bf16 = True,   # True If supports BF16, otherwise False. A100 Supports
        fp16 = False,  # True If not supports BF16, otherwise False
        # NOTE(review): confirm that TRL's `dataset_kwargs` honors "train_on_inputs";
        # if the key is ignored, the loss is computed over prompt + response.
        dataset_kwargs = {
            "train_on_inputs": False
        }
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/328 [00:00<?, ? examples/s]

In [5]:
# @title Show current memory stats
# Snapshot taken before training; `start_gpu_memory` and `max_memory` are reused
# by the final-stats cell to compute how much memory training itself added.
gpu_stats = torch.cuda.get_device_properties(0)  # properties of CUDA device 0
# Dividing by 1024 three times converts the raw value to the "GB" figure printed below.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
7.135 GB of memory reserved.


In [None]:
# Runs fine-tuning with whichever `trainer` was built last (L4 or A100 cell).
# The returned object's `.metrics` dict is read by the final-stats cell.
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
# Peak reserved memory after training, in the same unit as `start_gpu_memory`.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Memory attributed to training = peak after training minus the pre-training snapshot.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures expressed as a percentage of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
### Inference
This cell runs the model behind a Flask app and exposes it through an ngrok tunnel. Copy the tunnel URL that is displayed.



In [None]:
import json
from flask import Response

import os
from getpass import getpass

# SECURITY: never hard-code the ngrok authtoken in the notebook. It is read from
# the NGROK_AUTHTOKEN environment variable and, if that is unset, requested
# interactively so it is never stored in the saved file or its outputs.
# Any token that was ever committed in this cell must be considered leaked and
# revoked at https://dashboard.ngrok.com/get-started/your-authtoken
NGROK_AUTHTOKEN = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(NGROK_AUTHTOKEN)

# Configuration variables (only used by build_ldap_prompt when manual=True)
BASE_DN = "dc=ejemplo,dc=com"
COMPANY_TYPE = "Bank"
LANGUAGE = "English"

# Open ngrok tunnel on port 5000 (the same port the Flask app listens on)
public_url = ngrok.connect(5000)
print("🌐 Tunnel URL:", public_url)

# Initialize Flask app
app = Flask(__name__)

def build_ldap_prompt(input_message, manual: bool = False):
    """
    Assemble the full LLM prompt (instruction + serialized request) for one LDAP message.

    Args:
        input_message: the LDAP request; it is serialized with json.dumps
            (ensure_ascii=False) into the template's input slot.
        manual: when true, the instruction embeds the fixed BASE_DN,
            COMPANY_TYPE and LANGUAGE settings; when false (default), the
            instruction tells the model to infer a single consistent base DN
            and language from the request itself.

    Returns:
        The LDAP prompt template filled with the instruction, the JSON request
        and an empty response slot for the model to complete.
    """
    if not manual:
        instruction = """
You are a simulated LDAP server (honeypot).

AUTO mode – infer context from the request:
  - Infer a single, realistic base DN (domain) and keep it consistent across ALL outputs:
      * If protocolOp.searchRequest.baseObject is a non-empty DN -> reuse its domain.
      * Else, if you see email-like strings (e.g., "user@domain.tld"), convert to a DN:
          "dc=domain,dc=tld" (add more dc= parts if needed).
      * Else, if any DN appears in filters or attributes, reuse its domain.
      * Else, choose a neutral, plausible domain and be consistent (e.g., "dc=example,dc=org").
  - Infer language for human-readable strings (e.g., diagnosticMessage):
      * Prefer Spanish if baseObject or values suggest Spanish context (e.g., "dc=es",
        OU/CN names in Spanish, or Spanish department names).
      * Prefer English otherwise.
      * Do not alter attribute values that are identifiers (DNs, mail, cn).

Rules (always):
  - Mirror the request "messageID" in EVERY object you emit.
  - Keep JSON valid: balanced braces, double quotes for keys/strings.
  - Keep attributes realistic and short. Keep DNs consistent within the same response.
        """.strip()
    else:
        instruction = f"""
You are a simulated LDAP server (honeypot).
Use the following fixed context:
  • Base DN: {BASE_DN}
  • Company Type: {COMPANY_TYPE}
  • Language: {LANGUAGE}

Rules (always):
  - Mirror the request "messageID" in EVERY object you emit.
  - Keep JSON valid: balanced braces, double quotes for keys/strings.
  - Keep attributes realistic and short. Keep DNs consistent within the same response.
        """.strip()

    # The response slot is left empty so generation continues right after "### Response:".
    request_json = json.dumps(input_message, ensure_ascii=False)
    return ldap_prompt.format(instruction, request_json, "")

def budget_for_request(message: dict) -> int:
    """
    Decide max_new_tokens from the incoming LDAP request.

    Small, readable buckets. If nothing matches, or the request is malformed
    (non-dict message, non-dict "protocolOp", non-dict "searchRequest"),
    return the fallback of 1000. The message comes from untrusted clients, so
    this function never raises on unexpected shapes.

    Args:
        message: decoded LDAP request; expected to carry a "protocolOp" dict
            keyed by the operation name (e.g. "bindRequest", "searchRequest").

    Returns:
        The token budget to pass as max_new_tokens.
    """
    op = message.get("protocolOp") if isinstance(message, dict) else None
    if not isinstance(op, dict):
        return 1000

    # --- Non-search ops (short replies) ---
    if "bindRequest" in op:
        return 160                      # one bindResponse
    if "abandonRequest" in op:
        return 48                       # we shouldn't emit anything anyway
    if any(k in op for k in ("addRequest", "modifyRequest", "modDNRequest", "delRequest")):
        return 320                      # small structured responses

    # --- Search ops ---
    if "searchRequest" in op:
        sr = op["searchRequest"]
        if not isinstance(sr, dict):
            # e.g. {"searchRequest": null}: nothing to size from, use the fallback
            return 1000

        scope = sr.get("scope", None)   # 0 base, 1 singleLevel, 2 wholeSubtree

        # sizeLimit: anything that cannot be read as an integer counts as "no limit" (0)
        size_limit = sr.get("sizeLimit", 0) or 0
        try:
            size_limit = int(size_limit)
        except (TypeError, ValueError, OverflowError):
            size_limit = 0

        # attributes count
        attrs = sr.get("attributes", [])
        attr_count = len(attrs) if isinstance(attrs, list) else 0

        # filter type = first key of the filter object, if any
        filt = sr.get("filter", {})
        filter_type = next(iter(filt)) if isinstance(filt, dict) and filt else None

        # Base budget: ~80 tokens per entry + 160 overhead.
        # Without a positive sizeLimit we assume 6 entries; otherwise cap at 10.
        expected = min(size_limit, 10) if size_limit > 0 else 6
        base = 160 + 80 * expected

        # scope=0 (RootDSE/Subschema-like) → small
        if scope == 0:
            return max(256, min(base, 512))

        # "wide" filters or many attributes tend to expand
        if filter_type in ("substrings", "present") or attr_count > 5:
            return 768 if (size_limit == 0 or size_limit >= 8) else max(512, min(base, 640))

        # scope=1 (single level) moderate
        if scope == 1:
            return max(384, min(base, 640))

        # scope=2 (whole subtree) default
        if scope == 2:
            return max(512, min(base, 768))

        # unknown scope but still a search
        return min(base, 768)

    # --- Fallback for anything else ---
    return 1000

def run_inference(prompt_text, max_tokens: int):
    """Generate a completion for one prompt and return the raw decoded text.

    Args:
        prompt_text: the fully rendered prompt.
        max_tokens: passed to `model.generate` as `max_new_tokens`.

    Returns:
        The first (only) decoded sequence. It is decoded with batch_decode's
        default arguments, so the text still has to be post-processed by
        the caller.
    """
    FastLanguageModel.for_inference(model)
    encoded = tokenizer([prompt_text], return_tensors="pt").to("cuda")
    generated = model.generate(**encoded, max_new_tokens=max_tokens, use_cache=True)
    decoded_batch = tokenizer.batch_decode(generated)
    return decoded_batch[0]

def split_json_objects(raw: str) -> tuple[list[dict], bool]:
    """
    Pull every complete JSON object out of a string of consecutive objects.

    Scanning jumps from one '{' to the next; text between objects is ignored.
    Scanning stops at the first '{' that does not start a decodable object.

    Returns:
        (objects, partial): the decoded dicts in order of appearance, and
        partial=True when scanning stopped on an undecodable (e.g. truncated)
        object.
    """
    decoder = json.JSONDecoder()
    text = raw.lstrip()
    found: list[dict] = []

    brace_at = text.find("{")
    while brace_at != -1:
        try:
            value, resume_at = decoder.raw_decode(text, brace_at)
        except json.JSONDecodeError:
            # undecodable object: report what we have and flag the truncation
            return found, True
        if isinstance(value, dict):
            found.append(value)
        brace_at = text.find("{", resume_at)

    return found, False

def extract_response(decoded_output):
    """
    Cut the model's answer out of the full decoded generation.

    The answer is the text after the first "### Response:" marker, up to the
    first "<|end_of_text|>" that follows it (or to the end of the string when
    that token is absent). If the marker is missing, the whole input is
    returned. The result is always stripped of surrounding whitespace.
    """
    _, marker, after_marker = decoded_output.partition("### Response:")
    if not marker:
        return decoded_output.strip()
    # partition yields the whole remainder when the end token is absent (safe fallback)
    answer = after_marker.partition("<|end_of_text|>")[0]
    return answer.strip()

def inference_ldap_message(message: dict):
    """Produce the model's answer text for one incoming LDAP message.

    Builds the prompt (auto mode), sizes the generation budget from the
    request, runs the model and returns only the extracted response section.
    """
    prompt_text = build_ldap_prompt(message)
    token_budget = budget_for_request(message)  # dynamic per-request budget
    return extract_response(run_inference(prompt_text, max_tokens=token_budget))

@app.route("/receive_data", methods=["POST"])
def recibir_datos():
    """
    Handle one LDAP request (as JSON) and stream back the model's LDAP responses.

    The body must be a JSON object, or a JSON string that itself contains a
    JSON object (double-encoded), which is unboxed once.

    Returns:
        - 400 with {"error": ...} when the body is empty, is a string that is
          not valid JSON, or is not a JSON object at top level.
        - 500 with {"error": "Parsing error", ...} if splitting the model
          output fails unexpectedly.
        - Otherwise a streamed response with one JSON object per line. For a
          searchRequest whose model output was truncated and contains no
          searchResDone, a synthetic searchResDone is appended so the client
          always sees the search terminated.
    """
    data = request.get_json()
    if not data:
        return jsonify({"error": "No input received"}), 400

    # Bound before unboxing so the log line below works for plain JSON objects
    # as well as for double-encoded strings.
    data_raw = data

    # If it's a JSON string (double-encoded), unbox once
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except json.JSONDecodeError:
            return jsonify({"error": "Body is a JSON string but not valid JSON inside."}), 400

    if not isinstance(data, dict):
        return jsonify({"error": "Expected a JSON object at top-level."}), 400

    print("📩 Received data:", data_raw)
    llm_raw = inference_ldap_message(data)
    print("📤 LLM raw response:", llm_raw)

    # Split into individual JSON objects
    try:
        entries, partial = split_json_objects(llm_raw)
    except Exception as e:
        return jsonify({"error": "Parsing error", "detail": str(e)}), 500

    # If the response was truncated and it was a searchRequest, we guarantee Done
    op = data.get("protocolOp")
    is_search = isinstance(op, dict) and "searchRequest" in op

    if partial and is_search:
        # Is there already a complete Done?
        has_done = any(
            isinstance(e, dict)
            and isinstance(e.get("protocolOp"), dict)
            and "searchResDone" in e["protocolOp"]
            for e in entries
        )
        if not has_done:
            print("⚠️ Injected searchResDone due to truncated tail")
            # messageID comes from an untrusted client; fall back to 0 when it
            # is not integer-like instead of failing the whole request.
            try:
                mid = int(data.get("messageID", 0))
            except (TypeError, ValueError, OverflowError):
                mid = 0
            entries.append({
                "messageID": mid,
                "protocolOp": {
                    "searchResDone": {
                        "resultCode": 0,
                        "matchedDN": "",
                        "diagnosticMessage": ""
                    }
                }
            })

    # Stream each entry back on its own line
    def generate():
        for entry in entries:
            chunk = json.dumps(entry, ensure_ascii=False)
            print("🚚 Sending chunk:", chunk)
            yield chunk + "\n"

    return Response(generate(), mimetype="application/json")

# Start the Flask server on all interfaces, on port 5000, the same port the
# ngrok tunnel was opened on.
# NOTE(review): in a notebook kernel `__name__` is presumably "__main__", so this
# call keeps the cell running until it is interrupted; confirm in your runtime.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters.

In [None]:
# Write the model and the tokenizer to the local "lora_model_a100" directory,
# the same name the "Load Model" cell passes as `model_name`.
model.save_pretrained("lora_model_a100")  # Local saving
tokenizer.save_pretrained("lora_model_a100")

('lora_model_a100/tokenizer_config.json',
 'lora_model_a100/special_tokens_map.json',
 'lora_model_a100/tokenizer.json')