### **Setup & paths**

In [1]:
# ===== A) Paths & offline mode =====
# Resolves the project root, creates the output/model folders, and pins every
# later cell to a local (offline) copy of the base model.
from pathlib import Path
import sys, os, platform

# The notebook may be started from the repo root or from a sub-folder one
# level below it; the root is whichever of the two contains `src/`.
CWD  = Path.cwd().resolve()
ROOT = CWD if (CWD / "src").exists() else CWD.parent
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # makes `import src....` work from notebook cells

DATA    = ROOT / "data"
SFT_DIR = DATA / "sft"
OUT     = ROOT / "outputs" / "lora_hf" / "title17"
OUT.mkdir(parents=True, exist_ok=True)

LOCAL_QWEN = ROOT / "models" / "Qwen2.5-1.5B-Instruct"
LOCAL_QWEN.mkdir(parents=True, exist_ok=True)

# This is the ONLY place we set these:
MODEL_ID = str(LOCAL_QWEN)        # <— local path, not hub id
LOCAL_FILES_ONLY = True
os.environ["TRANSFORMERS_OFFLINE"] = "1"  # extra-safe offline

# The mkdir above means the folder always exists, even when no model has been
# downloaded into it. Flag that here so the failure is not first seen as an
# obscure loader error in a later cell.
if not (LOCAL_QWEN / "config.json").exists():
    print("WARNING : no config.json found in", LOCAL_QWEN,
          "- download the base model before running the offline cells below.")

print("ROOT    :", ROOT)
print("SFT_DIR :", SFT_DIR)
print("OUT     :", OUT)
print("MODEL_ID:", MODEL_ID)
print("OFFLINE :", os.environ.get("TRANSFORMERS_OFFLINE"))
print("Python  :", sys.version.split()[0], "|", platform.platform())

ROOT    : D:\IIT BBS\Job Resources\Business Optima\pdf-agent
SFT_DIR : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft
OUT     : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\lora_hf\title17
MODEL_ID: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\models\Qwen2.5-1.5B-Instruct
OFFLINE : 1
Python  : 3.11.13 | Windows-10-10.0.26100-SP0


### **Model Download**

In [None]:
# One-time model download, intentionally left commented out so that a normal
# top-to-bottom run of the notebook never touches the network. Uncomment and
# run once to populate ROOT / "models" / "Qwen2.5-1.5B-Instruct".
# NOTE(review): the setup cell sets TRANSFORMERS_OFFLINE=1; a download
# presumably needs that unset (fresh kernel) - confirm before re-running.
# NOTE(review): `local_dir_use_symlinks` may be deprecated in recent
# huggingface_hub releases - verify against the installed version.

# from huggingface_hub import snapshot_download
# from pathlib import Path

# LOCAL_QWEN = Path(ROOT / "models" / "Qwen2.5-1.5B-Instruct")
# LOCAL_QWEN.mkdir(parents=True, exist_ok=True)

# local_dir = snapshot_download(
#     repo_id="Qwen/Qwen2.5-1.5B-Instruct",
#     local_dir=str(LOCAL_QWEN),
#     local_dir_use_symlinks=False,  # avoid symlink perms on Windows
#     allow_patterns=["*.json","*.bin","*.safetensors","*.model","tokenizer*","*.py","*.txt"],
# )

# print("Local model dir:", local_dir)

# # Switch to fully offline usage:
# MODEL_ID = str(LOCAL_QWEN)
# LOCAL_FILES_ONLY = True

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Local model dir: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\models\Qwen2.5-1.5B-Instruct


### **Check/prepare Alpaca data**

In [2]:
# ===== B) Ensure alpaca jsonl exists & peek =====
# Builds the Alpaca-format training file from the raw SFT file when it is
# missing, then prints the first two records as a quick sanity check.
import itertools
import json

from src.train.sft_to_alpaca import to_alpaca

ALPACA = SFT_DIR / "alpaca.train.jsonl"

if not ALPACA.exists():
    src_train = SFT_DIR / "train.jsonl"
    assert src_train.exists(), f"Missing SFT source: {src_train}"
    to_alpaca(src_train, ALPACA)

print("ALPACA exists:", ALPACA.exists(), "|", ALPACA)

# Use a context manager so the handle is closed right after the peek; a
# dangling handle keeps the file locked on Windows for the kernel's lifetime.
with open(ALPACA, "r", encoding="utf-8") as alpaca_fh:
    for line in itertools.islice(alpaca_fh, 2):
        print(json.loads(line))

ALPACA exists: True | D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\alpaca.train.jsonl
{'instruction': '(factual) What is the main rule in Copyright Law United States Copyri > § 115 · Scope of exclusive rights in nondramatic musical works: Compulsory license for making and distributing phonorecords 50?\nPrefer concise, correct answers. End with [pp. 88–92].', 'input': '', 'output': 'The primary purpose must be to distribute them to the public for private use.'}
{'instruction': 'Summarize the section:\nHeading: Copyright Law United States Copyri > § 304 · Duration of copyright: Subsisting copyrights 6\nSummarize as 6–8 concise bullet points. End with [pp. 189–201].', 'input': '', 'output': '- - For copyrights subsisting on January 1, 1978, the original term endures for 28 years.\n- - In certain cases, such as posthumous works or corporate copyrights, a renewal and extension of 67 years is possible.\n- - Eligible claimants include the author if living, their widow/widower o

### **Tokenizer smoke test (catches missing local files / offline-loading issues early)**

In [4]:
# ===== D) Verify tokenizer loads strictly offline =====
# Loading the tokenizer from the local model folder is a cheap way to confirm
# the offline setup works before the expensive training cell is launched.
from transformers import AutoTokenizer

tokenizer_load_opts = {
    "local_files_only": LOCAL_FILES_ONLY,
    "trust_remote_code": True,
}
tok = AutoTokenizer.from_pretrained(MODEL_ID, **tokenizer_load_opts)

# Fall back to the EOS token when the tokenizer defines no pad token.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

print("Tokenizer OK. eos:", tok.eos_token, "| pad:", tok.pad_token)


Tokenizer OK. eos: <|im_end|> | pad: <|endoftext|>


In [5]:
# ===== E) Lib versions =====
# Records the versions of the training stack (and CUDA availability) next to
# the run so results can be traced back to an environment.
import os

import accelerate
import datasets
import peft
import torch
import transformers

print("torch        :", torch.__version__, "| cuda?", torch.cuda.is_available())

# The remaining rows share one shape: a fixed-width label followed by a value.
version_rows = [
    ("transformers :", transformers.__version__),
    ("datasets     :", datasets.__version__),
    ("peft         :", peft.__version__),
    ("accelerate   :", accelerate.__version__),
    ("CPU threads  :", os.cpu_count()),
]
for row_label, row_value in version_rows:
    print(row_label, row_value)

torch        : 2.8.0+cpu | cuda? False
transformers : 4.55.2
datasets     : 4.0.0
peft         : 0.17.1
accelerate   : 1.10.0
CPU threads  : 8


### **Hyperparams (CPU-friendly)**

In [6]:
# ===== F) CPU-safe hparams =====
# Every tunable for the CPU LoRA run lives here; the launch cell forwards
# these values to the training script as command-line flags.

# Optimisation schedule
MAX_STEPS    = 300
BATCH_SIZE   = 1
GRAD_ACCUM   = 4
LR           = 2e-4

# Sequence length and CPU parallelism (never more than 8 threads)
MAX_SEQ_LEN  = 512
NUM_THREADS  = min(8, os.cpu_count() or 8)

# LoRA adapter shape
LORA_R       = 8
LORA_ALPHA   = 16
LORA_DROPOUT = 0.05

SEED         = 7

hparam_summary = {
    "max_steps": MAX_STEPS,
    "batch": BATCH_SIZE,
    "grad_accum": GRAD_ACCUM,
    "lr": LR,
    "max_seq_len": MAX_SEQ_LEN,
    "num_threads": NUM_THREADS,
    "lora_r": LORA_R,
    "lora_alpha": LORA_ALPHA,
    "lora_dropout": LORA_DROPOUT,
}
print(hparam_summary)

{'max_steps': 300, 'batch': 1, 'grad_accum': 4, 'lr': 0.0002, 'max_seq_len': 512, 'num_threads': 8, 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05}


### **Train LoRA (run script, stream logs)**

In [7]:
# ===== G) Launch training (strictly offline) =====
# Runs the CPU LoRA training script as a child process and streams its
# combined stdout/stderr into the cell output line by line.
# Raises RuntimeError when the script exits with a non-zero code.
import subprocess, sys, shlex

args = [
    sys.executable, str(ROOT / "src/train/cpu_lora_hf.py"),
    "--model_id", MODEL_ID,                      # local folder
    "--train_jsonl", str(ALPACA),
    "--out_dir", str(OUT),
    "--max_steps", str(MAX_STEPS),
    "--batch_size", str(BATCH_SIZE),
    "--grad_accum", str(GRAD_ACCUM),
    "--lr", str(LR),
    "--max_seq_len", str(MAX_SEQ_LEN),
    "--num_threads", str(NUM_THREADS),
    "--lora_r", str(LORA_R),
    "--lora_alpha", str(LORA_ALPHA),
    "--lora_dropout", str(LORA_DROPOUT),
    "--seed", str(SEED),
    "--local_files_only",
    "--trust_remote_code",
]
print("Launching:", " ".join(map(shlex.quote, args)))

proc = subprocess.Popen(
    args,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    errors="replace",  # an undecodable progress-bar byte must not abort the log stream
)
try:
    for line in proc.stdout:
        print(line, end="")
    code = proc.wait()
except BaseException:
    # Interrupting the cell (or any streaming error) would otherwise leave the
    # multi-hour trainer running as an orphan; stop and reap it, then re-raise.
    proc.kill()
    proc.wait()
    raise
finally:
    proc.stdout.close()

print("\n[proc exit code]", code)
if code != 0:
    raise RuntimeError("Training failed; see logs above.")


Launching: 'd:\Anaconda\envs\pdf-agent-2\python.exe' 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\src\train\cpu_lora_hf.py' --model_id 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\models\Qwen2.5-1.5B-Instruct' --train_jsonl 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\alpaca.train.jsonl' --out_dir 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\lora_hf\title17' --max_steps 300 --batch_size 1 --grad_accum 4 --lr 0.0002 --max_seq_len 512 --num_threads 8 --lora_r 8 --lora_alpha 16 --lora_dropout 0.05 --seed 7 --local_files_only --trust_remote_code

Map:   0%|          | 0/130 [00:00<?, ? examples/s]
Map: 100%|██████████| 130/130 [00:00<00:00, 153.92 examples/s]
Map: 100%|██████████| 130/130 [00:00<00:00, 146.20 examples/s]
trainable params: 9,232,384 || all params: 1,552,946,688 || trainable%: 0.5945
  trainer = Trainer(


  0%|          | 1/300 [00:52<4:20:02, 52.18s/it]
  1%|          | 2/300 [02:24<6:15:05, 75.52s/it]
  1%|          | 3/300 [03

### **Inspect saved adapter files**

In [8]:
# ===== H) Inspect adapter folder =====
# Reports whether the adapter folder exists under OUT and, if so, lists the
# names of the entries it contains.
adapter_dir = OUT / "adapter"
adapter_present = adapter_dir.exists()
print("Adapter dir:", adapter_dir, "| exists:", adapter_present)

if adapter_present:
    adapter_entries = []
    for entry in adapter_dir.iterdir():
        adapter_entries.append(entry.name)
    print(adapter_entries)

Adapter dir: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\lora_hf\title17\adapter | exists: True
['adapter_config.json', 'adapter_model.safetensors', 'added_tokens.json', 'chat_template.jinja', 'merges.txt', 'README.md', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'vocab.json']


### **Merging LoRA into base for standalone (adapter-free) inference**

In [None]:
# Merge the trained LoRA adapter into the base model by running the project's
# merge script as a child process.
# Raises RuntimeError when the script exits with a non-zero code.
import subprocess, sys, shlex
MERGED_DIR = ROOT / "outputs" / "lora_hf" / "title17_merged"
args = [
    sys.executable, str(ROOT / "src/train/merge_lora.py"),
    "--base_model", MODEL_ID,                   # local base directory
    "--lora_dir", str(OUT / "adapter"),
    "--out_dir", str(MERGED_DIR),
    "--local_files_only",
    "--trust_remote_code",
]
print("Merging:", " ".join(map(shlex.quote, args)))

merge_result = subprocess.run(args, capture_output=True, text=True)
print(merge_result.stdout)

# Previously the exit status and stderr were dropped, so a failed merge looked
# like an empty success. Surface the error text and stop the notebook instead.
if merge_result.returncode != 0:
    print(merge_result.stderr, file=sys.stderr)
    raise RuntimeError(
        f"Merge failed with exit code {merge_result.returncode}; see stderr above."
    )

Merging: 'd:\Anaconda\envs\pdf-agent-2\python.exe' 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\src\train\merge_lora.py' --base_model 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\models\Qwen2.5-1.5B-Instruct' --lora_dir 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\lora_hf\title17\adapter' --out_dir 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\lora_hf\title17_merged' --local_files_only --trust_remote_code
[OK] merged model saved to: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\lora_hf\title17_merged



### **Inference with *adapter* (no merge)**

In [11]:
# Greedy-decoding smoke test of the base model with the LoRA adapter attached
# at load time (no merged weights involved), running on CPU in float32.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_path = MODEL_ID
adapter_dir = OUT / "adapter"

# Loader flags shared by the tokenizer and the base model.
offline_load_opts = {"local_files_only": True, "trust_remote_code": True}

tok = AutoTokenizer.from_pretrained(base_path, **offline_load_opts)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

base = AutoModelForCausalLM.from_pretrained(
    base_path,
    torch_dtype=torch.float32,
    device_map={"": "cpu"},
    **offline_load_opts,
)
model = PeftModel.from_pretrained(base, str(adapter_dir), local_files_only=True)
model.eval()

prompt = "Summarize § 114 and note the performance right caveat. End with [pp. 67–88]."
inputs = tok(prompt, return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=180, do_sample=False)

# The decoded sequence starts with the prompt tokens, so the prompt is echoed.
decoded_text = tok.decode(out[0], skip_special_tokens=True)
print(decoded_text)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Summarize § 114 and note the performance right caveat. End with [pp. 67–88]. ### Section 114: Scope of exclusive rights: Secondary transmissions by cable of local television programming
#### (b) Performance for secondary transmission by cable.-
-  *(1)* The exclusive rights granted by this section shall not apply to a performance made by a person other than a natural person described in paragraph (1)(C), engaged in a secondary transmission by cable of local television programming.
- *(2)* In addition, no person shall engage in a secondary transmission by cable of local television programming that is made without authorization under this section or section 106 of the copyright Act [see footnote 2 for source], except as provided in subparagraph (D).*
- *(3)* A secondary transmission by cable of local television programming is a transmission of a performance embodied in a primary transmission of a broadcast station to the private home of a subscriber residing outside the local service are

### **Inference with *merged* model**

In [12]:
# Greedy-decoding smoke test of the merged model: tokenizer and weights are
# both read from MERGED_DIR, so no adapter is attached at load time.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MERGED_DIR = ROOT / "outputs" / "lora_hf" / "title17_merged"
merged_path = str(MERGED_DIR)

# Loader flags shared by the tokenizer and the model.
offline_load_opts = {"local_files_only": True, "trust_remote_code": True}

tok = AutoTokenizer.from_pretrained(merged_path, **offline_load_opts)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

model = AutoModelForCausalLM.from_pretrained(
    merged_path,
    torch_dtype=torch.float32,
    device_map={"": "cpu"},
    **offline_load_opts,
)
model.eval()

prompt = "Give 3 bullets on § 1201 anticircumvention. End with [pp. 314–317]."
inputs = tok(prompt, return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=180, do_sample=False)

# The decoded sequence starts with the prompt tokens, so the prompt is echoed.
decoded_text = tok.decode(out[0], skip_special_tokens=True)
print(decoded_text)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Give 3 bullets on § 1201 anticircumvention. End with [pp. 314–317]. The anticyclus was designed to protect intellectual property rights by prohibiting counterfeiting and infringement actions against domestic products that comply with relevant laws and regulations.
The anticyclus has undergone several amendments, including the Berne Convention Implementation Act (1988), the Architectural Works Copyright Protection Act (1990), the Anticounterfeiting Consumer Product Act (1996), the Online Copyright Infringement Liability Limitation Act (2000), the Online Illegal Activities Elimination Act (2005), and the Prioritizing Resources to Combat Counterfeit Goods Act of 2008 (2008).


### **Pinning a small README next to the adapter**

In [14]:
# Write a small provenance note into OUT: the base model path, a creation
# timestamp, and the training command with the hyperparameters used.
from datetime import datetime

readme = OUT / "README.txt"

# Render the text first, then write it out in a single call.
readme_text = f"""Title 17 LoRA (CPU, PEFT)
Model: {MODEL_ID}
Created: {datetime.now().isoformat()}
Cmd: cpu_lora_hf.py --model_id "{MODEL_ID}" --train_jsonl "{ALPACA}" --out_dir "{OUT}"
     --max_steps {MAX_STEPS} --batch_size {BATCH_SIZE} --grad_accum {GRAD_ACCUM}
     --lr {LR} --max_seq_len {MAX_SEQ_LEN} --num_threads {NUM_THREADS}
     --lora_r {LORA_R} --lora_alpha {LORA_ALPHA} --lora_dropout {LORA_DROPOUT} --seed {SEED}
"""
readme.write_text(readme_text, encoding="utf-8")

print(f"Wrote {readme}")

Wrote D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\lora_hf\title17\README.txt
