Skip to content

Commit de99b9c

Browse files
security: harden error handling, token exposure, URL ingest, and deps
1 parent 4f858bc commit de99b9c

18 files changed

+252
-62
lines changed

backend/app/chunking.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,8 @@ def normalize(text: str) -> str:
1414
text = text.replace("\u00a0", " ")
1515
text = text.replace("\r\n", "\n").replace("\r", "\n")
1616
text = re.sub(r"[ \t]+", " ", text)
17-
text = re.sub(r"[ \t]+\n", "\n", text)
17+
# Trim trailing horizontal whitespace per line without regex backtracking risk.
18+
text = "\n".join(line.rstrip(" \t") for line in text.split("\n"))
1819
text = re.sub(r"\n{3,}", "\n\n", text)
1920
return text.strip()
2021

backend/app/ingest.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -168,6 +168,10 @@ def _validate_remote_url_sync(url: str) -> None:
168168
raise RuntimeError("Only http/https URLs are allowed.")
169169
if not parsed.hostname:
170170
raise RuntimeError("URL must include a hostname.")
171+
if parsed.username or parsed.password:
172+
raise RuntimeError("URLs with embedded credentials are not allowed.")
173+
if parsed.port and parsed.port not in (80, 443):
174+
raise RuntimeError("Only standard ports 80/443 are allowed for URL ingest.")
171175
if _is_blocked_host_label(parsed.hostname):
172176
raise RuntimeError("Local hostnames are blocked for URL ingest.")
173177

@@ -373,6 +377,7 @@ async def _download_html_with_diagnostics(url: str) -> Tuple[str, Dict[str, Any]
373377

374378
async with httpx.AsyncClient(follow_redirects=False, timeout=URL_TIMEOUT) as client:
375379
while True:
380+
current_host = (urlparse(current_url).hostname or "").strip().lower().rstrip(".")
376381
if current_url in seen_urls:
377382
raise RuntimeError("Redirect loop detected.")
378383
seen_urls.add(current_url)
@@ -399,7 +404,11 @@ async def _download_html_with_diagnostics(url: str) -> Tuple[str, Dict[str, Any]
399404
if _is_redirect_status(response.status_code):
400405
if redirects >= MAX_URL_REDIRECTS:
401406
raise RuntimeError(f"Too many redirects (>{MAX_URL_REDIRECTS}).")
402-
current_url = _resolve_redirect_url(str(response.url), response.headers.get("location", ""))
407+
next_url = _resolve_redirect_url(str(response.url), response.headers.get("location", ""))
408+
next_host = (urlparse(next_url).hostname or "").strip().lower().rstrip(".")
409+
if next_host != current_host:
410+
raise RuntimeError("Cross-host redirects are blocked for URL ingest.")
411+
current_url = next_url
403412
redirects += 1
404413
diagnostics["redirect_count"] = redirects
405414
diagnostics["final_url"] = current_url

backend/flashcard_store.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
import secrets
77
import threading
88
from datetime import datetime, timedelta, timezone
9+
from pathlib import Path
910
from typing import Any, Dict, List, Optional
1011

1112
from app.atomic_io import atomic_write_json, path_lock
@@ -20,9 +21,19 @@ def _safe_owner(owner: str) -> str:
2021
return owner[:128]
2122

2223

24+
def _resolve_under_base(base_dir: str, name: str) -> str:
25+
base = Path(base_dir).resolve()
26+
candidate = (base / name).resolve()
27+
try:
28+
candidate.relative_to(base)
29+
except ValueError as e:
30+
raise ValueError("path_outside_base_dir") from e
31+
return str(candidate)
32+
33+
2334
def _path(owner: str, base_dir: str = DEFAULT_DIR) -> str:
2435
os.makedirs(base_dir, exist_ok=True)
25-
return os.path.join(base_dir, f"{_safe_owner(owner)}.json")
36+
return _resolve_under_base(base_dir, f"{_safe_owner(owner)}.json")
2637

2738

2839
def _now_dt() -> datetime:

backend/mcp_server.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -191,11 +191,12 @@ async def sift_generate(
191191
out = await anyio.to_thread.run_sync(lambda: generate_with_claude(prompt, model=model or "claude-3-5-sonnet-latest"))
192192
else:
193193
out = await anyio.to_thread.run_sync(lambda: generate_with_openai(prompt, model=model or "gpt-4.1-mini"))
194-
except Exception as e:
194+
except Exception:
195195
logger.exception("mcp_sift_generate_failed owner=%s provider=%s", owner, provider)
196196
return {
197197
"ok": False,
198-
"error": str(e),
198+
"error": "generation_failed",
199+
"message": "Generation failed for the selected provider.",
199200
"hint": "If you're using Codex, call `search` and let Codex generate the final response.",
200201
"sources": res["results"],
201202
}

backend/quiz_store.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import re
66
import secrets
77
import threading
8+
from pathlib import Path
89
from typing import Any, Dict, List
910

1011
from app.atomic_io import atomic_write_json, path_lock
@@ -19,9 +20,19 @@ def _safe_owner(owner: str) -> str:
1920
return owner[:128]
2021

2122

23+
def _resolve_under_base(base_dir: str, name: str) -> str:
24+
base = Path(base_dir).resolve()
25+
candidate = (base / name).resolve()
26+
try:
27+
candidate.relative_to(base)
28+
except ValueError as e:
29+
raise ValueError("path_outside_base_dir") from e
30+
return str(candidate)
31+
32+
2233
def _path(owner: str, base_dir: str = DEFAULT_DIR) -> str:
2334
os.makedirs(base_dir, exist_ok=True)
24-
return os.path.join(base_dir, f"{_safe_owner(owner)}.json")
35+
return _resolve_under_base(base_dir, f"{_safe_owner(owner)}.json")
2536

2637

2738
def load_attempts(owner: str, base_dir: str = DEFAULT_DIR) -> List[Dict[str, Any]]:

backend/requirements.txt

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2,19 +2,19 @@ fastapi==0.115.8
22
uvicorn[standard]==0.30.6
33
pydantic>=2.10.1
44
pydantic-settings>=2.6.1
5-
python-multipart==0.0.9
5+
python-multipart==0.0.22
66

77
openai==1.61.0
88

99
chromadb==0.5.5
1010

1111
beautifulsoup4==4.12.3
1212
httpx==0.27.0
13-
pypdf==4.3.1
13+
pypdf==6.7.3
1414
tiktoken==0.12.0
15-
mcp==1.2.0
15+
mcp==1.23.0
1616
anthropic==0.45.2
17-
jinja2==3.1.4
17+
jinja2==3.1.6
1818
pillow>=10.4.0
1919
pytesseract>=0.3.10
2020
pdf2image>=1.17.0

backend/session_store.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import os
55
import re
66
import threading
7+
from pathlib import Path
78
from typing import Any, Dict, List
89

910
from app.atomic_io import atomic_write_json, path_lock
@@ -18,9 +19,19 @@ def _safe_owner(owner: str) -> str:
1819
return owner[:128]
1920

2021

22+
def _resolve_under_base(base_dir: str, name: str) -> str:
23+
base = Path(base_dir).resolve()
24+
candidate = (base / name).resolve()
25+
try:
26+
candidate.relative_to(base)
27+
except ValueError as e:
28+
raise ValueError("path_outside_base_dir") from e
29+
return str(candidate)
30+
31+
2132
def session_path(owner: str, base_dir: str = DEFAULT_DIR) -> str:
2233
os.makedirs(base_dir, exist_ok=True)
23-
return os.path.join(base_dir, f"{_safe_owner(owner)}.json")
34+
return _resolve_under_base(base_dir, f"{_safe_owner(owner)}.json")
2435

2536

2637
def load_session(owner: str, base_dir: str = DEFAULT_DIR) -> List[Dict[str, Any]]:

backend/source_store.py

Lines changed: 28 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import secrets
66
import threading
77
from glob import glob
8+
from pathlib import Path
89
from typing import Any, Dict, List, Optional
910

1011
from app.atomic_io import atomic_write_json, path_lock
@@ -19,13 +20,23 @@ def _safe_owner(owner: str) -> str:
1920
return owner[:128]
2021

2122

23+
def _resolve_under_base(base_dir: str, *parts: str) -> str:
24+
base = Path(base_dir).resolve()
25+
candidate = (base.joinpath(*parts)).resolve()
26+
try:
27+
candidate.relative_to(base)
28+
except ValueError as e:
29+
raise ValueError("path_outside_base_dir") from e
30+
return str(candidate)
31+
32+
2233
def _owner_manifest_path(owner: str, base_dir: str = DEFAULT_DIR) -> str:
2334
os.makedirs(base_dir, exist_ok=True)
24-
return os.path.join(base_dir, f"{_safe_owner(owner)}.json")
35+
return _resolve_under_base(base_dir, f"{_safe_owner(owner)}.json")
2536

2637

2738
def _owner_files_dir(owner: str, base_dir: str = DEFAULT_DIR) -> str:
28-
path = os.path.join(base_dir, "_files", _safe_owner(owner))
39+
path = _resolve_under_base(base_dir, "_files", _safe_owner(owner))
2940
os.makedirs(path, exist_ok=True)
3041
return path
3142

@@ -185,7 +196,8 @@ def batch_mutate_items(
185196

186197
def write_text_blob(owner: str, source_id: str, text: str, base_dir: str = DEFAULT_DIR) -> str:
187198
files_dir = _owner_files_dir(owner, base_dir)
188-
path = os.path.join(files_dir, f"{source_id}.txt")
199+
safe_source_id = re.sub(r"[^a-zA-Z0-9._-]+", "_", (source_id or "").strip())[:120] or "source"
200+
path = _resolve_under_base(files_dir, f"{safe_source_id}.txt")
189201
with _LOCK:
190202
parent = os.path.dirname(path)
191203
if parent:
@@ -195,18 +207,21 @@ def write_text_blob(owner: str, source_id: str, text: str, base_dir: str = DEFAU
195207
return path
196208

197209

198-
def read_text_blob(path: str) -> str:
210+
def read_text_blob(path: str, base_dir: str = DEFAULT_DIR) -> str:
199211
try:
200-
with open(path, "r", encoding="utf-8") as f:
212+
resolved = Path(path or "").resolve()
213+
resolved.relative_to(Path(base_dir).resolve())
214+
with open(resolved, "r", encoding="utf-8") as f:
201215
return f.read()
202216
except Exception:
203217
return ""
204218

205219

206220
def write_binary_blob(owner: str, source_id: str, filename: str, data: bytes, base_dir: str = DEFAULT_DIR) -> str:
207221
files_dir = _owner_files_dir(owner, base_dir)
222+
safe_source_id = re.sub(r"[^a-zA-Z0-9._-]+", "_", (source_id or "").strip())[:120] or "source"
208223
safe = re.sub(r"[^a-zA-Z0-9._-]+", "_", (filename or "upload").strip())[:180] or "upload"
209-
path = os.path.join(files_dir, f"{source_id}__{safe}")
224+
path = _resolve_under_base(files_dir, f"{safe_source_id}__{safe}")
210225
with _LOCK:
211226
parent = os.path.dirname(path)
212227
if parent:
@@ -216,9 +231,13 @@ def write_binary_blob(owner: str, source_id: str, filename: str, data: bytes, ba
216231
return path
217232

218233

219-
def remove_file(path: str) -> None:
234+
def remove_file(path: str, base_dir: str = DEFAULT_DIR) -> None:
220235
try:
221-
if path and os.path.exists(path):
222-
os.remove(path)
236+
if not path:
237+
return
238+
resolved = Path(path).resolve()
239+
resolved.relative_to(Path(base_dir).resolve())
240+
if os.path.exists(resolved):
241+
os.remove(resolved)
223242
except Exception:
224243
pass

backend/study_store.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import re
66
import secrets
77
import threading
8+
from pathlib import Path
89
from typing import Any, Dict, List, Optional
910

1011
from app.atomic_io import atomic_write_json, path_lock
@@ -19,9 +20,19 @@ def _safe_owner(owner: str) -> str:
1920
return owner[:128]
2021

2122

23+
def _resolve_under_base(base_dir: str, name: str) -> str:
24+
base = Path(base_dir).resolve()
25+
candidate = (base / name).resolve()
26+
try:
27+
candidate.relative_to(base)
28+
except ValueError as e:
29+
raise ValueError("path_outside_base_dir") from e
30+
return str(candidate)
31+
32+
2233
def library_path(owner: str, base_dir: str = DEFAULT_DIR) -> str:
2334
os.makedirs(base_dir, exist_ok=True)
24-
return os.path.join(base_dir, f"{_safe_owner(owner)}.json")
35+
return _resolve_under_base(base_dir, f"{_safe_owner(owner)}.json")
2536

2637

2738
def load_library(owner: str, base_dir: str = DEFAULT_DIR) -> List[Dict[str, Any]]:

backend/templates/chat.html

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -849,7 +849,11 @@
849849
const res = await fetch(u);
850850
if (!(await ensureAuth(res))) return;
851851
if (!res.ok) {
852-
librarySelectListEl.innerHTML = `<div class="hint">Failed to load library sources.</div>`;
852+
librarySelectListEl.innerHTML = "";
853+
const hint = document.createElement("div");
854+
hint.className = "hint";
855+
hint.textContent = "Failed to load library sources.";
856+
librarySelectListEl.appendChild(hint);
853857
return;
854858
}
855859
const data = await res.json();

0 commit comments

Comments
 (0)