Skip to content

Commit 30792a4

Browse files
committed
added check for duplicate headers
1 parent 25b7e68 commit 30792a4

File tree

1 file changed

+47
-17
lines changed
  • backend/app/crud/evaluations

1 file changed

+47
-17
lines changed

backend/app/crud/evaluations/core.py

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -56,28 +56,48 @@ def upload_dataset_to_langfuse(
5656
)
5757

5858
try:
59-
# Get Langfuse client
60-
try:
61-
langfuse = get_langfuse_client(
62-
session=_session,
63-
org_id=_current_user.organization_id,
64-
project_id=_current_user.project_id,
65-
)
66-
except HTTPException as http_exc:
67-
return False, None, http_exc.detail
68-
69-
# Parse CSV content
59+
# Parse CSV content first (fail fast on invalid CSV)
7060
csv_text = csv_content.decode("utf-8")
7161
csv_reader = csv.DictReader(io.StringIO(csv_text))
7262

73-
# Normalize headers: strip whitespace and lowercase for flexible matching
74-
if csv_reader.fieldnames:
75-
clean_headers = {
76-
field.strip().lower(): field for field in csv_reader.fieldnames
77-
}
78-
else:
63+
# Validate that CSV has headers
64+
if not csv_reader.fieldnames:
7965
return False, None, "CSV file has no headers"
8066

67+
# Normalize headers and detect duplicates in a single pass
68+
# Build mapping of normalized name -> list of original headers
69+
normalized_to_originals = {}
70+
for field in csv_reader.fieldnames:
71+
normalized = field.strip().lower()
72+
if normalized not in normalized_to_originals:
73+
normalized_to_originals[normalized] = []
74+
normalized_to_originals[normalized].append(field)
75+
76+
# Check for duplicate normalized headers
77+
duplicates = {
78+
norm: originals
79+
for norm, originals in normalized_to_originals.items()
80+
if len(originals) > 1
81+
}
82+
83+
if duplicates:
84+
# Build clear error message showing which headers conflict
85+
duplicate_groups = [
86+
f"{originals} (all normalize to '{norm}')"
87+
for norm, originals in duplicates.items()
88+
]
89+
return (
90+
False,
91+
None,
92+
f"CSV contains duplicate columns after normalization: {'; '.join(duplicate_groups)}. "
93+
"Please ensure all column names are unique (case-insensitive).",
94+
)
95+
96+
# Use the normalized headers for validation
97+
clean_headers = {
98+
norm: originals[0] for norm, originals in normalized_to_originals.items()
99+
}
100+
81101
# Validate CSV headers using normalized names
82102
if "question" not in clean_headers or "answer" not in clean_headers:
83103
return (
@@ -87,6 +107,16 @@ def upload_dataset_to_langfuse(
87107
f"Found columns: {csv_reader.fieldnames}",
88108
)
89109

110+
# Get Langfuse client (after CSV validation to fail fast)
111+
try:
112+
langfuse = get_langfuse_client(
113+
session=_session,
114+
org_id=_current_user.organization_id,
115+
project_id=_current_user.project_id,
116+
)
117+
except HTTPException as http_exc:
118+
return False, None, http_exc.detail
119+
90120
# Get original field names for question and answer
91121
golden_question = clean_headers["question"]
92122
golden_answer = clean_headers["answer"]

0 commit comments

Comments
 (0)