@@ -56,28 +56,48 @@ def upload_dataset_to_langfuse(
5656 )
5757
5858 try :
59- # Get Langfuse client
60- try :
61- langfuse = get_langfuse_client (
62- session = _session ,
63- org_id = _current_user .organization_id ,
64- project_id = _current_user .project_id ,
65- )
66- except HTTPException as http_exc :
67- return False , None , http_exc .detail
68-
69- # Parse CSV content
59+ # Parse CSV content first (fail fast on invalid CSV)
7060 csv_text = csv_content .decode ("utf-8" )
7161 csv_reader = csv .DictReader (io .StringIO (csv_text ))
7262
73- # Normalize headers: strip whitespace and lowercase for flexible matching
74- if csv_reader .fieldnames :
75- clean_headers = {
76- field .strip ().lower (): field for field in csv_reader .fieldnames
77- }
78- else :
63+ # Validate that CSV has headers
64+ if not csv_reader .fieldnames :
7965 return False , None , "CSV file has no headers"
8066
67+ # Normalize headers and detect duplicates in a single pass
68+ # Build mapping of normalized name -> list of original headers
69+ normalized_to_originals = {}
70+ for field in csv_reader .fieldnames :
71+ normalized = field .strip ().lower ()
72+ if normalized not in normalized_to_originals :
73+ normalized_to_originals [normalized ] = []
74+ normalized_to_originals [normalized ].append (field )
75+
76+ # Check for duplicate normalized headers
77+ duplicates = {
78+ norm : originals
79+ for norm , originals in normalized_to_originals .items ()
80+ if len (originals ) > 1
81+ }
82+
83+ if duplicates :
84+ # Build clear error message showing which headers conflict
85+ duplicate_groups = [
86+ f"{ originals } (all normalize to '{ norm } ')"
87+ for norm , originals in duplicates .items ()
88+ ]
89+ return (
90+ False ,
91+ None ,
92+ f"CSV contains duplicate columns after normalization: { '; ' .join (duplicate_groups )} . "
93+ "Please ensure all column names are unique (case-insensitive)." ,
94+ )
95+
96+ # Use the normalized headers for validation
97+ clean_headers = {
98+ norm : originals [0 ] for norm , originals in normalized_to_originals .items ()
99+ }
100+
81101 # Validate CSV headers using normalized names
82102 if "question" not in clean_headers or "answer" not in clean_headers :
83103 return (
@@ -87,6 +107,16 @@ def upload_dataset_to_langfuse(
87107 f"Found columns: { csv_reader .fieldnames } " ,
88108 )
89109
110+ # Get Langfuse client (after CSV validation to fail fast)
111+ try :
112+ langfuse = get_langfuse_client (
113+ session = _session ,
114+ org_id = _current_user .organization_id ,
115+ project_id = _current_user .project_id ,
116+ )
117+ except HTTPException as http_exc :
118+ return False , None , http_exc .detail
119+
90120 # Get original field names for question and answer
91121 golden_question = clean_headers ["question" ]
92122 golden_answer = clean_headers ["answer" ]
0 commit comments