In [None]:
import re
import csv

# Locations of the CSV to be repaired and of the two files this script writes:
# one for rows that pass validation, one for rows that do not.
input_path = 'HTS 2012 - Master db - Trip cleaned with broken rows fixed.csv'
clean_output_path = 'HTS 2012 - Master db - Trip cleaned with broken rows fixed cleaned again.csv'
broken_output_path = 'HTS 2012 - Master db - Trip cleaned with broken rows fixed broken again.csv'

# Step 1: Pre-clean — load the whole file as one string so that newlines
# inside quoted fields can be removed before parsing.
# 'utf-8-sig' drops a leading BOM if one is present; errors='replace' turns
# undecodable bytes into U+FFFD instead of aborting the read.
with open(input_path, mode='r', encoding='utf-8-sig', errors='replace') as source_file:
    raw_data = source_file.read()

def remove_newlines_in_quotes(text):
    """Replace line breaks inside double-quoted CSV fields with a single space.

    The text is scanned by quote parity: splitting on '"' yields segments
    that alternate between "outside quotes" (even indices) and "inside
    quotes" (odd indices). An escaped quote ("") produces an empty
    outside segment, so it does not disturb the alternation.

    Only segments enclosed by both an opening and a closing quote are
    modified. Line breaks between records are never touched, even when one
    record ends with a quoted field and the next record starts with one.

    Args:
        text: Raw CSV content as a single string.

    Returns:
        A copy of ``text`` in which every run of ``\\r`` / ``\\n`` characters
        located inside a quoted field has been collapsed to one space.
    """
    segments = text.split('"')

    # Odd-indexed segments lie inside quotes. The final segment is excluded:
    # with an odd number of quotes it belongs to an unterminated quote, and
    # rewriting it would glue together everything up to the end of the input.
    for index in range(1, len(segments) - 1, 2):
        segment = segments[index]
        if '\n' in segment or '\r' in segment:
            segments[index] = re.sub(r'[\r\n]+', ' ', segment)

    return '"'.join(segments)

# Flatten line breaks that sit inside quoted fields before parsing.
cleaned_data = remove_newlines_in_quotes(raw_data)

# Step 2: Re-parse the cleaned content with csv.reader and route every record
# to either the "clean" or the "broken" output file.
from io import StringIO

clean_count = 0
broken_count = 0
unmatched_quote_rows = []  # record numbers whose parsed fields hold an odd number of '"'

with open(clean_output_path, 'w', newline='', encoding='utf-8') as cleanfile, \
     open(broken_output_path, 'w', newline='', encoding='utf-8') as badfile:

    clean_writer = csv.writer(cleanfile, quoting=csv.QUOTE_MINIMAL)
    bad_writer = csv.writer(badfile, quoting=csv.QUOTE_MINIMAL)

    # StringIO lets csv.reader consume the in-memory text like a file.
    reader = csv.reader(StringIO(cleaned_data))

    # The first record is the header; its width is the reference column count.
    header = next(reader)
    expected_columns = len(header)
    clean_writer.writerow(header)
    # The broken file gets one extra leading column holding the record number.
    bad_writer.writerow(['__SOURCE_LINE__'] + header)

    # Numbering starts at 2 because the header is record 1. The number counts
    # parsed records of the cleaned text, so it need not equal the physical
    # line number in the original file.
    for record_no, fields in enumerate(reader, start=2):
        # Quote characters that survive parsing are literal quotes in the
        # data; an odd total is treated as a sign of damaged quoting.
        odd_quotes = sum(value.count('"') for value in fields) % 2 != 0

        if odd_quotes or len(fields) != expected_columns:
            bad_writer.writerow([record_no] + fields)
            broken_count += 1
            if odd_quotes:
                unmatched_quote_rows.append(record_no)
            continue

        clean_writer.writerow(fields)
        clean_count += 1

# Final reporting: output locations, row tallies and the reference column count.
print(
    f"✅ Clean file written: {clean_output_path}",
    f"🧾 Broken rows saved to: {broken_output_path}",
    f"🟢 Clean rows: {clean_count}",
    f"🔴 Broken rows: {broken_count}",
    f"📌 Expected column count from header: {expected_columns}",
    sep="\n",
)

# Show at most the first five offending record numbers, then a count of the rest.
if unmatched_quote_rows:
    print(f"⚠️ Rows with unmatched quotes: {unmatched_quote_rows[:5]}")
if len(unmatched_quote_rows) > 5:
    print(f"...and {len(unmatched_quote_rows) - 5} more")
