# Parse Gemini-generated ground truths

This notebook takes the raw responses generated by the Gemini model and parses them into a CSV file in the same format as the one used for training.

In [4]:
import csv
import re

# Location of the raw Gemini responses and of the CSV produced from them
input_path = "outputs/ground_truths/gemini_generated_ground_truths.txt"
output_path = "outputs/ground_truths/parsed_ground_truths.csv"


# CSV columns, keyed by the 1-based item number each answer carries in a
# response ("1. ...", "2. ...", ...). The tuple order defines the numbering.
columns = dict(
    enumerate(
        (
            "Number of vehicles in accident",
            "Accident Type",
            "Person Injury?",
            "Need for ambulance?",
            "Need for firetruck?",
            "Need for Police?",
            "Types of vehicles involved",
            "Fire?",
            "Weather",
            "Low Res/Bad Footage?",
        ),
        start=1,
    )
)
# Header row: the video filename first, then the answers in item order
csv_headers = ["filename", *columns.values()]

def split_responses(content):
    """Split the raw model output into ``(filename, body)`` pairs.

    Each response is introduced by a marker of the form
    ``Response for <digits>.mp4:``. Any text before the first marker is
    discarded.

    Args:
        content: Full text of the responses file.

    Returns:
        A list of ``(filename, body)`` tuples in file order, where ``body``
        is the text between one marker and the next (or end of input).
    """
    # re.split with a capture group yields
    # [preamble, name1, body1, name2, body2, ...]; drop the preamble.
    chunks = re.split(r"Response for (\d+\.mp4):", content)[1:]
    return list(zip(chunks[::2], chunks[1::2]))


def parse_response(body, columns):
    """Extract the numbered answers from one response body.

    Matches lines like ``1. **Label:** Value`` or ``1. Label: Value`` and
    keeps only the value part, with Markdown bold markers removed.

    Args:
        body: Text of a single response.
        columns: Mapping of item number -> CSV column name.

    Returns:
        A dict of column name -> answer string; an item that is not found
        maps to the empty string.
    """
    fields = {}
    for number, header in columns.items():
        # (?<!\d) stops item 1 from matching inside "11." or "21.".
        pattern = rf"(?<!\d){number}\.\s+(?:\*\*)?.*?(?:\*\*)?:\s*(.+)"
        match = re.search(pattern, body, re.IGNORECASE)
        value = match.group(1) if match else ""
        # Remove the bold markers BEFORE stripping: with "**Label:** Value"
        # the capture is "** Value", and stripping first would leave a
        # leading space behind once the asterisks are gone.
        fields[header] = value.replace("**", "").strip()
    return fields


# A notebook kernel (or `python file.py`) runs with __name__ == "__main__",
# so the cell still executes; the guard only keeps the file I/O from firing
# when the helpers above are imported elsewhere.
if __name__ == "__main__":
    # Read file content
    with open(input_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Split into chunks: filename followed by response
    entries = split_responses(content)

    parsed_rows = [
        {"filename": filename.strip(), **parse_response(body, columns)}
        for filename, body in entries
    ]

    # Write to CSV
    with open(output_path, "w", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()
        writer.writerows(parsed_rows)

    print(f"✅ Successfully parsed {len(parsed_rows)} responses to '{output_path}'")

✅ Successfully parsed 205 responses to 'outputs/ground_truths/parsed_ground_truths.csv'
