In [45]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import os

In [76]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

def parse(xml_file, output_file='output.csv'):
    """Flatten the <topic>/<results>/<result> tree of one XML file into CSV rows.

    Every <result> found under a <topic> becomes one row with the columns
    topic_id, result_id, rel, text and time. The rows are appended to
    ``output_file``; the header line is written only when that file does not
    exist yet (or is empty), so repeated calls build up a single CSV.

    Parameters
    ----------
    xml_file : str
        Path of the XML document to read.
    output_file : str, default 'output.csv'
        CSV file the rows are appended to.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If ``xml_file`` is not well-formed XML.
    """
    columns = ['topic_id', 'result_id', 'rel', 'text', 'time']

    def _child_text(parent, tag, default=''):
        # One lookup per tag; `default` is used only when the element is absent.
        node = parent.find(tag)
        return node.text if node is not None else default

    root = ET.parse(xml_file).getroot()

    rows = []
    for topic in root.findall('topic'):
        topic_id = _child_text(topic, 'id')

        # A topic without a <results> container contributes no rows instead of
        # raising AttributeError on None.
        results = topic.find('results')
        result_nodes = results.findall('result') if results is not None else []

        for result in result_nodes:
            rows.append({
                'topic_id': topic_id,
                'result_id': _child_text(result, 'id'),
                'rel': _child_text(result, 'rel'),
                'text': _child_text(result, 'text'),
                'time': _child_text(result, 'time', default='BLNK'),
            })
        print(f"Found {len(result_nodes)} results")

    # Fixing the column list keeps the header correct even when `rows` is
    # empty; a column-less frame would otherwise create the file without a
    # usable header and every later append would skip it.
    df = pd.DataFrame(rows, columns=columns)
    needs_header = (
        not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    )
    df.to_csv(output_file, mode='a', index=False, header=needs_header)

# --- Apply to every XML file in the directory ---
data_path = "data"
output_file = "output.csv"

# parse() appends, so start from a clean file to avoid duplicating rows
# when this cell is re-run.
if os.path.exists(output_file):
    os.remove(output_file)

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the combined CSV reproducible across runs and machines.
for filename in sorted(os.listdir(data_path)):
    if filename.endswith(".xml"):
        filepath = os.path.join(data_path, filename)
        print(f"Parsing {filename}...")
        parse(filepath, output_file=output_file)

print(f"✅ All files parsed and written to {output_file}")


Parsing 1904-election.xml...
Found 15 results
Parsing aviation.xml...
Found 15 results
Parsing barton.xml...
Found 15 results
Parsing boston-subway.xml...
Found 15 results
Parsing building-the-titanic.xml...
Found 14 results
Parsing cassius-clay.xml...
Found 15 results
Parsing chinese-exclusion-act.xml...
Found 12 results
Parsing combined.xml...
Found 15 results
Found 15 results
Parsing curie.xml...
Found 15 results
Parsing darwin-theory-of-evolution.xml...
Found 15 results
Parsing eiffel-tower.xml...
Found 9 results
Parsing electric-chair.xml...
Found 12 results
Parsing ellis.xml...
Found 13 results
Parsing female-pilots.xml...
Found 11 results
Parsing houdini.xml...
Found 15 results
Parsing league-of-nations.xml...
Found 13 results
Parsing lindenberg.xml...
Found 15 results
Parsing los-angeles-times-bombing.xml...
Found 15 results
Parsing mona-lisa.xml...
Found 12 results
Parsing mothers-day.xml...
Found 9 results
Parsing motorcycle-mania.xml...
Found 9 results
Parsing ouija-board.xm

In [89]:
# Load the combined CSV into a DataFrame for the cleaning steps.
df = pd.read_csv(filepath_or_buffer='output.csv')

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,topic_id,result_id,rel,text,time
0,https://guides.loc.gov/chronicling-america-190...,\n https://www.loc.gov/resource/sn870...,1,\n REPUBLICAN CONVENTION\n G...,"June 21, 1904 :\tThe Republican National Conve..."
1,https://guides.loc.gov/chronicling-america-190...,\n https://www.loc.gov/resource/sn990...,1,\n NATIONAL\n REPUBLICAN\n ...,"June 21, 1904 :\tThe Republican National Conve..."
2,https://guides.loc.gov/chronicling-america-190...,\n https://www.loc.gov/resource/sn850...,1,\n Convention Hall Chicago Juno 23\n ...,"June 21, 1904 :\tThe Republican National Conve..."
3,https://guides.loc.gov/chronicling-america-190...,\n https://www.loc.gov/resource/sn990...,1,\n ROOSEVELT AND FAIRBANKS\n ...,"June 23, 1904 :\tTheodore Roosevelt is nominat..."
4,https://guides.loc.gov/chronicling-america-190...,\n https://www.loc.gov/resource/sn850...,1,\n ROOSEVELT WILL LEAD PARTY TO VICTO...,"June 23, 1904\t: Charles Fairbanks is named a..."
...,...,...,...,...,...
434,\n https://guides.loc.gov/chronicling-ameri...,\n https://chroniclingamerica.loc.g...,1,\n A PAGE HOW YOUNG GIRLS STUDY TH...,No timeline found
435,\n https://guides.loc.gov/chronicling-ameri...,\n https://www.loc.gov/resource/sn8...,1,\n POsition Is Everything in Life t...,No timeline found
436,\n https://guides.loc.gov/chronicling-ameri...,\n https://www.loc.gov/resource/sn8...,1,\n ¿Qué Es El Yoga? Yoga es una pal...,No timeline found
437,\n https://guides.loc.gov/chronicling-ameri...,\n https://www.loc.gov/resource/sn8...,1,\n Con la Práctica del Yoga se Pued...,No timeline found


In [90]:
# Collapse every placeholder spelling, and real NaN values, in `time`
# into the single marker 'MISSING'.
to_replace = ['BLNNK', 'No timeline found', 'BLNK', 'nan']
df['time'] = (
    df['time']
    .replace(to_replace, 'MISSING')
    .fillna('MISSING')
)

In [91]:
import pandas as pd

# Trim first: a leading newline or indentation would stop `^(.*?)` from ever
# reaching the separator, because `.` does not match a newline.
time_text = df['time'].str.strip()

# First: "<date> : <event>" - colon with optional surrounding whitespace/tab.
colon_parts = time_text.str.extract(r'^(.*?)\s*:\s*\t?(.*)$')
df['date'] = colon_parts[0]
df['event'] = colon_parts[1]

# For rows without a colon, fall back to "<date>\t<event>".
# The pieces are assigned one column at a time as plain arrays: assigning a
# DataFrame through .loc aligns on column labels, and the labels 0/1 never
# match 'date'/'event', which would silently write NaN into both columns.
mask = df['event'].isna()
tab_parts = time_text[mask].str.extract(r'^(.*?)\t(.*)$')
df.loc[mask, 'date'] = tab_parts[0].to_numpy()
df.loc[mask, 'event'] = tab_parts[1].to_numpy()

# Fallback: if still nothing, put full text in event and mark date as MISSING
df['event'] = df['event'].fillna(df['time'])
df['date'] = df['date'].fillna('MISSING')

# Clean up whitespace left around the captured pieces
df['date'] = df['date'].str.strip()
df['event'] = df['event'].str.strip()

df

Unnamed: 0,topic_id,result_id,rel,text,time,date,event
0,https://guides.loc.gov/chronicling-america-190...,\n https://www.loc.gov/resource/sn870...,1,\n REPUBLICAN CONVENTION\n G...,"June 21, 1904 :\tThe Republican National Conve...","June 21, 1904",The Republican National Convention begins in C...
1,https://guides.loc.gov/chronicling-america-190...,\n https://www.loc.gov/resource/sn990...,1,\n NATIONAL\n REPUBLICAN\n ...,"June 21, 1904 :\tThe Republican National Conve...","June 21, 1904",The Republican National Convention begins in C...
2,https://guides.loc.gov/chronicling-america-190...,\n https://www.loc.gov/resource/sn850...,1,\n Convention Hall Chicago Juno 23\n ...,"June 21, 1904 :\tThe Republican National Conve...","June 21, 1904",The Republican National Convention begins in C...
3,https://guides.loc.gov/chronicling-america-190...,\n https://www.loc.gov/resource/sn990...,1,\n ROOSEVELT AND FAIRBANKS\n ...,"June 23, 1904 :\tTheodore Roosevelt is nominat...","June 23, 1904",Theodore Roosevelt is nominated for the Presid...
4,https://guides.loc.gov/chronicling-america-190...,\n https://www.loc.gov/resource/sn850...,1,\n ROOSEVELT WILL LEAD PARTY TO VICTO...,"June 23, 1904\t: Charles Fairbanks is named a...","June 23, 1904",Charles Fairbanks is named as his Vice Preside...
...,...,...,...,...,...,...,...
434,\n https://guides.loc.gov/chronicling-ameri...,\n https://chroniclingamerica.loc.g...,1,\n A PAGE HOW YOUNG GIRLS STUDY TH...,MISSING,MISSING,MISSING
435,\n https://guides.loc.gov/chronicling-ameri...,\n https://www.loc.gov/resource/sn8...,1,\n POsition Is Everything in Life t...,MISSING,MISSING,MISSING
436,\n https://guides.loc.gov/chronicling-ameri...,\n https://www.loc.gov/resource/sn8...,1,\n ¿Qué Es El Yoga? Yoga es una pal...,MISSING,MISSING,MISSING
437,\n https://guides.loc.gov/chronicling-ameri...,\n https://www.loc.gov/resource/sn8...,1,\n Con la Práctica del Yoga se Pued...,MISSING,MISSING,MISSING


In [93]:
# Parse mixed-format dates one value at a time.
# A single vectorised pd.to_datetime(..., errors='coerce') call on pandas 2.x
# infers ONE format from the first parseable value and turns every value
# written in a different format into NaT, so each entry is parsed on its own.
def _to_iso_date(raw):
    """Return `raw` as 'YYYY-MM-DD', or 'MISSING' when it is not a parseable date."""
    if pd.isna(raw) or raw == 'MISSING':
        return 'MISSING'
    parsed = pd.to_datetime(raw, errors='coerce')
    if pd.isna(parsed):
        return 'MISSING'
    # Standard string format: YYYY-MM-DD
    return parsed.strftime('%Y-%m-%d')

df['date'] = df['date'].map(_to_iso_date)


In [95]:

# `time` has been split into `date` and `event`, so it is dropped before saving.
# errors='ignore' keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the column is already gone.
df = df.drop(columns=['time'], errors='ignore')
df.to_csv('cleaned_output.csv', index=False)
