In [96]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import os

In [99]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

def clean_ocr_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped. Falsy input (``None`` or an
    empty string) yields ``''``.
    """
    if text:
        # str.split() with no separator already splits on any whitespace run
        # (spaces, tabs, CR, LF), so re-joining the pieces normalises them all.
        return ' '.join(text.split())
    return ''

def parse(xml_file, output_file='output.csv'):
    """Flatten every ``<topic>/<results>/<result>`` of an XML file into CSV rows.

    One row is produced per ``<result>`` element, with the columns
    ``topic_id``, ``result_id``, ``rel``, ``text`` and ``time``. The rows are
    appended to ``output_file``; the header line is written only when that
    file does not exist yet or is still empty. A "Found N results" line is
    printed for every ``<topic>``.

    Args:
        xml_file: Path (or file object) of the XML document to read. Its root
            element is expected to contain ``<topic>`` children.
        output_file: Path of the CSV file to append to.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_file`` is not well-formed XML.
    """
    columns = ['topic_id', 'result_id', 'rel', 'text', 'time']

    def child_text(parent, tag, default='', clean=True):
        # Look the child up once; fall back to `default` when it is absent.
        node = parent.find(tag)
        if node is None:
            return default
        return clean_ocr_text(node.text) if clean else node.text

    root = ET.parse(xml_file).getroot()

    rows = []
    for topic in root.findall('topic'):
        topic_id = child_text(topic, 'id')

        # A topic without a <results> container contributes zero rows instead
        # of crashing with AttributeError on None.findall().
        results = topic.find('results')
        result_nodes = results.findall('result') if results is not None else []

        for result in result_nodes:
            rows.append({
                'topic_id': topic_id,
                'result_id': child_text(result, 'id'),
                # `rel` is stored verbatim (no whitespace normalisation).
                'rel': child_text(result, 'rel', clean=False),
                'text': child_text(result, 'text'),
                # 'BLNK' marks a result that has no <time> element at all.
                'time': child_text(result, 'time', default='BLNK'),
            })
        print(f"Found {len(result_nodes)} results")

    # Fixing the column list keeps the header correct even when `rows` is
    # empty; otherwise an empty frame would write a bogus first line and every
    # later append would skip the real header.
    df = pd.DataFrame(rows, columns=columns)
    write_header = (
        not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    )
    df.to_csv(output_file, mode='a', index=False, header=write_header)

# --- Apply to every XML file in the directory ---
data_path = "data"
output_file = "output.csv"

# parse() appends to the CSV, so remove any file left by a previous run;
# otherwise re-running this cell would duplicate every row.
if os.path.exists(output_file):
    os.remove(output_file)

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the resulting CSV reproducible across machines and runs.
# NOTE(review): the logged run shows "combined.xml" yielding two topics; if it
# repeats topics from the per-topic files, rows will be duplicated -- confirm.
for filename in sorted(os.listdir(data_path)):
    if filename.endswith(".xml"):
        filepath = os.path.join(data_path, filename)
        print(f"Parsing {filename}...")
        parse(filepath, output_file=output_file)

print("✅ All files parsed and written to output.csv")


Parsing 1904-election.xml...
Found 15 results
Parsing aviation.xml...
Found 15 results
Parsing barton.xml...
Found 15 results
Parsing boston-subway.xml...
Found 15 results
Parsing building-the-titanic.xml...
Found 14 results
Parsing cassius-clay.xml...
Found 15 results
Parsing chinese-exclusion-act.xml...
Found 12 results
Parsing combined.xml...
Found 15 results
Found 15 results
Parsing curie.xml...
Found 15 results
Parsing darwin-theory-of-evolution.xml...
Found 15 results
Parsing eiffel-tower.xml...
Found 9 results
Parsing electric-chair.xml...
Found 12 results
Parsing ellis.xml...
Found 13 results
Parsing female-pilots.xml...
Found 11 results
Parsing houdini.xml...
Found 15 results
Parsing league-of-nations.xml...
Found 13 results
Parsing lindenberg.xml...
Found 15 results
Parsing los-angeles-times-bombing.xml...
Found 15 results
Parsing mona-lisa.xml...
Found 12 results
Parsing mothers-day.xml...
Found 9 results
Parsing motorcycle-mania.xml...
Found 9 results
Parsing ouija-board.xm

In [129]:
# Load the flattened results from output.csv (the file the parsing cell writes).
df = pd.read_csv('output.csv')

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,topic_id,result_id,rel,text,time
0,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn87093407/1904-0...,1,REPUBLICAN CONVENTION GETS DOWN TO BUSINESS O....,"June 21, 1904 : The Republican National Conven..."
1,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn99063957/1904-0...,1,NATIONAL REPUBLICAN CONVENTION Fairbanks for V...,"June 21, 1904 : The Republican National Conven..."
2,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn85052116/1904-0...,1,Convention Hall Chicago Juno 23 This the third...,"June 21, 1904 : The Republican National Conven..."
3,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn99063957/1904-0...,1,ROOSEVELT AND FAIRBANKS Republican National Co...,"June 23, 1904 : Theodore Roosevelt is nominate..."
4,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn85066387/1904-0...,1,ROOSEVELT WILL LEAD PARTY TO VICTORY; HIS RUNN...,"June 23, 1904 : Charles Fairbanks is named as ..."
...,...,...,...,...,...
434,https://guides.loc.gov/chronicling-america-yoga,https://chroniclingamerica.loc.gov/lccn/sn8506...,1,A PAGE HOW YOUNG GIRLS STUDY THE FOR HINDOO ME...,No timeline found
435,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn83016209/1961-0...,1,POsition Is Everything in Life to Yogi Practit...,No timeline found
436,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn86002403/1951-1...,1,¿Qué Es El Yoga? Yoga es una palabra sánscrita...,No timeline found
437,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn82001257/1961-0...,1,Con la Práctica del Yoga se Puede Vivir Muchos...,No timeline found


In [130]:
# Placeholder strings that are all treated as "no timeline available".
to_replace = ['BLNNK', 'No timeline found', 'BLNK', 'nan']

# Map every placeholder, and any NaN already in the column, to one sentinel.
df['time'] = df['time'].replace(to_replace, 'MISSING').fillna('MISSING')

In [120]:
# Render the frame to check the `time` column after the sentinel replacement.
df

Unnamed: 0,topic_id,result_id,rel,text,time
0,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn87093407/1904-0...,1,REPUBLICAN CONVENTION GETS DOWN TO BUSINESS O....,"June 21, 1904 : The Republican National Conven..."
1,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn99063957/1904-0...,1,NATIONAL REPUBLICAN CONVENTION Fairbanks for V...,"June 21, 1904 : The Republican National Conven..."
2,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn85052116/1904-0...,1,Convention Hall Chicago Juno 23 This the third...,"June 21, 1904 : The Republican National Conven..."
3,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn99063957/1904-0...,1,ROOSEVELT AND FAIRBANKS Republican National Co...,"June 23, 1904 : Theodore Roosevelt is nominate..."
4,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn85066387/1904-0...,1,ROOSEVELT WILL LEAD PARTY TO VICTORY; HIS RUNN...,"June 23, 1904 : Charles Fairbanks is named as ..."
...,...,...,...,...,...
434,https://guides.loc.gov/chronicling-america-yoga,https://chroniclingamerica.loc.gov/lccn/sn8506...,1,A PAGE HOW YOUNG GIRLS STUDY THE FOR HINDOO ME...,MISSING
435,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn83016209/1961-0...,1,POsition Is Everything in Life to Yogi Practit...,MISSING
436,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn86002403/1951-1...,1,¿Qué Es El Yoga? Yoga es una palabra sánscrita...,MISSING
437,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn82001257/1961-0...,1,Con la Práctica del Yoga se Puede Vivir Muchos...,MISSING


In [131]:
# Group 1 is a leading date in one of three shapes: "<word> <1-2 digits>, <4 digits>",
# "<word><optional space><4 digits>", or a bare 4-digit year. Any colons,
# whitespace or hyphens that follow are skipped; group 2 is everything after.
pattern = r'^([A-Za-z]+\s+\d{1,2},\s*\d{4}|[A-Za-z]+\s*\d{4}|\d{4})[:\s\-]*([\s\S]*)$'

# str.extract returns one column per capture group (labelled 0 and 1); rows
# whose `time` does not match the pattern get NaN in both.
extracted = df['time'].str.extract(pattern)
df['date'] = extracted[0]
df['event'] = extracted[1]

In [134]:
# Write the current frame to trial.csv, omitting the index column.
df.to_csv('trial.csv', index=False)

In [136]:


# Clean up leading/trailing spaces
df['date'] = df['date'].str.strip()
df['event'] = df['event'].str.strip()

# The extracted dates come in several shapes ("June 21, 1904", "June 1904",
# "1904"). Calling pd.to_datetime on the whole column lets pandas >= 2.0 infer
# a single format from the first value and, with errors='coerce', silently
# turn every other shape into NaT. Parsing each value on its own avoids that
# and behaves the same on older pandas. Unparseable or missing values -> NaT.
df['date'] = pd.to_datetime(
    df['date'].apply(lambda value: pd.to_datetime(value, errors='coerce')),
    errors='coerce',
)



In [137]:
# Render the frame to inspect the new `date` and `event` columns.
df

Unnamed: 0,topic_id,result_id,rel,text,time,date,event
0,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn87093407/1904-0...,1,REPUBLICAN CONVENTION GETS DOWN TO BUSINESS O....,"June 21, 1904 : The Republican National Conven...",1904-06-21,The Republican National Convention begins in C...
1,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn99063957/1904-0...,1,NATIONAL REPUBLICAN CONVENTION Fairbanks for V...,"June 21, 1904 : The Republican National Conven...",1904-06-21,The Republican National Convention begins in C...
2,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn85052116/1904-0...,1,Convention Hall Chicago Juno 23 This the third...,"June 21, 1904 : The Republican National Conven...",1904-06-21,The Republican National Convention begins in C...
3,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn99063957/1904-0...,1,ROOSEVELT AND FAIRBANKS Republican National Co...,"June 23, 1904 : Theodore Roosevelt is nominate...",1904-06-23,Theodore Roosevelt is nominated for the Presid...
4,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn85066387/1904-0...,1,ROOSEVELT WILL LEAD PARTY TO VICTORY; HIS RUNN...,"June 23, 1904 : Charles Fairbanks is named as ...",1904-06-23,Charles Fairbanks is named as his Vice Preside...
...,...,...,...,...,...,...,...
434,https://guides.loc.gov/chronicling-america-yoga,https://chroniclingamerica.loc.gov/lccn/sn8506...,1,A PAGE HOW YOUNG GIRLS STUDY THE FOR HINDOO ME...,MISSING,NaT,
435,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn83016209/1961-0...,1,POsition Is Everything in Life to Yogi Practit...,MISSING,NaT,
436,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn86002403/1951-1...,1,¿Qué Es El Yoga? Yoga es una palabra sánscrita...,MISSING,NaT,
437,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn82001257/1961-0...,1,Con la Práctica del Yoga se Puede Vivir Muchos...,MISSING,NaT,


In [117]:
# Count rows that have no timeline. The 'MISSING' sentinel lives in `time`;
# `event` is NaN for those rows (the extract pattern requires a 4-digit year),
# so comparing `event` to 'MISSING' would always give 0 on a fresh run.
len(df[df['time'] == 'MISSING'])

184

In [125]:
# Convert `date` to datetime; values that cannot be parsed become NaT.
# NOTE(review): the clean-up cell above already performs this conversion, so
# on a top-to-bottom run this looks redundant -- confirm and consider removing.
df['date'] = pd.to_datetime(df['date'], errors='coerce')



In [126]:
# Render the frame once more before it is written out.
df

Unnamed: 0,topic_id,result_id,rel,text,time,date,event
0,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn87093407/1904-0...,1,REPUBLICAN CONVENTION GETS DOWN TO BUSINESS O....,"June 21, 1904 : The Republican National Conven...",1904-06-21,The Republican National Convention begins in C...
1,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn99063957/1904-0...,1,NATIONAL REPUBLICAN CONVENTION Fairbanks for V...,"June 21, 1904 : The Republican National Conven...",1904-06-21,The Republican National Convention begins in C...
2,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn85052116/1904-0...,1,Convention Hall Chicago Juno 23 This the third...,"June 21, 1904 : The Republican National Conven...",1904-06-21,The Republican National Convention begins in C...
3,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn99063957/1904-0...,1,ROOSEVELT AND FAIRBANKS Republican National Co...,"June 23, 1904 : Theodore Roosevelt is nominate...",1904-06-23,Theodore Roosevelt is nominated for the Presid...
4,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn85066387/1904-0...,1,ROOSEVELT WILL LEAD PARTY TO VICTORY; HIS RUNN...,"June 23, 1904 : Charles Fairbanks is named as ...",1904-06-23,Charles Fairbanks is named as his Vice Preside...
...,...,...,...,...,...,...,...
434,https://guides.loc.gov/chronicling-america-yoga,https://chroniclingamerica.loc.gov/lccn/sn8506...,1,A PAGE HOW YOUNG GIRLS STUDY THE FOR HINDOO ME...,MISSING,NaT,
435,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn83016209/1961-0...,1,POsition Is Everything in Life to Yogi Practit...,MISSING,NaT,
436,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn86002403/1951-1...,1,¿Qué Es El Yoga? Yoga es una palabra sánscrita...,MISSING,NaT,
437,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn82001257/1961-0...,1,Con la Práctica del Yoga se Puede Vivir Muchos...,MISSING,NaT,


In [138]:

# Persist the frame, including the derived `date` and `event` columns, to
# cleaned_output.csv without the index column.
df.to_csv('cleaned_output.csv', index=False)
