In [5]:
import pandas as pd
import glob
from bs4 import BeautifulSoup
import os

In [2]:
files = glob.glob("../data/testimonies/*.html")

In [25]:
# List of span classes to track
classes_to_track = [
    'BUILDING', 'NPIP', 'COUNTRY', 'POPULATED_PLACE', 'DLF', 
    'SPATIAL_OBJ', 'REGION', 'ENV_FEATURES', 'INT_SPACE', 
    'RIVER', 'FOREST'
]

data = []
for file in files:
    with open(file, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    spans = soup.find_all('span')

    # Initialize counters and texts
    counters = {cls: 0 for cls in classes_to_track}
    texts = {cls: [] for cls in classes_to_track}

    for span in spans:
        # Skip spans with class 'sentence'
        if 'sentence' in span.get('class', []):
            continue

        for cls in span.get('class', []):
            if cls in classes_to_track:
                counters[cls] += 1
                texts[cls].append(span.text)

    # Only the filename, not the full path
    filename = os.path.basename(file)

    # Append the data for this file
    data.append({
        "file": filename,
        **counters,
        **{cls + "_texts": texts[cls] for cls in classes_to_track}
    })


In [26]:
df = pd.DataFrame(data)
df

Unnamed: 0,file,BUILDING,NPIP,COUNTRY,POPULATED_PLACE,DLF,SPATIAL_OBJ,REGION,ENV_FEATURES,INT_SPACE,...,NPIP_texts,COUNTRY_texts,POPULATED_PLACE_texts,DLF_texts,SPATIAL_OBJ_texts,REGION_texts,ENV_FEATURES_texts,INT_SPACE_texts,RIVER_texts,FOREST_texts
0,RG-50.549.01.0027.html,163,11,91,174,22,27,28,6,20,...,"[place, places, places, place, hell, side, pla...","[Poland, Romania, Hungary, Romania, Romania, R...","[Auschwitz, camp, city, cities, town, camp, ho...","[window, street, yard, roof, railroads, line, ...","[train, ship, Saint Louis, trains, boxcars, pi...","[area, eastern European continent, area, area,...","[ground, forest, forest, mountains, forests, b...","[apartment, apartment, bedroom apartment, apar...",[],[]
1,RG-50.233.0022.html,233,34,106,284,22,61,34,21,15,...,"[places, place, place, place, place, places, p...","[America, America, America, Crimea, Russia, Cr...","[Leningrad, Leningrad, Smela, Moscow, Starodub...","[blockade, beach, blockade, farm, farm, Greek ...","[piano, piano, piano, car, car, car, car, car,...","[Siberia, Siberia, White Russia, Siberia, Sibe...","[wood forest, wood, forest, wood, forest, fore...","[room, room, room, apartment, apartment, room,...",[],[]
2,RG-50.549.02.0072.html,251,26,78,243,39,47,54,3,14,...,"[places, places, places, place, place, place, ...","[United States, country, Palestine, Israel, co...","[community,, Washington, Olney, Philadelphia, ...","[window, bridge, Rivington Street, Bleeker Str...","[plane, plane, plane, plane, boat, aircraft, j...","[Newfoundland, New Jersey, Pennsylvania, west ...","[gazebo, lake, forest]","[sublet apartment, sublet apartment, apartment...",[],[]
3,RG-50.030.0018.html,73,33,70,112,32,23,4,46,12,...,"[place, place, inside, place, places, place, p...","[Poland, Poland, Poland, Israel, Poland, Polan...","[city, Lomza, Lomza, Lomza, Lomza, Lakhva, ghe...","[border, border, border, border, bridge, cemet...","[bed, beds, train, train, airplanes, table, ov...","[Russian side, area, Russian side, German side]","[water, woods, trees, woods, woods, woods, woo...","[floor, floor, room, room, apartment, room, fl...",[],[]
4,RG-50.030.0448.html,189,42,58,123,98,30,34,7,46,...,"[place, places, outside, place, place, outside...","[Germany, Germany, Poland, Poland, Poland, cou...","[Hindenberg, Hindenberg, Hindenberg, city, cit...","[play ground, streets, windows, doors, window,...","[train, wagons, wagon, bed, bed, beds, bed, be...","[Eastern Germany, Eastern Poland, area, Aryan ...","[ground, forests, ground, ground, ground, wood...","[apartments, celler, celler, floor, space, flo...",[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,RG-50.106.0196.html,106,3,85,58,31,24,5,8,22,...,"[outside, place, inside]","[Germany, Germany, Germany, Germany, Poland, c...","[Ludwigshafen, Ludwigshafen, industrial city, ...","[street, park, park, roof, street, street, str...","[piano, table, piano, Steinway, bed, train, tr...","[area, area, North Carolina, North Carolina, N...","[Alps, Atlantic, Rheingold, Rheingold, river R...","[kitchen, floor, apartment, cellars, cellars, ...",[],[]
883,RG-50.030.0349.html,127,32,111,166,57,89,38,15,8,...,"[place, place, place, place, place, place, pla...","[Romania, Romania, America, Romania, France, G...","[Paris, Paris, Paris, Paris, Jewish community,...","[Maginot line, Maginot line, road, road, garde...","[table, tanks, airplane, cars, motorcycles, mo...","[area, Eastern Europe, Brittany, Brittany, are...","[forest, forest, forest, ground, mountain, mou...","[balcony, apartment, room, compartments, room,...",[],[]
884,RG-50.106.0179.html,105,24,93,134,67,22,4,6,42,...,"[place, place, place, place, place, outside, s...","[Russia, Russia, Latvia, Latvia, Russia, Latvi...","[Riga, city, Livani, Livani, Subata, town, Sub...","[garden, ranch, street, Fence, wall, wall, iro...","[train, train, train, train, ship, train, trai...","[Baltic states, Siberia, Siberia, Siberia]","[apple tree, tree, Viesite, woods, wood, wood]","[apartment, apartments, apartment, apartment, ...",[],[]
885,RG-50.030.0575.html,75,24,58,117,28,69,0,7,25,...,"[place, place, place, place, place, place, pla...","[Poland, Poland, Israel, Israel, Israel, Pales...","[Warsaw, Warsaw, Paris, Warsaw, Warsaw, ghetto...","[street, park, park, doors, street, hole, door...","[pram, baby carriage, baby pram, train, trains...",[],"[forest, ground, ground, river, lake, hills, m...","[floor, apartment, room, cupboard, cupboard, c...",[],[]


In [27]:
span_class

['BUILDING',
 'NPIP',
 'COUNTRY',
 'POPULATED_PLACE',
 'DLF',
 'SPATIAL_OBJ',
 'REGION',
 'ENV_FEATURES',
 'INT_SPACE',
 'RIVER',
 'FOREST']

In [29]:
df.to_parquet("../data/data_counts.parquet")

In [32]:
df2 = pd.read_csv("../data/testimony_metadata.csv")
df2.keys()

Index(['RG Number', 'PDF URL', 'USHMM URL', 'First Name', 'Middle Name',
       'Last Name', 'Birth Name', 'Gender', 'Birth Date', 'Birth Year',
       'Place of Birth', 'Country', 'Experience Group',
       'Ghetto(s) Encyclopedia', 'Ghetto', 'Camp(s) Encyclopedia', 'Camp',
       'Non-SS Camp  ', 'Region', 'Needs Research', 'Data Entry', 'Accession',
       'Notes:', 'Revisit'],
      dtype='object')

In [35]:
df2

Unnamed: 0,RG Number,PDF URL,USHMM URL,First Name,Middle Name,Last Name,Birth Name,Gender,Birth Date,Birth Year,...,Ghetto,Camp(s) Encyclopedia,Camp,Non-SS Camp,Region,Needs Research,Data Entry,Accession,Notes:,Revisit
0,RG-50.549.02.0033,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Hetty,d'Ancona de,Leeuwe,Hetty D'Ancona,F,1930-05-01,1930.0,...,,,,,,,CL,1999.A.0293,,
1,RG-50.549.02.0072,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Emanuel,,Mandel,,M,,1936.0,...,,,,,,checked,GG,2003.205,Follow-up interview,
2,RG-50.549.02.0035,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Judith,,Meisel,,F,,1929.0,...,Kaunas,,,,,checked,GG,1999.A.0024,This is a follow-up interview to one already d...,checked
3,RG-50.471.0015,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Esther,,Lurie,,F,,,...,,,,,,,CL,1998.A.0119.15,,
4,RG-50.030.0585,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Eugene,,Miller,,M,1923-10-16,1923.0,...,Lodz,"Auschwitz,Dachau",,,,checked,GG,2010.249,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
972,RG-50.549.02.0073,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Flory,,Jagoda,,F,1923-12-21,1923.0,...,,,,,,,GG,2004.48,Follow-up,checked
973,RG-50.030.0137,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Cornelius,,Loen,,M,1922-05-02,1922.0,...,,,,,,,CL,1990.437.1,,
974,RG-50.030.0058,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Isaac,,Danon,,M,,1929.0,...,,,,,,,GG,,,
975,RG-50.549.02.0078,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Lucie,,Rosenberg,,F,,1921.0,...,,,,,,checked,CL,2004.214,"Not a survivor, volunteered for the museum?",


In [37]:
# Remove the '.html' extension from the 'file' column in df
df['file'] = df['file'].str.replace('.html', '', regex=False)

# Perform the merge
df3 = pd.merge(df, df2, left_on='file', right_on='RG Number', how='inner')
df3

Unnamed: 0,file,BUILDING,NPIP,COUNTRY,POPULATED_PLACE,DLF,SPATIAL_OBJ,REGION,ENV_FEATURES,INT_SPACE,...,Ghetto,Camp(s) Encyclopedia,Camp,Non-SS Camp,Region,Needs Research,Data Entry,Accession,Notes:,Revisit
0,RG-50.549.01.0027,163,11,91,174,22,27,28,6,20,...,,Auschwitz,,,West,,CL,,,
1,RG-50.233.0022,233,34,106,284,22,61,34,21,15,...,,,,,,,GG,1992.A.0125.22,,
2,RG-50.549.02.0072,251,26,78,243,39,47,54,3,14,...,,,,,,checked,GG,2003.205,Follow-up interview,
3,RG-50.030.0018,73,33,70,112,32,23,4,46,12,...,,,,,,,GG,,,
4,RG-50.030.0448,189,42,58,123,98,30,34,7,46,...,,,,,East,,CL,1996.A.0528,Hid in farms after escaping Buczacz,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,RG-50.106.0196,106,3,85,58,31,24,5,8,22,...,,,,,,,GG,2012.32,,
883,RG-50.030.0349,127,32,111,166,57,89,38,15,8,...,,,,,,,CL,,,
884,RG-50.106.0179,105,24,93,134,67,22,4,6,42,...,,,,,,,CL,2010.23,"being forced to live in the ghetto in Riga, La...",checked
885,RG-50.030.0575,75,24,58,117,28,69,0,7,25,...,,"Bergen-Belsen,Buchenwald",,,West,,CL,2010.143,,


In [38]:
df3.keys()

Index(['file', 'BUILDING', 'NPIP', 'COUNTRY', 'POPULATED_PLACE', 'DLF',
       'SPATIAL_OBJ', 'REGION', 'ENV_FEATURES', 'INT_SPACE', 'RIVER', 'FOREST',
       'BUILDING_texts', 'NPIP_texts', 'COUNTRY_texts',
       'POPULATED_PLACE_texts', 'DLF_texts', 'SPATIAL_OBJ_texts',
       'REGION_texts', 'ENV_FEATURES_texts', 'INT_SPACE_texts', 'RIVER_texts',
       'FOREST_texts', 'RG Number', 'PDF URL', 'USHMM URL', 'First Name',
       'Middle Name', 'Last Name', 'Birth Name', 'Gender', 'Birth Date',
       'Birth Year', 'Place of Birth', 'Country', 'Experience Group',
       'Ghetto(s) Encyclopedia', 'Ghetto', 'Camp(s) Encyclopedia', 'Camp',
       'Non-SS Camp  ', 'Region', 'Needs Research', 'Data Entry', 'Accession',
       'Notes:', 'Revisit'],
      dtype='object')

In [39]:
df3.to_parquet("../data/data_counts.parquet")