In [1]:
import os
from datasets import Dataset
from huggingface_hub import HfApi

# Directory holding the cleaned HTML files; load_data() scans it for names
# ending in ".html". The path is relative, so it resolves against the
# notebook's current working directory.
data_dir = "../data/cleaned_htmls"

# Function to read HTML files and extract filenames
def load_data(directory=None):
    """Read every ``.html`` file in a directory into a list of records.

    Args:
        directory: Folder to scan. Defaults to the notebook-level ``data_dir``
            when omitted, so existing ``load_data()`` calls keep working.

    Returns:
        A list of ``{"rg": ..., "html": ...}`` dicts, one per ``.html`` file,
        where ``rg`` is the filename without its final extension and ``html``
        is the file's full text decoded as UTF-8. Records are ordered by
        filename.
    """
    if directory is None:
        directory = data_dir

    records = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the resulting dataset reproducible across machines and runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.html'):
            continue
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
            html_content = f.read()
        rg = os.path.splitext(filename)[0]  # Remove extension
        records.append({"rg": rg, "html": html_content})
    return records

# Load the data: one row per cleaned HTML file, with columns "rg" and "html"
dataset = Dataset.from_list(load_data())

# Print dataset info
print(dataset)

# Push to Hugging Face Hub
repo_id = "placingholocaust/testimonies-1k"
api = HfApi()
# create_repo() creates a *model* repo unless repo_type is given; request a
# dataset repo explicitly so no stray model repo is created under this id.
api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
dataset.push_to_hub(repo_id)

print(f"Dataset uploaded successfully to {repo_id}")


Dataset({
    features: ['rg', 'html'],
    num_rows: 949
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Dataset uploaded successfully to placingholocaust/testimonies-1k


In [None]:
# Create README.md content

Holocaust Survivor Testimonies Dataset

This dataset contains a collection of Holocaust survivor testimonies from the United States Holocaust Memorial Museum (USHMM). The testimonies are presented in an unconventional HTML format, which is temporary and will be further processed in future iterations.

## Dataset Structure

Each item in the dataset consists of two fields:

1. `rg`: A unique identifier for each testimony, corresponding to the original filename without the extension.
2. `html`: The content of the testimony in HTML format.

## HTML Structure

The HTML content is structured as follows:

- The document starts with a standard HTML5 doctype and structure.
- The main content is enclosed in `<body>` tags.
- Each dialogue element (question or answer) is wrapped in `<dialogue>` tags with a class attribute indicating whether it's a "Question" or "Answer".
- The text content is further wrapped in `<p>` tags within the `<dialogue>` tags.

Example: see the complete version of this README below.

# Holocaust Survivor Testimonies Dataset

This dataset contains a collection of Holocaust survivor testimonies from the United States Holocaust Memorial Museum (USHMM). The testimonies are presented in an unconventional HTML format, which is temporary and will be further processed in future iterations.

## Dataset Structure

Each item in the dataset consists of two fields:

1. `rg`: A unique identifier for each testimony, corresponding to the original filename without the extension.
2. `html`: The content of the testimony in HTML format.

## HTML Structure

The HTML content is structured as follows:

- The document starts with a standard HTML5 doctype and structure.
- The main content is enclosed in `<body>` tags.
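- Each dialogue element (question or answer) is wrapped in `<dialogue>` tags with a `class` attribute of "Question" or "Answer". Elements that are neither, such as the header line or empty spacer elements, have an empty `class` attribute.
- The text content is further wrapped in `<p>` tags within the `<dialogue>` tags (currently doubly nested, i.e. `<p><p>…</p></p>`).

Example: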
```html
<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Document</title>
</head>
<body><dialogue class=""><p><p>DAVID KOCHALSKI July 28, 1994</p></p></dialogue>
<dialogue class=""><p></p></dialogue><dialogue class="Question"><p><p>Q: I need you to start off by telling me your name, place of birth, and date of birth?</p></p></dialogue>
<dialogue class=""><p></p></dialogue><dialogue class="Answer"><p><p>A: My name David Kochalski. I was born in a small town called , and I was born May 5, 1928.</p></p></dialogue>
<dialogue class=""><p></p></dialogue><dialogue class="Question"><p><p>Q: Tell me a little bit about your family life before the war?</p></p></dialogue>
```

- There are `<span>` elements with class "page-break" used to identify page breaks within the text. These elements include a "number" attribute indicating the page number.

 Example:
```html
 <dialogue class="Answer">
   <p>
     <p>A: Yes, I did. I felt it, maybe not personally, but I knew of a lot of incidents whereby either they were small little -- I would call it -- we were separated, in other words, but hardly got together, and there were incidents. Incidents, not pleasant incidents, because we were <span class="page-break" number="0"></span> called in from the house, people regardless of how religious we were, did not believe that we were really religious people.</p>
   </p>
 </dialogue>
```

## Example of Finding all the Questions:

```python
from datasets import load_dataset
from bs4 import BeautifulSoup

# Load the dataset from Hugging Face
dataset = load_dataset("placingholocaust/testimonies-1k", split="train").take(1)

# Function to extract questions from HTML content
def extract_questions(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    questions = soup.find_all('dialogue', class_='Question')
    return [q.get_text(strip=True) for q in questions]

# Iterate through the dataset and print questions
for item in dataset:
    rg = item['rg']
    html_content = item['html']
    
    questions = extract_questions(html_content)
    
    print(f"Questions from testimony {rg}:")
    for i, question in enumerate(questions, 1):
        print(f"{i}. {question}")
    print("\n")
```

# Python code to download and parse the dataset

from datasets import load_dataset
from bs4 import BeautifulSoup

# Load the dataset from Hugging Face
# Requesting split="train" yields that split itself, so its rows are iterated
# directly (each row has the keys 'rg' and 'html') rather than being looked up
# under a 'train' key.
dataset = load_dataset("placingholocaust/testimonies-1k", split="train")

# Function to extract questions from HTML content
def extract_questions(html_content):
    """Return the text of every interviewer question in one testimony.

    Args:
        html_content: The testimony's HTML markup as a string.

    Returns:
        A list of strings, one per ``<dialogue class="Question">`` element.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    questions = soup.find_all('dialogue', class_='Question')
    # Join text fragments with a space: with strip=True and the default empty
    # separator, the text on either side of an inline element (e.g. a
    # page-break <span>) is stripped and then fused ("we were" + "called in"
    # would become "we werecalled in").
    return [q.get_text(" ", strip=True) for q in questions]

# Iterate through the dataset and print questions
# `dataset` was loaded with split="train", so it already is the train split;
# indexing it with ['train'] would look for a column named "train" and fail.
for item in dataset:
    rg = item['rg']
    html_content = item['html']
    
    questions = extract_questions(html_content)
    
    print(f"Questions from testimony {rg}:")
    for i, question in enumerate(questions, 1):
        print(f"{i}. {question}")
    print("\n")

# Note: The repository id passed to load_dataset must name the user or organization that hosts the dataset on Hugging Face (here: placingholocaust/testimonies-1k).


In [5]:
# Python code to download and parse the dataset

from datasets import load_dataset
from bs4 import BeautifulSoup

# Load the dataset from Hugging Face
# .take(1) limits this demo to a single testimony so the printed output stays short
dataset = load_dataset("placingholocaust/testimonies-1k", split="train").take(1)

# Function to extract questions from HTML content
def extract_questions(html_content):
    """Return the text of every interviewer question in one testimony.

    Args:
        html_content: The testimony's HTML markup as a string.

    Returns:
        A list of strings, one per ``<dialogue class="Question">`` element.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    questions = soup.find_all('dialogue', class_='Question')
    # Join text fragments with a space: with strip=True and the default empty
    # separator, the text on either side of an inline element (e.g. a
    # page-break <span>) is stripped and then fused ("we were" + "called in"
    # would become "we werecalled in").
    return [q.get_text(" ", strip=True) for q in questions]

# Print a numbered list of the interviewer's questions for each testimony
for item in dataset:
    rg, html_content = item['rg'], item['html']
    questions = extract_questions(html_content)

    print(f"Questions from testimony {rg}:")
    for i, question in enumerate(questions, start=1):
        print(f"{i}. {question}")
    # print("\n") emits two newlines, leaving a blank gap between testimonies
    print("\n")

# Note: The repository id in the load_dataset call must name the user or organization that hosts the dataset on Hugging Face.


Questions from testimony RG-50.030.0469_trs_en_cleaned:
1. Q: Good morning, Alice.
2. Q: Welcome, welcome to Washington, and Virginia.
3. Q: It's wonderful to have you here, finally, after all this time.
4. Q: Yes. Alice, tell me when you were born. What was the date of -- your date of birth.
5. Q: And where were you born?
6. Q: In --
7. Q: Slovakia. And what was your name when you were born?
8. Q: R-o-t-h. And it's now?
9. Q: Jakubovic.
10. Q: Now I -- I'd like to get some idea about your family life, about your mother and your father and your brother, and what life was like. So tell me -- tell me a little bit about your father. What was he doing?
11. Q: Yes.
12. Q: Vera Isen.
13. Q: But you remembered this --
14. Q: -- [indecipherable] Isen.
15. Q: Right.
16. Q: Nine years older than you.
17. Q: A what?
18. Q: Oh, glass.
19. Q: Well let -- let's -- let's go back just a little bit. Tell me what your father's name was.
20. Q: And your mother's name?
21. Q: And your brother?
22. Q: Niko