# Set Up

In [67]:
import polars as pl
import requests
from datetime import datetime

In [36]:
# Loosen polars' display limits so long text columns render in full:
#   * up to 100 rows per table (default is ~25)
#   * string cells up to 1000 characters before being cut off with "..."
#   * tables up to 1000 characters wide before width truncation
# Each setter hands back the Config class, which lets the calls be chained.
(
    pl.Config
    .set_tbl_rows(100)
    .set_fmt_str_lengths(1000)
    .set_tbl_width_chars(1000)
)

polars.config.Config

In [37]:
# List the files in the working directory (yale_reports.csv is read from here next).
!ls -l

total 1056
-rw-r--r--@ 1 alex  staff      99 28 Jan 19:10 main.py
-rw-r--r--@ 1 alex  staff  251465 28 Jan 19:15 page_content.html
-rw-r--r--@ 1 alex  staff     263 28 Jan 19:22 pyproject.toml
-rw-r--r--@ 1 alex  staff     704 28 Jan 19:34 README.md
-rw-r--r--@ 1 alex  staff    5995 28 Jan 19:35 scrape_reports.py
-rw-r--r--@ 1 alex  staff  136576 28 Jan 19:22 uv.lock
-rw-r--r--@ 1 alex  staff   30703 28 Jan 19:38 yale_reports.csv
-rw-r--r--@ 1 alex  staff   53311 28 Jan 19:37 YHLR Exploration.ipynb


In [38]:
# Load the scraped report metadata from the CSV in the working directory.
df = pl.read_csv("yale_reports.csv")

# Exploration

In [39]:
# Show 4 randomly chosen rows; no seed is passed, so the selection can differ per run.
df.sample(4)

url,title,description,date
str,str,str,str
"""https://files-profile.medicine.yale.edu/documents/b9c14991-6b22-492e-9e16-f903d25d9b49""","""Human Security Emergency: Day Two of RSF Control: Mass Killings Continue in El-Fasher""","""Citation | Raymond, Nathaniel A., Howarth, Caitlin et al. “HUMAN SECURITY EMERGENCY Day Two of RSF Control: Mass Killings Continue in El-Fasher.” 28 October 2025. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""October 28, 2025"""
"""https://files-profile.medicine.yale.edu/documents/e6294def-3f80-4d71-9cc7-91f6af70a523""","""Ukraine's Stolen Children: Inside Russia's Network of Re-education and Militarization""","""Citation | Farrenkopf, Paige, Caitlin N. Howarth, and Nathaniel A. Raymond et al., “Ukraine’s Stolen Children: Inside Russia’s Network of Re-Education and Militarization.” 16 September 2025. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""September 16, 2025"""
"""https://files-profile.medicine.yale.edu/documents/7ccc58f2-bbda-4066-b048-8bab64a616fd""","""El-Fasher: Recent Hospital Bombardment and Current Areas of Control""","""Citation | Howarth, Caitlin N., Kaveh Khoshnood, Nathaniel A. Raymond et al. “El-Fasher: Recent Hospital Bombardment and Current Areas of Control,” 29 August 2024. Humanitarian Research Lab at Yale School of Public Health:""","""August 29, 2024"""
"""https://files-profile.medicine.yale.edu/documents/1f9a90aa-6b8f-455f-a145-e368dfe682f8""","""Special Report: Zamzam IDP Camp Attacked: Confirmation of Munition Impacts Between 1-3 December 2024""","""Citation | Caitlin N. Howarth, Kaveh Khoshnood, Nathaniel A. Raymond et al. “Zamzam IDP Camp Attacked: Confirmation of Munition Impacts Between 1-3 December 2024” 03 December 2024. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""December 3, 2024"""


In [40]:
# Column names and their dtypes.
df.schema

Schema([('url', String),
        ('title', String),
        ('description', String),
        ('date', String)])

In [41]:
# Row/column counts plus, per column, its dtype and leading values.
df.glimpse()

Rows: 71
Columns: 4
$ url         <str> 'https://files-profile.medicine.yale.edu/documents/8dce0cc5-35fb-4078-b2aa-748e51cc8e9d', 'https://files-profile.medicine.yale.edu/documents/001a93f9-1c99-4437-a023-bac1294d3d0e', 'https://files-profile.medicine.yale.edu/documents/24ec5a80-6cd6-4145-addf-4ec88149a388', 'https://files-profile.medicine.yale.edu/documents/e657bba4-11ba-475b-a923-9e5e4e540901', 'https://files-profile.medicine.yale.edu/documents/7db4031a-066c-4833-87c9-eaa3897219b2', 'https://files-profile.medicine.yale.edu/documents/8163d430-7a33-458e-be60-f0022a6c31d2', 'https://files-profile.medicine.yale.edu/documents/bc5a6e36-8da3-452a-8ad9-18eac96ec064', 'https://files-profile.medicine.yale.edu/documents/b9c14991-6b22-492e-9e16-f903d25d9b49', 'https://files-profile.medicine.yale.edu/documents/876b4afc-e1da-495b-ac32-b5098699a371', 'https://files-profile.medicine.yale.edu/documents/e9f76d25-5620-4ff1-ac0d-dc361789a9b1'
$ title       <str> 'Confirmed Civilian Displacement & Recent

In [42]:
# Summary statistics per column (count, null_count, min, max, ...).
df.describe()

statistic,url,title,description,date
str,str,str,str,str
"""count""","""71""","""71""","""71""","""71"""
"""null_count""","""0""","""0""","""0""","""0"""
"""mean""",,,,
"""std""",,,,
"""min""","""https://files-profile.medicine.yale.edu/documents/001a93f9-1c99-4437-a023-bac1294d3d0e""","""13 Long-Range Suicide Drones and Launch Platforms near Nyala Airport, May 2025""","""Caitlin N. Howarth, Kaveh Khoshnood, Nathaniel A. Raymond et al. “Confirmation of High-Tempo Aerial Bombardment in El-Fasher, 1-6 October 2024.” 07 October 2024. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""14 May 2024"""
"""25%""",,,,
"""50%""",,,,
"""75%""",,,,
"""max""","""https://files-profile.medicine.yale.edu/documents/fa53bffe-06a7-4451-9276-afb784e94706""","""Widespread Damage to Healthcare Facilities in Khartoum State, Sudan""","""Citation | Zena Ahmed, Faisal Ahmed Alnoor, Abdulazim Awadalla, Caroline Crystal, Anmar Homeida, Caitlin N. Howarth, Kaveh Khoshnood, Olivia Mooney, Elbara M. Noureldin, Danielle N. Poole, Nathaniel A. Raymond, Antonia Zawalski et al. Yale Humanitarian Research Lab and Sudanese American Physician's Association. “Widespread damage to healthcare facilities in Khartoum State, Sudan 10 December 2024"". Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""September 30, 2024"""


In [45]:
# Display the frame itself; the number of rows rendered is capped by the tbl_rows setting.
df

url,title,description,date
str,str,str,str
"""https://files-profile.medicine.yale.edu/documents/8dce0cc5-35fb-4078-b2aa-748e51cc8e9d""","""Confirmed Civilian Displacement & Recent Bombardment in El Obeid""","""Citation | Andersen, Daniel, Rebecca Chausse, Caitlin N. Howarth, Omer Ismail, Olivia Mooney, Nathaniel A. Raymond et al. “Confirmation of Civilian Displacement and Recent Bombardment in El Obeid.” 16 January 2026. Situation Report, No. 69. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""January 16, 2026"""
"""https://files-profile.medicine.yale.edu/documents/001a93f9-1c99-4437-a023-bac1294d3d0e""","""RSF Systematic Mass Killings and Body Disposal in El-Fasher, 26 October – 28 November 2025""","""Citation | Andersen, Daniel, Rebecca Chausse, Caitlin N. Howarth, Omer Ismail, Olivia Mooney, Danielle N. Poole, Nathaniel A. Raymond et al. “RSF Systematic Mass Killings and Body Disposal in El-Fasher, 26 October – 28 November 2025” 16 December 2025. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""December 16, 2025"""
"""https://files-profile.medicine.yale.edu/documents/24ec5a80-6cd6-4145-addf-4ec88149a388""","""Atrocity Alert: Empty Markets, Ongoing Body Disposal in El-Fasher""","""Citation | Andersen, Daniel, Caitlin Howarth, Olivia Mooney, and Nathaniel A. Raymond et al. “ATROCITY ALERT: Empty Markets, Ongoing Body Disposal in El-Fasher.” 21 November 2025. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""November 21, 2025"""
"""https://files-profile.medicine.yale.edu/documents/e657bba4-11ba-475b-a923-9e5e4e540901""","""Atrocity Alert: Evidence of Ongoing Body Disposal in El-Fasher""","""Citation | Raymond, Nathaniel A. and Caitlin Howarth et al. “Evidence of Ongoing Body Disposal in El-Fasher.” 14 November 2025. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""November 14, 2025"""
"""https://files-profile.medicine.yale.edu/documents/7db4031a-066c-4833-87c9-eaa3897219b2""","""Atrocity Alert: RSF Closed Berm Exit and Ongoing Body Disposal Operations in El-Fasher""","""Citation | Raymond, Nathaniel A. and Caitlin Howarth et al. “ATROCITY ALERT RSF Closed Berm Exit and Ongoing Body Disposal Operations in El-Fasher” 6 November 2025. Humanitarian Research Lab at Yale School of Public Health: New Haven""","""November 6, 2025"""
"""https://files-profile.medicine.yale.edu/documents/8163d430-7a33-458e-be60-f0022a6c31d2""","""Atrocity Alert: Body Disposal and Mass Killing in RSF-Controlled El-Fasher""","""Citation | Raymond, Nathaniel A. and Caitlin Howarth et al. “ATROCITY ALERT Body Disposal and Mass Killing in RSF-Controlled El-Fasher,” 04 November 2025. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""November 4, 2025"""
"""https://files-profile.medicine.yale.edu/documents/bc5a6e36-8da3-452a-8ad9-18eac96ec064""","""Atrocity Alert: RSF Mass Killings Persist in El-Fasher""","""Citation | Raymond, Nathaniel A. and Caitlin Howarth et al. “Atrocity Alert: RSF Mass Killings Persist in El-Fasher.” 31 October 2025. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""October 31, 2025"""
"""https://files-profile.medicine.yale.edu/documents/b9c14991-6b22-492e-9e16-f903d25d9b49""","""Human Security Emergency: Day Two of RSF Control: Mass Killings Continue in El-Fasher""","""Citation | Raymond, Nathaniel A., Howarth, Caitlin et al. “HUMAN SECURITY EMERGENCY Day Two of RSF Control: Mass Killings Continue in El-Fasher.” 28 October 2025. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""October 28, 2025"""
"""https://files-profile.medicine.yale.edu/documents/876b4afc-e1da-495b-ac32-b5098699a371""","""Human Security Emergency: El-Fasher Falls to RSF: Evidence of Mass Killing""","""Citation | Raymond, Nathaniel A., Caitlin Howarth, et al. “HUMAN SECURITY EMERGENCY El-Fasher Falls to RSF: Evidence of Mass Killing,” 27 October 2025. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""October 27, 2025"""
"""https://files-profile.medicine.yale.edu/documents/e9f76d25-5620-4ff1-ac0d-dc361789a9b1""","""Special Report: RSF Intentionally Targeting Civilian Shelters in El-Fasher""","""Citation | Raymond, Nathaniel A. and Caitlin Howarth et al.[MO1] “SPECIAL REPORT: RSF Intentionally Targeting Civilian Shelters in El-Fasher” 14 October 2025. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""October 14, 2025"""


# Data Cleaning

- Remove non-Sudan titles
- Verify that every PDF URL responds successfully (HTTP 200)
- Check that every date parses, and add an ISO 8601 version of each date

In [47]:
# Exact titles of the reports that are not about Sudan; used to filter them out.
non_sudan_titles = {
    "Intentional, Systematic, & Widespread: Russia's Program of Coerced Adoption and Fostering of Ukraine's Children",
    "Ukraine's Stolen Children: Inside Russia's Network of Re-education and Militarization",
}

In [48]:
# Keep only the rows whose title is not one of the listed non-Sudan titles.
only_sudanese_titles = df.filter(pl.col("title").is_in(non_sudan_titles).not_())

In [50]:
# Check the row count and preview the columns after filtering.
only_sudanese_titles.glimpse()

Rows: 69
Columns: 4
$ url         <str> 'https://files-profile.medicine.yale.edu/documents/8dce0cc5-35fb-4078-b2aa-748e51cc8e9d', 'https://files-profile.medicine.yale.edu/documents/001a93f9-1c99-4437-a023-bac1294d3d0e', 'https://files-profile.medicine.yale.edu/documents/24ec5a80-6cd6-4145-addf-4ec88149a388', 'https://files-profile.medicine.yale.edu/documents/e657bba4-11ba-475b-a923-9e5e4e540901', 'https://files-profile.medicine.yale.edu/documents/7db4031a-066c-4833-87c9-eaa3897219b2', 'https://files-profile.medicine.yale.edu/documents/8163d430-7a33-458e-be60-f0022a6c31d2', 'https://files-profile.medicine.yale.edu/documents/bc5a6e36-8da3-452a-8ad9-18eac96ec064', 'https://files-profile.medicine.yale.edu/documents/b9c14991-6b22-492e-9e16-f903d25d9b49', 'https://files-profile.medicine.yale.edu/documents/876b4afc-e1da-495b-ac32-b5098699a371', 'https://files-profile.medicine.yale.edu/documents/e9f76d25-5620-4ff1-ac0d-dc361789a9b1'
$ title       <str> 'Confirmed Civilian Displacement & Recent

In [66]:
# Collect every report URL that cannot be fetched successfully (anything other
# than an HTTP 200 response, or no response at all).
bad_urls = set()
for url in only_sudanese_titles["url"]:
    try:
        # stream=True returns as soon as the status line and headers arrive, so
        # the PDF body is never downloaded just to read the status code.
        # The timeout (seconds) keeps one stalled server from hanging the cell;
        # requests waits indefinitely when no timeout is given.
        with requests.get(url, stream=True, timeout=30) as response:
            reachable = response.status_code == 200
    except requests.RequestException:
        # DNS failures, refused connections and timeouts also mean the link is
        # unusable; record it instead of aborting the whole check.
        reachable = False
    if not reachable:
        bad_urls.add(url)

In [69]:
# Show the URLs collected as bad by the check above.
bad_urls

set()

In [73]:
# Collect every raw date string that does not follow the "Month D, YYYY" layout.
bad_dates = set()
for row in only_sudanese_titles.iter_rows(named=True):
    date_to_check = row["date"]
    try:
        # In strptime, %d already accepts days without a leading zero. The
        # glibc-style "%-d" is not a documented strptime directive, and Python
        # versions that reject it raise "bad directive" for every input, which
        # would wrongly flag every date as bad.
        datetime.strptime(date_to_check, "%B %d, %Y")
    except (TypeError, ValueError):
        # ValueError: the string does not match the layout.
        # TypeError: the value is not a string at all (e.g. a null).
        bad_dates.add(date_to_check)
bad_dates

{'14 May 2024', '20 May 2024', '23 May 2024'}

In [83]:
# Spot-check 10 randomly chosen raw date strings (unseeded, so the sample can vary per run).
only_sudanese_titles["date"].sample(10)

date
str
"""January 31, 2025"""
"""January 16, 2025"""
"""April 29, 2024"""
"""October 14, 2025"""
"""June 28, 2024"""
"""December 16, 2025"""
"""May 2, 2024"""
"""23 May 2024"""
"""May 8, 2025"""
"""August 18, 2025"""


In [92]:
# Summary statistics for the filtered frame.
only_sudanese_titles.describe()

statistic,url,title,description,date
str,str,str,str,str
"""count""","""69""","""69""","""69""","""69"""
"""null_count""","""0""","""0""","""0""","""0"""
"""mean""",,,,
"""std""",,,,
"""min""","""https://files-profile.medicine.yale.edu/documents/001a93f9-1c99-4437-a023-bac1294d3d0e""","""13 Long-Range Suicide Drones and Launch Platforms near Nyala Airport, May 2025""","""Caitlin N. Howarth, Kaveh Khoshnood, Nathaniel A. Raymond et al. “Confirmation of High-Tempo Aerial Bombardment in El-Fasher, 1-6 October 2024.” 07 October 2024. Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""14 May 2024"""
"""25%""",,,,
"""50%""",,,,
"""75%""",,,,
"""max""","""https://files-profile.medicine.yale.edu/documents/fa53bffe-06a7-4451-9276-afb784e94706""","""Widespread Damage to Healthcare Facilities in Khartoum State, Sudan""","""Citation | Zena Ahmed, Faisal Ahmed Alnoor, Abdulazim Awadalla, Caroline Crystal, Anmar Homeida, Caitlin N. Howarth, Kaveh Khoshnood, Olivia Mooney, Elbara M. Noureldin, Danielle N. Poole, Nathaniel A. Raymond, Antonia Zawalski et al. Yale Humanitarian Research Lab and Sudanese American Physician's Association. “Widespread damage to healthcare facilities in Khartoum State, Sudan 10 December 2024"". Humanitarian Research Lab at Yale School of Public Health: New Haven.""","""September 30, 2024"""


In [99]:
def cleanup_dates(row):
    """Convert a row's raw ``date`` string into an ISO 8601 timestamp string.

    Two layouts are accepted, tried in this order:
    "Month D, YYYY" (e.g. "January 16, 2026") and
    "D Month YYYY" (e.g. "14 May 2024").

    Args:
        row: Mapping with a ``"date"`` key holding the raw date string.

    Returns:
        ``datetime.isoformat()`` of the parsed value, e.g.
        "2026-01-16T00:00:00".

    Raises:
        ValueError: If the string matches neither layout.
    """
    raw_date = row["date"]
    # %d accepts days with or without a leading zero when parsing. The
    # glibc-style "%-d" is not a documented strptime directive and is rejected
    # as a "bad directive" by some Python versions.
    for layout in ("%B %d, %Y", "%d %B %Y"):
        try:
            return datetime.strptime(raw_date, layout).isoformat()
        except ValueError:
            continue
    raise ValueError(f"Unrecognized date format: {raw_date!r}")

# Add an "iso_format_date" string column by running cleanup_dates on each row's
# "date" value (wrapped in a struct so the function receives a dict-like row).
clean_df = only_sudanese_titles.with_columns(
    pl.struct(["date"])
    .map_elements(cleanup_dates, return_dtype=pl.String)
    .alias("iso_format_date")
)

In [101]:
# Spot-check 15 randomly chosen converted values (unseeded, so the sample can vary per run).
clean_df["iso_format_date"].sample(15)

iso_format_date
str
"""2026-01-16T00:00:00"""
"""2025-04-25T00:00:00"""
"""2025-08-28T00:00:00"""
"""2025-04-14T00:00:00"""
"""2024-06-05T00:00:00"""
"""2025-04-22T00:00:00"""
"""2025-02-13T00:00:00"""
"""2024-09-30T00:00:00"""
"""2024-05-02T00:00:00"""
"""2024-12-13T00:00:00"""


In [103]:
# Write the cleaned frame, including the added iso_format_date column, to a new CSV.
clean_df.write_csv("clean_yhlr_reports.csv")

In [104]:
# List the working directory to confirm clean_yhlr_reports.csv now exists.
!ls

clean_yhlr_reports.csv pyproject.toml         uv.lock
main.py                README.md              yale_reports.csv
page_content.html      scrape_reports.py      YHLR Exploration.ipynb


In [105]:
# Peek at the first lines of the written file.
!head clean_yhlr_reports.csv

url,title,description,date,iso_format_date
https://files-profile.medicine.yale.edu/documents/8dce0cc5-35fb-4078-b2aa-748e51cc8e9d,Confirmed Civilian Displacement & Recent Bombardment in El Obeid,"Citation | Andersen, Daniel, Rebecca Chausse, Caitlin N. Howarth, Omer Ismail, Olivia Mooney, Nathaniel A. Raymond et al. “Confirmation of Civilian Displacement and Recent Bombardment in El Obeid.” 16 January 2026. Situation Report, No. 69. Humanitarian Research Lab at Yale School of Public Health: New Haven.","January 16, 2026",2026-01-16T00:00:00
https://files-profile.medicine.yale.edu/documents/001a93f9-1c99-4437-a023-bac1294d3d0e,"RSF Systematic Mass Killings and Body Disposal in El-Fasher, 26 October – 28 November 2025","Citation | Andersen, Daniel, Rebecca Chausse, Caitlin N. Howarth, Omer Ismail, Olivia Mooney, Danielle N. Poole, Nathaniel A. Raymond et al. “RSF Systematic Mass Killings and Body Disposal in El-Fasher, 26 October – 28 November 2025” 16 December 2025. Humanitarian Researc