In [None]:
import requests
import os
import time
from tqdm import tqdm
from datetime import datetime, timedelta
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

In [None]:
def download_pdf(list_of_urls, output_dir, formatted_date, timeout=30):
    """Try each candidate URL in order and save the first HTTP 200 response as a PDF.

    The payload is written to ``<output_dir>/rd_<formatted_date>.pdf``; the
    directory is created on demand, only once a download has succeeded.

    Args:
        list_of_urls: Sequence of candidate URLs, tried in the given order.
        output_dir: Directory the file is saved into.
        formatted_date: Date string used to build the output filename.
        timeout: Seconds to wait on each request before moving on to the
            next candidate URL.

    Returns:
        True if one of the URLs answered with status 200 and the file was
        written, False otherwise (including when no URLs were supplied).
    """
    success_status = False
    for url in list_of_urls:
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # A network failure on one candidate must not abort a long
            # multi-day run; fall through to the next URL instead.
            continue
        if response.status_code == 200:
            os.makedirs(output_dir, exist_ok=True)
            filename = f'rd_{formatted_date}.pdf'
            with open(os.path.join(output_dir, filename), 'wb') as file:
                file.write(response.content)
            success_status = True
            break
    if not success_status:
        if list_of_urls:
            # Last 14 characters of the first URL; for URLs ending in
            # "<YYYY-MM-DD>.pdf" this is exactly the dated filename.
            print(f"Error: Failed to download PDF from {list_of_urls[0][-14:]}")
        else:
            # Nothing to index into: report the date rather than raising IndexError.
            print(f"Error: No URLs were provided for {formatted_date}")
    return success_status

def get_formatted_date(date):
    """Return ``date`` rendered as a zero-padded ``YYYY-MM-DD`` string."""
    iso_day_pattern = '%Y-%m-%d'
    return date.strftime(iso_day_pattern)

def get_date_range(start_date, end_date):
    """List every day from ``start_date`` through ``end_date``, one-day steps.

    Both endpoints are included when they are a whole number of days apart.
    Each entry keeps the time-of-day of ``start_date``. An empty list is
    returned when ``start_date`` is later than ``end_date``.
    """
    # timedelta.days is the floored day difference, so a negative span
    # yields an empty range and therefore an empty list.
    whole_days = (end_date - start_date).days
    return [start_date + timedelta(days=offset) for offset in range(whole_days + 1)]

def scrape_rodong_sinmun(start_date, end_date=None, output_dir=None, delay=0.5):
    """Download Rodong Sinmun PDF issues for one day or an inclusive date range.

    For every date, several candidate URLs are built and handed to
    ``download_pdf``, which saves the first one that succeeds. A summary
    line is printed once all dates have been processed.

    Args:
        start_date: First (or only) date to fetch; must support ``strftime``.
        end_date: Last date to fetch, inclusive. When None, only
            ``start_date`` is fetched.
        output_dir: Directory the PDFs are saved into. When None, a folder
            named after today's date (``YYYY-MM-DD``) is used.
        delay: Seconds to pause after each date, to avoid hammering the hosts.

    Returns:
        None. Progress and the final summary are reported on stdout.
    """
    if output_dir is None:
        # Default: a folder named after the day the scrape is run.
        output_dir = datetime.now().strftime('%Y-%m-%d')

    if end_date is not None:
        dates = get_date_range(start_date, end_date)
    else:
        dates = [start_date]

    kcna_uploads = 'https://kcnawatch.org/wp-content/uploads/sites/5'

    # Download the PDFs:
    download_count = 0
    for date in tqdm(dates, desc="Downloading PDFs"):
        path_year = date.strftime("%Y")
        path_month = date.strftime("%m")
        path_day = date.strftime("%d")
        formatted_date = get_formatted_date(date)
        # Candidates are tried in this order; the first HTTP 200 wins.
        candidate_urls = [
            f'{kcna_uploads}/{path_year}/{path_month}/rodong-{formatted_date}.pdf',
            f'{kcna_uploads}/{path_year}/{path_month}/wpid-rodong-{formatted_date}.pdf',
            f'{kcna_uploads}/{path_year}/{path_month}/rodong-sinmun-{formatted_date}.pdf',
            # Fixed 2015/12 upload folder regardless of the issue date.
            # NOTE(review): presumably older issues were uploaded there in bulk - confirm.
            f'{kcna_uploads}/2015/12/rodong-sinmun-{formatted_date}.pdf',
            f'https://dprkmedia.com/wp-content/themes/KPMSite/pdf.php?media=rodong&pdf-year={path_year}&pdf-month={path_month}&pdf-day={path_day}',
        ]

        if download_pdf(candidate_urls, output_dir, formatted_date):
            download_count += 1
        time.sleep(delay)

    if download_count == 0:
        print("\nNo issue has been downloaded.")
    elif end_date is not None:
        first_day = get_formatted_date(start_date)
        last_day = get_formatted_date(end_date)
        print(f"\nDownloaded a total of {download_count} issues from {first_day} to {last_day}.")
    else:
        print(f"\nDownloaded the issue of {get_formatted_date(start_date)}.")

In [None]:
# Date window to scrape, written as MM/DD/YYYY.
# Leave end_date_str empty ('' or None) to fetch only the start date.
start_date_str = '09/30/2013'
end_date_str = '12/23/2018'

start_date = datetime.strptime(start_date_str, '%m/%d/%Y')
if end_date_str:
    end_date = datetime.strptime(end_date_str, '%m/%d/%Y')
else:
    end_date = None

In [None]:
# Run the download for the configured window (network access required).
scrape_rodong_sinmun(start_date=start_date, end_date=end_date)