Downloading pdf files from JMA

In [10]:
import requests
import shutil
import os
from datetime import datetime, timedelta

# enter start/end dates here, convert to datetime object
start_date = datetime.strptime('2021-1-19', '%Y-%m-%d')  # the latter arg defines the format of the given string
end_date = datetime.strptime('2021-9-20', '%Y-%m-%d')

# need to know how many days we need to loop over
day_count = (end_date - start_date).days + 1

# loop over days and download file
for i in range(day_count):
    date_str = (start_date + timedelta(i)).strftime('%y%m%d')  # convert our datetime obj into the desired string format
    year_str = (start_date + timedelta(i)).strftime('%y')  # convert our datetime obj into the desired string format
    month_str = (start_date + timedelta(i)).strftime('%m')  # convert our datetime obj into the desired string format
    r = requests.get(f'https://www.data.jma.go.jp/gmd/kaiyou/data/db/wave/chart/daily/pdf/jp/{year_str}/{month_str}/{date_str}00jp.pdf', stream=True)
    if r.status_code == 200:
        with open(f'{date_str}hr00.pdf', 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
    r = requests.get(f'https://www.data.jma.go.jp/gmd/kaiyou/data/db/wave/chart/daily/pdf/jp/{year_str}/{month_str}/{date_str}12jp.pdf', stream=True)
    if r.status_code == 200:
        with open(f'{date_str}hr12.pdf', 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)

Converting pdf into jpg

In [14]:
#pdf2image also requires poppler: conda install -c conda-forge poppler
from pdf2image import convert_from_path

# loop over days and download file
for i in range(day_count):
    date_str = (start_date + timedelta(i)).strftime('%y%m%d')  # convert our datetime obj into the desired string format
    PDF_file = str(date_str)+"hr00.pdf"
    pages = convert_from_path(PDF_file, 500)
    # Iterate through all the pages stored above
    for page in pages:
        # Declaring filename for each page of PDF as JPG
        # For each page, filename will be:
        # PDF page 1 -> page_1.jpg
        # PDF page 2 -> page_2.jpg
        # PDF page 3 -> page_3.jpg
        # ....
        # PDF page n -> page_n.jpg
        filename = str(date_str)+"hr00.jpg"

        # Save the image of the page in system
        page.save(filename, 'JPEG')
    PDF_file2 = str(date_str)+"hr12.pdf"
    pages2 = convert_from_path(PDF_file2, 500)
    # Iterate through all the pages stored above
    for page in pages2:
        # Declaring filename for each page of PDF as JPG
        # For each page, filename will be:
        # PDF page 1 -> page_1.jpg
        # PDF page 2 -> page_2.jpg
        # PDF page 3 -> page_3.jpg
        # ....
        # PDF page n -> page_n.jpg
        filename = str(date_str)+"hr12.jpg"

        # Save the image of the page in system
        page.save(filename, 'JPEG')

Cropping JPG files into representative point "E"

In [16]:
import cv2

# loop over days and download file
for i in range(day_count):
    date_str = (start_date + timedelta(i)).strftime('%y%m%d')  # convert our datetime obj into the desired string format
    filename = str(date_str)+"hr00.jpg"
    img = cv2.imread(filename)
    crop_img = img[1673:1750, 330:800]
    cv2.imwrite("crop"+str(date_str)+"hr00.jpg", crop_img)
    filename2 = str(date_str)+"hr12.jpg"
    img2 = cv2.imread(filename2)
    crop_img2 = img2[1673:1750, 330:800]
    cv2.imwrite("crop"+str(date_str)+"hr12.jpg", crop_img2)

Performing OCR with Tesseract on cropped JPG files

In [None]:
#install pytesseract before, if you haven't: conda install -c conda-forge pytesseract
import pytesseract
pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe' #Install tesseract OCR https://github.com/UB-Mannheim/tesseract/wiki and insert route to tesseract exe, if you installed it somewhere else

# A text file is created and flushed
file = open("results2.txt", "w+")
file.write("")
file.close()

# get grayscale image
def get_grayscale(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

#thresholding
def thresholding(image):
    return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

# loop over days and download file
for i in range(day_count):
    date_str = (start_date + timedelta(i)).strftime('%y%m%d')  # convert our datetime obj into the desired string format
    # Open the file in append mode
    file = open("results2.txt", "a")
    filename = "crop"+str(date_str)+"hr00.jpg"
    img = cv2.imread(filename)
    # Preprocessing the image starts

    gray = get_grayscale(img)
    thresh = thresholding(gray)

    # Adding custom options
    custom_config = r'-c tessedit_char_whitelist="NESW1234567890 ./" --oem 3 --psm 7'
    text = pytesseract.image_to_string(thresh, config=custom_config)
    # Appending the text into file
    file.write(date_str+"00 "+text)

    filename2 = "crop"+str(date_str)+"hr12.jpg"
    img2 = cv2.imread(filename2)
    gray2 = get_grayscale(img2)
    thresh2 = thresholding(gray2)
    text2 = pytesseract.image_to_string(thresh2, config=custom_config)
    # Appending the text into file
    file.write(date_str+"12 "+text2)
    # Close the file
    file.close

Turning cleaned txt file into csv

In [2]:
import pandas as pd
df = pd.read_csv(f'results2_cleaned.txt', delim_whitespace=True)
df.columns=['Date', 'Direction', 'Period', 'Height']
print(df)

         Date Direction Period  Height
0    21030112       SSE      9     1.8
1    21030200         S      8     3.3
2    21030212         S      9     3.3
3    21030300       NNE      8     3.5
4    21030312       ENE      8     2.9
..        ...       ...    ...     ...
400  21091712         E      9     2.0
401  21091800       SSE      7     2.7
402  21091812         S      8     3.2
403  21091900       ENE      7     2.9
404  21091912        NE      7     2.7

[405 rows x 4 columns]


In [6]:
df['Date'].to_datetime()
print(df)

AttributeError: 'Series' object has no attribute 'to_datetime'