In [59]:
import numpy as np
import pandas as pd
import pathlib
import sys
import os
from datetime import date,datetime,timedelta

# Project root = parent of the current working directory.
# NOTE(review): this assumes the kernel is started in a folder one level
# below the project root — confirm before running from elsewhere.
ROOT_PATH = pathlib.Path().resolve().parents[0]
# Make the project root importable (needed for `modules.date_range` below).
# Guarded so that re-running this cell does not stack duplicate entries.
if str(ROOT_PATH) not in sys.path:
    sys.path.append(str(ROOT_PATH))

from modules.date_range import date_range

In [2]:
# Per-station URL list and the station master table, both read from CSV
# files located in the project root.
amedas_url = pd.read_csv(ROOT_PATH / "amedas_url_list2.csv")
amemaster = pd.read_csv(ROOT_PATH / "ame_master_20230323.csv")

# Keep only the master rows whose 種類 column equals "官".
amemaster_kan = amemaster.loc[amemaster["種類"] == "官"]

# Inner join of the URL list with the filtered master on the station name.
# NOTE(review): the join key is a name, not an id; the scraping run output
# lists some station numbers twice in a row, so this may produce duplicate
# rows — verify that names are unique on both sides.
df = amedas_url.merge(amemaster_kan, left_on="station", right_on="観測所名")

In [49]:
class ScrapeObsTable:
    """Download one HTML observation table, clean its cells and save it as CSV.

    Typical use: ``scrape(url, date)`` to get a cleaned DataFrame, then
    ``save_csv(df, filedir, filename)`` to write it below ``ROOT_PATH``.
    """

    def __init__(self) -> None:
        # Column labels assigned to the scraped table, in order. The table on
        # the page must have exactly this many columns.
        self.cols = ["時","現地","海面","降水量 (mm)","気温 (℃)","露点 温度 (℃)","蒸気圧 (hPa)","湿度 (％)",
                    "風速","風向","日照 時間 (h)","全天 日射量 (MJ/㎡)","降雪",
                    "積雪","天気","雲量","視程 (km)"]

    def check_value(self,value):
        """Normalise a single table cell.

        Rules, applied in this order:

        * ``"///"``, or any text containing ``" ]"`` or ``"×"`` -> ``np.nan``
        * text containing ``"--"`` -> ``0``
        * text containing ``")"`` -> the part before the marker, then handled
          like any other value
        * anything else -> ``float(value)`` if it parses, otherwise the value
          is returned unchanged

        NOTE(review): these markers are presumably the data provider's
        quality/missing flags — confirm against the site's legend.

        Args:
            value: Raw cell content (string, number or NaN).

        Returns:
            ``np.nan``, ``0``, a ``float``, or the (possibly stripped)
            original value when it is not numeric.
        """
        text = str(value)
        if (value == "///") or (" ]" in text) or ("×" in text):
            return np.nan
        if "--" in text:
            return 0
        if ")" in text:
            # Drop the trailing marker but do NOT assume what is left is a
            # number: non-numeric cells (e.g. a wind direction) can carry the
            # marker too, and an unguarded float() would raise ValueError.
            value = text.split(")", 1)[0].strip()
        try:
            return float(value)
        except (ValueError, TypeError):
            return value

    def scrape(
        self,
        url : str,
        date : datetime
            ) -> pd.DataFrame:
        """Read the first HTML table at ``url`` and return it cleaned.

        Args:
            url: Address of the page to read with ``pandas.read_html``.
            date: Value stored in the ``"datetime"`` column of every row.

        Returns:
            DataFrame with the columns from ``self.cols`` (each cell passed
            through ``check_value``) plus a ``"datetime"`` column.

        Raises:
            ValueError: from pandas when the page contains no table, or when
                the table's column count does not match ``self.cols``.
        """
        table = pd.read_html(url)[0]
        table.columns = self.cols

        # Clean column by column; after the relabel above the table's columns
        # are exactly self.cols.
        for column in self.cols:
            table[column] = table[column].apply(self.check_value)
        table["datetime"] = date
        return table

    def save_csv(
        self,
        df : pd.DataFrame,
        filedir : str,
        filename : str
    ) -> None:
        """Write ``df`` to ``ROOT_PATH / filedir / filename`` without the index.

        The target directory is created if it does not exist yet.

        Args:
            df: Table to write.
            filedir: Directory, relative to ``ROOT_PATH``.
            filename: Name of the CSV file inside ``filedir``.
        """
        target_dir = ROOT_PATH / filedir
        os.makedirs(target_dir,exist_ok=True)
        df.to_csv(target_dir / filename,index=False)

In [64]:
s = ScrapeObsTable()

# For every day yielded by date_range and every station row in `df`:
# build the hourly page URL, scrape it, and save one CSV per (station, day)
# under csv/<station number>/<year>/<month>/.
for d in date_range(date(2022,1,1),date(2023,1,1)):
    for _, station in df.iterrows():
        station_id = station["観測所番号"]
        print(f"{d},{station_id}")

        # Switch the station's index page to the hourly view, keep everything
        # up to and including the third "="-separated piece (…&year), then
        # append the requested year/month/day.
        url = station["amedas_url"].replace("index.php","view/hourly_s1.php")
        url = "=".join(url.split("=")[:3]) + d.strftime("=%Y&month=%m&day=%d")

        try:
            table = s.scrape(url, d)
        except (ValueError, OSError) as err:
            # Per-page failures: network/HTTP errors (OSError subclasses) or
            # a page without a usable table (ValueError). Report and move on.
            # ImportError is deliberately NOT caught: a missing HTML parser
            # would fail on every page, and skipping silently would make the
            # whole run a no-op.
            print(f"skipped {url}: {err!r}")
            continue

        s.save_csv(
            table,
            d.strftime(f"csv/{station_id}/%Y/%m"),
            d.strftime(f"%Y%m%d_{station_id}.csv"),
        )

2022-01-01,11016
2022-01-01,11291
2022-01-01,11091
2022-01-01,11061
2022-01-01,12442
2022-01-01,12501
2022-01-01,13181
2022-01-01,13277
2022-01-01,81286
2022-01-01,81286
2022-01-01,14163
2022-01-01,14163
2022-01-01,14296
2022-01-01,15356


KeyboardInterrupt: 

In [63]:
# Show the most recently built request URL. `url` is a notebook-level
# variable, so this reflects whichever cell assigned it last.
url

'https://www.data.jma.go.jp/obd/stats/etrn/view/hourly_s1.php?prec_no=11&block_no=1512&year=2022&month=01&day=01'

In [61]:
!conda install html5lib

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0



# All requested packages already installed.



In [52]:
# Inspect the merged station table (URL list joined with the filtered master).
df

Unnamed: 0,area,station,amedas_url,都府県振興局,観測所番号,種類,観測所名,ｶﾀｶﾅ名,気象情報等に表記する名称,所在地,緯度(度),緯度(分),経度(度),経度(分),海面上の高さ(ｍ),風速計の高さ(ｍ),温度計の高さ(ｍ),観測開始年月日,備考1,備考2
0,宗谷地方,稚内,https://www.data.jma.go.jp/obd/stats/etrn/inde...,宗谷,11016,官,稚内,ﾜｯｶﾅｲ,稚内市開運,稚内市開運　稚内地方気象台,45,24.9,141,40.7,3,24.1,－,#昭50.4.1,11903,－
1,宗谷地方,北見枝幸,https://www.data.jma.go.jp/obd/stats/etrn/inde...,宗谷,11291,官,北見枝幸,ｷﾀﾐｴｻｼ,枝幸町枝幸,枝幸郡枝幸町本町　北見枝幸特別地域気象観測所,44,56.4,142,35.1,7,17.2,－,#昭51.1.1,11917,－
2,宗谷地方,本泊,https://www.data.jma.go.jp/obd/stats/etrn/inde...,宗谷,11091,官,本泊,ﾓﾄﾄﾞﾏﾘ,利尻空港,利尻郡利尻富士町鴛泊字本泊　利尻航空気象観測所,45,14.5,141,11.2,30,10,－,平15.1.1,－,日照・湿度・気圧を除く
3,宗谷地方,声問,https://www.data.jma.go.jp/obd/stats/etrn/inde...,宗谷,11061,官,声問,ｺｴﾄｲ,稚内空港,稚内市大字声問村字声問　稚内航空気象観測所,45,24.2,141,48.1,8,10,－,平15.1.1,11904,日照・湿度・気圧を除く
4,上川地方,旭川,https://www.data.jma.go.jp/obd/stats/etrn/inde...,上川,12442,官,旭川,ｱｻﾋｶﾜ,旭川市宮前１条,旭川市宮前1条　旭川地方気象台,43,45.4,142,22.3,120,47,－,平16.9.9,12927,－
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,沖縄県,所野,https://www.data.jma.go.jp/obd/stats/etrn/inde...,沖縄,94011,官,所野,ﾄｺﾛﾉ,与那国空港,八重山郡与那国町字与那国　与那国航空気象観測所,24,28.0,122,58.7,15,9.8,－,平15.1.1,－,日照・湿度・気圧を除く
265,沖縄県,北大東,https://www.data.jma.go.jp/obd/stats/etrn/inde...,沖縄,92006,官,北大東,ｷﾀﾀﾞｲﾄｳ,北大東空港,島尻郡北大東村字南　北大東航空気象観測所,25,56.6,131,19.6,22,9.8,－,平15.1.1,－,日照・湿度・気圧を除く
266,沖縄県,旧東,https://www.data.jma.go.jp/obd/stats/etrn/inde...,沖縄,92012,官,旧東,ｷｭｳﾄｳ,南大東空港,島尻郡南大東村字旧東　南大東航空気象観測所,25,50.8,131,15.8,48,9.8,－,平15.1.1,－,日照・湿度・気圧を除く
267,沖縄県,北原,https://www.data.jma.go.jp/obd/stats/etrn/inde...,沖縄,91141,官,北原,ｷﾀﾊﾗ,久米島空港,島尻郡久米島町字北原　久米島航空気象観測所,26,21.8,126,42.8,7,10,－,平15.1.1,－,日照・湿度・気圧を除く


In [59]:
# Scratch check of the URL rewrite on one hard-coded station URL.
# The variable is named `sample_date`, not `date`, so that the `date` class
# imported from `datetime` is not rebound to a datetime instance (which would
# make later `date(2022,1,1)` calls fail with "object is not callable").
sample_date = datetime(2022,1,1)
a = "https://www.data.jma.go.jp/obd/stats/etrn/index.php?prec_no=11&block_no=47401&year=&month=&day=&view=".replace("index.php","view/hourly_s1.php")

# Keep the first three "="-separated pieces (up to "...&year") and append the
# formatted year/month/day query values.
a_split = a.split("=")
url = a_split[0]+"=" + a_split[1]+"=" + a_split[2]+sample_date.strftime("=%Y&month=%m&day=%d")
url

'https://www.data.jma.go.jp/obd/stats/etrn/view/hourly_s1.php?prec_no=11&block_no=47401&year=2022&month=01&day=01'