# jma-maxtemp-00-STEP1-get-historical_data_by_points


* 作業内容
    * 気象庁[過去の気象データ検索](https://www.data.jma.go.jp/stats/etrn/index.php?prec_no=&block_no=&year=&month=&day=&view=)から各ブロックの「最高気温」のデータを取得

* 不定期実行（次のような場合に実行する）
    * 最初のデータ作成時
    * 過去データを遡及して取得し、期間を延ばしたいとき
    * 月次データの更新など（必要に応じて）
<br><br>
* 次のステップ（STEP2）
    * 出力されたブロック別データをもとに県別の日次の最高気温の推移を作成

In [1]:
import pandas as pd
import time
from datetime import datetime, date
import glob
import re

In [2]:
# Observation point data.
# Load the list of observation points (station number, prec_no, name, ...)
# from the project's GitHub repository; requires network access.
url = 'https://raw.githubusercontent.com/Nikkei-Visual-Data-Journalism/Heatwave/main/data-maxtemp/meta/points_list.csv'
points = pd.read_csv(url)

In [3]:
# Preview the first rows of the observation point list.
points.head()

Unnamed: 0,観測所番号,prec_no,国際地点番号,都道府県,name,地点,pref,capitol
0,11001,11,,北海道宗谷地方,宗谷岬,宗谷岬（ソウヤミサキ）,北海道,
1,11016,11,47401.0,北海道宗谷地方,稚内,稚内（ワッカナイ）,北海道,
2,11046,11,,北海道宗谷地方,礼文,礼文（レブン）,北海道,
3,11061,11,,北海道宗谷地方,声問,声問（コエトイ）,北海道,
4,11076,11,,北海道宗谷地方,浜鬼志別,浜鬼志別（ハマオニシベツ）,北海道,


In [4]:
def get_historical_maxtemp(prec_no, md):
    """Download one month of daily values for a JMA prefecture block.

    Reads the first HTML table of the JMA ``daily_h1.php`` page
    (``view=p3``; presumably the daily maximum temperature view, since the
    caller stores the values as ``maxtemp``) for the year and month of ``md``.

    Args:
        prec_no: Value placed in the ``prec_no`` query parameter.
        md: Date-like object providing ``year``, ``month`` and
            ``replace(day=...)``; the caller passes pandas Timestamps.

    Returns:
        DataFrame indexed by ``date`` (``md`` with its day replaced by the
        table's ``日`` value). The ``日`` column is dropped and ``*`` is
        removed from all column names.
    """
    # block_no "00" requests the whole area instead of a single station.
    block_no = '00'
    year, month = md.year, md.month

    # Requires network access; only the first table on the page is used.
    url = f'https://www.data.jma.go.jp/stats/etrn/view/daily_h1.php?prec_no={prec_no}&block_no={block_no}&year={year}&month={str(month).zfill(2)}&day=&view=p3'
    df = pd.read_html(url)[0]
    df.columns = df.columns.str.replace("*", "", regex=False)

    def _with_day(row):
        # Swap the day of the seeded date for this row's day-of-month value.
        return row['date'].replace(day=row['日'])

    # Seed every row with md first so that each row carries a date to adjust.
    df['date'] = md
    df['date'] = df.apply(_with_day, axis=1)
    return df.drop(['日'], axis=1).set_index('date')

取得済みのファイル（再取得を避けるため、保存済みの月次CSVを一覧化する）

In [5]:
# Show the current working directory (IPython magic).
%pwd

'/Users/mio/Documents/GitHub/Nikkei-Visual&Data/Heatwave'

In [6]:
# Root directory of the raw monthly CSV files, relative to the working directory.
file_dir = "./data-maxtemp/timeseries-data-by-points/data-raw/"

In [7]:
# Collect every CSV found at any depth under the "prec-*" folders of file_dir.
file_list = glob.glob(f'{file_dir}prec-*/**/*.csv', recursive=True)

In [8]:
# Build an index of the monthly files already on disk, one row per file:
#   prec_no (int), yyyymm (str), data (always 1, a presence marker).
#
# Expected file path: ".../prec-<n>/jma-maxtemp-hs-<n>-<yyyymm>.csv".
# Both "/" and "\" are accepted as separators so that paths returned by glob
# on Windows match as well.
pattern = r"[\\/]prec-(\d+)[\\/]jma-maxtemp-hs-\d+-(\d+)\.csv$"

records = []
for f in file_list:
    match = re.search(pattern, f)
    if match is None:
        # The recursive glob can return CSVs that are not monthly data files
        # (different name or nesting); skip them instead of crashing.
        continue
    prec, yyyymm = match.groups()
    records.append({'prec_no': int(prec), 'yyyymm': yyyymm, 'data': 1})

# Create the frame once: this avoids quadratic pd.concat calls in the loop,
# gives each row a unique index, and still yields the three expected columns
# when no file has been retrieved yet.
retrieved = pd.DataFrame(records, columns=['prec_no', 'yyyymm', 'data'])

In [9]:
# Preview the index of files that have already been retrieved.
retrieved.head()

Unnamed: 0,prec_no,yyyymm,data
0,64,201002,1
0,64,200308,1
0,64,201610,1
0,64,201604,1
0,64,200903,1


取得対象期間（月単位）

In [10]:
# Month-end dates, one per month to fetch: 2000-01 through 2023-07.
# (The end bound 2023-08-01 precedes the August month end, so August is excluded.)
# An explicit MonthEnd offset is used instead of the 'M' alias, which newer
# pandas releases deprecate in favour of 'ME'; the offset object yields the
# same dates on both old and new versions.
dates = pd.date_range('2000-01-01', '2023-08-01', freq=pd.offsets.MonthEnd())

In [None]:
import os

# Download every (prefecture block, month) combination that is not on disk yet
# and write it as one CSV per block and month. Failures are collected in
# `errors` (date, prec_no, error) so they can be inspected or retried later.
errors = []

# (prec_no, yyyymm) pairs already retrieved. A set makes each skip check a
# constant-time lookup instead of filtering the whole DataFrame per iteration.
already_retrieved = set(zip(retrieved.prec_no, retrieved.yyyymm))

for prec in points.prec_no.unique():
    points_n = points[points.prec_no == prec]
    # Station name -> station number, used to tag each downloaded series.
    points_n_dic = points_n.set_index('name')['観測所番号'].to_dict()
    out_dir = f"{file_dir}prec-{prec}"

    for md in dates:
        yyyymm = md.strftime('%Y%m')
        if (prec, yyyymm) in already_retrieved:
            # File already retrieved; nothing to do.
            continue

        # NOTE(review): no pause between requests; consider time.sleep(1)
        # here to throttle the load on the JMA server.
        try:
            by_prec = get_historical_maxtemp(prec, md)
            # Wide (date x station) -> long (name, date, maxtemp).
            by_prec = by_prec.rename_axis('name', axis=1).unstack().rename('maxtemp').reset_index()
            by_prec['prec_no'] = prec
            by_prec['points_no'] = by_prec.name.map(points_n_dic)

            # Output monthly data. Create the block's folder on first use;
            # to_csv does not create missing directories.
            os.makedirs(out_dir, exist_ok=True)
            filename = f"jma-maxtemp-hs-{prec}-{yyyymm}.csv"
            by_prec.to_csv(f"{out_dir}/{filename}", index=False)
        except Exception as e:
            # `Exception` rather than a bare except, so Ctrl-C
            # (KeyboardInterrupt) can still stop a long run. The cause is
            # kept so failures can be diagnosed.
            errors.append({'date': md, 'prec_no': prec, 'error': repr(e)})
            print('FAILED:', prec, md, repr(e))
        else:
            print('OK:', prec, md)

OK: 35 2018-06-30 00:00:00
OK: 35 2023-02-28 00:00:00
OK: 36 2000-02-29 00:00:00
OK: 48 2018-03-31 00:00:00
OK: 73 2004-03-31 00:00:00
OK: 73 2004-08-31 00:00:00
OK: 73 2004-10-31 00:00:00
OK: 73 2005-06-30 00:00:00
OK: 73 2005-07-31 00:00:00
OK: 73 2005-09-30 00:00:00
OK: 73 2008-12-31 00:00:00
OK: 73 2009-01-31 00:00:00
OK: 73 2009-02-28 00:00:00
OK: 73 2009-03-31 00:00:00
OK: 73 2009-04-30 00:00:00
OK: 73 2009-05-31 00:00:00
OK: 73 2009-06-30 00:00:00
OK: 73 2009-07-31 00:00:00
OK: 73 2009-08-31 00:00:00
OK: 73 2009-09-30 00:00:00
OK: 73 2009-10-31 00:00:00
OK: 73 2009-11-30 00:00:00
OK: 73 2009-12-31 00:00:00
OK: 73 2010-01-31 00:00:00
OK: 73 2010-02-28 00:00:00
OK: 73 2010-03-31 00:00:00
OK: 73 2010-04-30 00:00:00
OK: 73 2010-05-31 00:00:00
OK: 73 2010-06-30 00:00:00
OK: 73 2010-07-31 00:00:00
OK: 73 2010-08-31 00:00:00
OK: 73 2010-09-30 00:00:00
OK: 73 2010-10-31 00:00:00
OK: 73 2010-11-30 00:00:00
OK: 73 2010-12-31 00:00:00
OK: 73 2011-01-31 00:00:00
