In [1]:
import pandas as pd
import glob
import re
from itertools import product

観測地点

In [2]:
# List of observation points: a CSV hosted in the project's GitHub repository,
# fetched over the network each time this cell runs.
url = 'https://raw.githubusercontent.com/Nikkei-Visual-Data-Journalism/Heatwave/main/data-maxtemp/meta/points_list.csv'
points = pd.read_csv(url)

In [3]:
# The two axes used later to build every (prec_no, month) combination:
# the distinct prec_no values found in the point list, and one month-start
# timestamp per month from 1950-01 through 2023-07.
prec_no = set(points['prec_no'])
dates = pd.date_range(start='1950-01-01', end='2023-07-01', freq='MS')

取得済みのデータを呼び出し

In [4]:
# Root folder of the per-point historical data, relative to the working
# directory. The trailing slash matters: other cells append sub-paths to it
# directly inside f-strings.
file_dir = "./data-maxtemp/timeseries-data-by-points/"

In [5]:
# Files that have already been downloaded: every CSV found at any depth
# below a "prec-*" folder of the raw-data directory.
raw_csv_glob = f'{file_dir}data-raw/prec-*/**/*.csv'
file_list = glob.glob(raw_csv_glob, recursive=True)

In [6]:
# Build `retrieved`: one row per downloaded file, with the prec_no taken from
# the "prec-<n>" folder name and the month taken from the "-<yyyymm>.csv"
# suffix of the file name. Either path separator is accepted so that
# Windows-style paths returned by glob are parsed as well.
pattern = r"[\\/]prec-(\d+)[\\/]jma-maxtemp-hs-\d+-(\d+)\.csv$"
data_list = []
unmatched_files = []

for f in file_list:
    matched = re.search(pattern, f)
    if matched is None:
        # The recursive glob may return CSVs that do not follow the naming
        # scheme (for instance in nested folders). Record and skip them
        # rather than failing on `None.groups()`.
        unmatched_files.append(f)
        continue
    prec, yyyymm = matched.groups()
    # 'data': 1 flags the row as "already retrieved".
    data = {'prec_no': int(prec), 'yyyymm': yyyymm, 'data': 1, 'filepath': f}
    data_list.append(data)

if unmatched_files:
    # Surface skipped paths so that missing data is never silent.
    print(f'Skipped {len(unmatched_files)} file(s) with an unexpected path, e.g. {unmatched_files[:5]}')

# Explicit columns keep the frame (and the date conversion below) usable even
# when no file has been downloaded yet.
retrieved = pd.DataFrame(data_list, columns=['prec_no', 'yyyymm', 'data', 'filepath'])
retrieved['yyyymm'] = pd.to_datetime(retrieved['yyyymm'], format='%Y%m')

In [7]:
# Preview the first rows of the index of downloaded files.
retrieved.head()

Unnamed: 0,prec_no,yyyymm,data,filepath
0,14,2001-12-01,1,./data-maxtemp/timeseries-data-by-points/data-...
1,14,2001-06-01,1,./data-maxtemp/timeseries-data-by-points/data-...
2,14,2003-10-01,1,./data-maxtemp/timeseries-data-by-points/data-...
3,14,2003-04-01,1,./data-maxtemp/timeseries-data-by-points/data-...
4,14,2005-02-01,1,./data-maxtemp/timeseries-data-by-points/data-...


データを地点ごとに統合

In [None]:
# Merge the monthly CSV files of each prec_no into a single CSV per prec_no.
for prec in prec_no:
    # Files already downloaded for this prec_no, in chronological order so the
    # merged file does not depend on the arbitrary order in which glob listed
    # them.
    filepaths = (
        retrieved[retrieved.prec_no == prec]
        .dropna(subset=['filepath'])
        .sort_values('yyyymm')
        .filepath.to_list()
    )

    monthly_frames = []
    for filepath in filepaths:
        data_monthly = pd.read_csv(filepath)

        # Raw values can contain stray characters such as ")" or "]".
        # Keep only digits, the decimal point and the minus sign, then convert;
        # anything that still is not a number becomes NaN.
        cleaned = data_monthly['maxtemp'].map(lambda x: re.sub(r'[^\d\.-]', '', str(x)))
        data_monthly['maxtemp'] = pd.to_numeric(cleaned, errors='coerce').astype(float)
        monthly_frames.append(data_monthly)

    if not monthly_frames:
        # Nothing downloaded for this prec_no: leave an empty frame behind and
        # write no file.
        data_agg = pd.DataFrame()
        continue

    # Concatenate once and write once per prec_no, instead of growing the frame
    # and rewriting the CSV after every monthly file.
    data_agg = pd.concat(monthly_frames)
    # Note: despite its name, `output_dir` is the path of the output file.
    output_dir = f'{file_dir}data-agg-by-points/jma-maxtemp-hs-{prec}-merged.csv'
    data_agg.to_csv(output_dir, index=False)

In [None]:
# Inspect `data_agg`. At this point it only holds what the last iteration of
# the merge loop left in it (a single prec_no), not the data of all points.
data_agg

対象データの一覧

In [None]:
#取得候補を入れる箱
downloads = pd.DataFrame(list(product(prec_no, dates)), columns=['prec_no', 'yyyymm'])

In [None]:
#統合
downloads = pd.concat([downloads, retrieved])
downloads = downloads[~downloads.duplicated(subset=['prec_no','yyyymm'],keep='last')]
downloads = downloads.sort_values(by=['prec_no','yyyymm'])