In [1]:
import pandas as pd
import glob
import re
from itertools import product

観測地点

In [2]:
# List of observation points (metadata CSV fetched from GitHub)
url = 'https://raw.githubusercontent.com/Nikkei-Visual-Data-Journalism/Heatwave/main/data-maxtemp/meta/points_list.csv'
points = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Ingredients for the point x date combinations:
# the distinct prec_no values and every month start in the covered period
prec_no = set(points['prec_no'])
dates = pd.date_range(start='1950-01-01', end='2023-07-01', freq='MS')

取得済みのデータを呼び出し

In [4]:
# Root directory of the historical data stored per observation point
file_dir = './data-maxtemp/timeseries-data-by-points/'

In [5]:
# Files that have already been downloaded (any CSV below a prec-* directory)
file_list = list(
    glob.iglob(f'{file_dir}data-raw/prec-*/**/*.csv', recursive=True)
)

In [6]:
# Build an inventory of the downloaded files: one row per file holding the
# prec_no and year-month parsed from its path, a constant flag data=1 and the path.
pattern = r"/prec-(\d+)/jma-maxtemp-hs-\d+-(\d+)\.csv$"
data_list = []

for f in file_list:
    matched = re.search(pattern, f)
    if matched is None:
        # The glob accepts any *.csv under prec-*; a file that does not follow the
        # naming scheme carries no prec_no / year-month, so leave it out instead of
        # crashing on `None.groups()`.
        continue
    prec, yyyymm = matched.groups()
    data =  {'prec_no': int(prec), 'yyyymm':yyyymm,'data':1,'filepath':f}
    data_list.append(data)

# Explicit columns keep the frame usable even when nothing has been downloaded yet
retrieved = pd.DataFrame(data_list, columns=['prec_no', 'yyyymm', 'data', 'filepath'])
retrieved['yyyymm'] = pd.to_datetime(retrieved['yyyymm'], format='%Y%m')

In [7]:
# Preview the first rows of the inventory of downloaded files
retrieved.head(n=5)

Unnamed: 0,prec_no,yyyymm,data,filepath
0,31,2010-02-01,1,./data-maxtemp/timeseries-data-by-points/data-...
1,31,2003-08-01,1,./data-maxtemp/timeseries-data-by-points/data-...
2,31,2009-03-01,1,./data-maxtemp/timeseries-data-by-points/data-...
3,31,2014-12-01,1,./data-maxtemp/timeseries-data-by-points/data-...
4,31,2014-06-01,1,./data-maxtemp/timeseries-data-by-points/data-...


データを統合

In [None]:
# Merge the downloaded monthly files: one merged CSV per prec_no, plus a single
# table (data_all) holding every prec_no.
#
# Each prec_no is enriched, written and appended to data_all exactly once, after
# all of its monthly files have been read. Doing this per monthly file would append
# the cumulative table again and again and duplicate rows in data_all.

# Lookups that do not depend on the loop variables
# prec_no -> prefecture
pref_dic = points.set_index('prec_no').pref.to_dict()
# station numbers flagged as prefectural capitals
capitol = points[points.capitol==1]['観測所番号'].to_list()

merged_frames = []

for prec in prec_no:
    filepaths = retrieved[retrieved.prec_no==prec].dropna(subset='filepath').filepath.to_list()
    if not filepaths:
        # Nothing downloaded for this prec_no: nothing to merge or write
        continue

    monthly_frames = []
    for filepath in filepaths:
        data_monthly = pd.read_csv(filepath)

        # Values contain stray characters such as ")" or "]"; keep digits, "." and "-" only
        data_monthly['maxtemp'] = data_monthly.maxtemp.apply(lambda x: re.sub(r'[^\d\.-]', '', str(x)))
        data_monthly['maxtemp'] = pd.to_numeric(data_monthly.maxtemp, errors='coerce').astype(float)
        monthly_frames.append(data_monthly)

    # Combine all months of this prec_no in one go (avoids quadratic re-concatenation)
    data_agg = pd.concat(monthly_frames, ignore_index=True)
    # Prefecture
    data_agg['pref'] = data_agg.prec_no.map(pref_dic)
    # Prefectural-capital flag: 1 for capital stations, None otherwise
    data_agg['capitol'] = None
    data_agg.loc[data_agg.points_no.isin(capitol),'capitol'] = 1
    # Date and year
    data_agg['date'] = pd.to_datetime(data_agg.date)
    data_agg['year'] = data_agg.date.dt.year
    # Write the merged data of this prec_no
    output_dir = f'{file_dir}data-agg-by-points/jma-maxtemp-hs-{prec}-merged.csv'
    data_agg.to_csv(output_dir, index=False)
    # Keep it for the all-in-one table
    merged_frames.append(data_agg)

# pd.concat raises on an empty list, so fall back to an empty frame
data_all = pd.concat(merged_frames, ignore_index=True) if merged_frames else pd.DataFrame()

真夏日、猛暑日を計算

In [None]:
# Highest maxtemp within each prefecture, per date
df_count = (
    data_all
    .groupby(['date','year','pref'])['maxtemp']
    .max()
)

In [None]:
# Append the maxtemp observed at the prefectural-capital stations.
# The capital rows are aggregated with groupby/max rather than set_index so the
# (date, year, pref) index is guaranteed unique: repeated rows in data_all, or more
# than one capital-flagged station in a prefecture, would otherwise yield a
# non-unique index and make the column-wise concat fail. With unique keys the
# values are identical to a plain set_index.
capitol = (
    data_all[data_all.capitol==1]
    .groupby(['date','year','pref'])['maxtemp']
    .max()
    .rename('maxtemp_capitol')
)
df_count = pd.concat([df_count, capitol],axis=1)

In [None]:
# Boolean flags: does each temperature column reach 30 / 35 / 40?
over30, over35, over40 = (
    df_count.ge(threshold).add_prefix(f'over{threshold}_')
    for threshold in (30, 35, 40)
)

In [None]:
# Put the threshold flags next to the temperature columns
flag_frames = [over30, over35, over40]
df_count = pd.concat([df_count, *flag_frames], axis='columns')

In [None]:
# Drop the literal '_maxtemp' fragment from the column names
df_count.columns = [name.replace('_maxtemp', '') for name in df_count.columns]

In [None]:
# Constant 1 per row, summed later together with the flags
df_count = df_count.assign(count=1)

In [None]:
# Turn the (date, year, pref) index back into ordinary columns
df_count = df_count.reset_index(drop=False)

In [None]:
# Destination CSV for the per-prefecture daily table.
# NOTE(review): this rebinds `file_dir`, which earlier held the data directory;
# cells that build paths from it must run before this one.
file_dir = './data-maxtemp/timeseries-data/jma-maxtemp-temp-by-pref-ts.csv'

In [None]:
# Save the table without the row index
df_count.to_csv(path_or_buf=file_dir, index=False)

次のステップで最新データを入れてからやる

In [None]:
# To be computed in the next step: yearly totals of the flags per prefecture
cols = list(filter(lambda name: 'over' in name, df_count.columns))
summed_columns = [*cols, 'count']
df_count.groupby(['year','pref'])[summed_columns].sum()

対象データの一覧

In [None]:
# Container for download candidates: every (prec_no, month) pair
candidate_pairs = [(prec, month) for prec in prec_no for month in dates]
downloads = pd.DataFrame(candidate_pairs, columns=['prec_no', 'yyyymm'])

In [None]:
# Merge candidates with the files already retrieved; for a (prec_no, yyyymm) pair
# present in both, keep the later row (the retrieved one), then sort
downloads = (
    pd.concat([downloads, retrieved])
    .drop_duplicates(subset=['prec_no','yyyymm'], keep='last')
    .sort_values(by=['prec_no','yyyymm'])
)