In [1]:
import sys
sys.path.append("./src")

from river_helpers import *
from config import (dam_data_folder, rain_data_folder, discharge_data_folder,
                    dam_details_path, rain_details_path, discharge_details_path)

# 1. Scrape dam data

In [4]:
# Selects the dam listing: `measure` indexes `n_pages` and is passed to
# `get_listing_page` in the cells of this section.
measure = "dam"

#### Step 1: Get dam info

In [6]:
# Pass 1: walk every listing page for the current `measure` and gather the ids.
idxs = []
for page_no in tqdm(range(n_pages[measure])):
    idxs.extend(parse_idx(get_listing_page(page_no, measure=measure)))

# Pass 2: fetch and parse the detail page of each id, keyed by that id.
details = {}
for entry_id in tqdm(idxs):
    details[entry_id] = parse_details(get_detail_page(entry_id))

# Persist the parsed details so later steps can reload them from disk.
with open(dam_details_path, "wb") as out_file:
    pickle.dump(details, out_file)

100%|██████████| 14/14 [00:02<00:00,  4.88it/s]


#### Step 2: Get availability

In [12]:
# Build `periods` (id -> parsed availability) for every entry whose "info"
# field contains 1; the remaining ids are only reported.
periods = {}
for entry_id, detail in tqdm(details.items()):
    if 1 not in detail["info"]:
        print(f"{entry_id} has no available data")
        continue
    raw_availability = get_dam_availability(entry_id)
    periods[entry_id] = parse_availability(raw_availability)

  3%|▎         | 4/131 [00:00<00:03, 35.23it/s]

1363230375020 has no available data
1363230375080 has no available data
1363230375130 has no available data


100%|██████████| 131/131 [00:15<00:00,  8.61it/s]

609999999999001 has no available data





#### Step 3: Scrape data

In [15]:
# Download every (start, end) period of each dam and store one pickle per dam.
# Ids whose download or parsing raises are collected in `errors` so that the
# remaining dams are still processed.
errors = []
# The pickles below are written into dam_data_folder, so that is the directory
# to create; exist_ok=True lets the cell be re-run without raising.
os.makedirs(dam_data_folder, exist_ok=True)

for idx, period in periods.items():
    try:
        chunks = [get_dam_data(idx, start, end) for (start, end) in tqdm(period)]
        data = pd.concat(chunks)
        data["timestamp"] = parse_timestamp(data)
        data.to_pickle(pj(dam_data_folder, str(idx)))
    except Exception as e:
        # Best-effort scrape: report the failure and carry on with the next id.
        print(f"Error {e} happening for {idx}")
        errors.append(idx)

100%|██████████| 228/228 [01:20<00:00,  2.83it/s]
100%|██████████| 240/240 [01:27<00:00,  2.74it/s]
100%|██████████| 168/168 [01:02<00:00,  2.70it/s]
100%|██████████| 240/240 [01:28<00:00,  2.71it/s]
100%|██████████| 12/12 [00:02<00:00,  5.83it/s]
100%|██████████| 240/240 [01:25<00:00,  2.82it/s]
100%|██████████| 240/240 [01:27<00:00,  2.75it/s]
100%|██████████| 240/240 [01:28<00:00,  2.71it/s]
100%|██████████| 240/240 [01:27<00:00,  2.76it/s]
100%|██████████| 240/240 [01:27<00:00,  2.74it/s]
100%|██████████| 240/240 [01:28<00:00,  2.71it/s]
100%|██████████| 240/240 [01:28<00:00,  2.73it/s]
100%|██████████| 240/240 [01:30<00:00,  2.64it/s]
100%|██████████| 240/240 [01:27<00:00,  2.75it/s]
100%|██████████| 240/240 [01:28<00:00,  2.72it/s]
100%|██████████| 240/240 [01:27<00:00,  2.75it/s]
100%|██████████| 240/240 [01:26<00:00,  2.78it/s]
100%|██████████| 240/240 [01:28<00:00,  2.71it/s]
100%|██████████| 240/240 [01:29<00:00,  2.67it/s]
100%|██████████| 240/240 [01:28<00:00,  2.73it/s]
10

In [None]:
# Number of ids that the download loop appended to `errors`.
print(len(errors))

# 2. Scrape rain data

In [None]:
# Selects the rain listing: `measure` indexes `n_pages` and is passed to
# `get_listing_page` in the cells of this section.
measure = "rain"

#### Step 1: Get rain gauge info

In [28]:
# Listing page 55 cannot be downloaded, so the ids it lists are entered by hand
# and every other page is fetched normally.
idxs = [102081282211030, 102081282211040, 102081282211050, 102081282211060, 102081282211070,
        102081282211080, 102081282211090, 102081282211100, 102081282211110, 102081282211120]
for page_idx in tqdm(list(range(55)) + list(range(56, n_pages[measure]))):
    page = get_listing_page(page_idx, measure=measure)
    idxs += parse_idx(page)

# Start from an empty dict so that entries left in `details` by an earlier
# section are not carried over into the rain pickle.
details = {}

# The ids at list positions 2106-2110 and 2765-2766 are skipped.
# NOTE(review): the original note read "Error processing 2107-2110 and 2566",
# which does not exactly match these slices -- confirm which positions fail.
for idx in tqdm(idxs[:2106] + idxs[2111:2765] + idxs[2767:]):
    html = get_detail_page(idx)
    detail = parse_details(html)
    # Self-equality is False for NaN, so this keeps only entries whose
    # "緯度経度" field is not NaN (presumably NaN marks a missing value -- verify
    # against parse_details).
    if detail["緯度経度"] == detail["緯度経度"]:
        detail["coordinates"] = parse_loc(detail["緯度経度"])
        details[idx] = detail

# Persist the rain details to the path imported from config.
with open(rain_details_path, "wb") as f:
    pickle.dump(details, f)

#### Step 2: Get availability

In [89]:
# Build `periods` (id -> parsed availability) for every entry whose "info"
# field contains 1; the remaining ids are only reported.
periods = {}
for entry_id, detail in tqdm(details.items()):
    if 1 not in detail["info"]:
        print(f"{entry_id} has no available data")
        continue
    raw_availability = get_rain_availability(entry_id)
    periods[entry_id] = parse_availability(raw_availability)
# Displayed as the cell result: how many ids have availability information.
len(periods)

  1%|          | 17/2767 [00:02<04:18, 10.62it/s]

101011281104170 has no available data


  9%|▊         | 237/2767 [00:28<04:08, 10.17it/s]

101071281105170 has no available data


 24%|██▍       | 664/2767 [01:21<03:05, 11.32it/s]

102111282221100 has no available data


 28%|██▊       | 785/2767 [01:36<03:17, 10.04it/s]

103031283307090 has no available data


 45%|████▍     | 1242/2767 [02:31<00:41, 36.83it/s]

1041110624110 has no available data
1041110624120 has no available data
1041110724100 has no available data
1041110724170 has no available data
1041110724180 has no available data
1041110829370 has no available data
1041110829380 has no available data
1041110829590 has no available data
1041110829650 has no available data
1041110829680 has no available data


 50%|████▉     | 1371/2767 [02:47<02:05, 11.11it/s]

105051285516330 has no available data


 51%|█████     | 1400/2767 [02:50<02:14, 10.14it/s]

105061285512230 has no available data


 54%|█████▍    | 1500/2767 [03:02<01:56, 10.86it/s]

105091285506010 has no available data


 54%|█████▍    | 1502/2767 [03:02<01:38, 12.78it/s]

105091285506020 has no available data


 54%|█████▍    | 1506/2767 [03:03<01:35, 13.22it/s]

105091285506030 has no available data


 54%|█████▍    | 1508/2767 [03:03<01:25, 14.65it/s]

105091285506040 has no available data


 55%|█████▍    | 1512/2767 [03:03<01:36, 13.02it/s]

105091285506051 has no available data
105091285506070 has no available data


 55%|█████▌    | 1533/2767 [03:05<01:47, 11.47it/s]

105091285519130 has no available data


 56%|█████▋    | 1559/2767 [03:08<01:47, 11.28it/s]

105121285514200 has no available data


 57%|█████▋    | 1591/2767 [03:11<01:12, 16.23it/s]

106011286622080 has no available data
106011286622090 has no available data
106011286622100 has no available data


 60%|██████    | 1674/2767 [03:21<01:36, 11.34it/s]

106041286603250 has no available data


 61%|██████    | 1678/2767 [03:22<01:32, 11.81it/s]

106041286603300 has no available data


 61%|██████▏   | 1700/2767 [03:24<01:40, 10.65it/s]

106041286606160 has no available data


 62%|██████▏   | 1706/2767 [03:25<01:40, 10.51it/s]

106041286606220 has no available data


 63%|██████▎   | 1756/2767 [03:31<01:15, 13.33it/s]

106041286617380 has no available data
106041286617390 has no available data
106041286617521 has no available data


 78%|███████▊  | 2159/2767 [04:20<00:53, 11.32it/s]

108011288805903 has no available data


 80%|████████  | 2220/2767 [04:26<00:37, 14.52it/s]

108031288809911 has no available data
108031288809912 has no available data


 81%|████████  | 2247/2767 [04:29<00:32, 15.93it/s]

108041288808200 has no available data
108041288808210 has no available data
108041288808220 has no available data


 85%|████████▌ | 2353/2767 [04:42<00:26, 15.63it/s]

108081288804010 has no available data
108081288804011 has no available data
108081288804012 has no available data


 96%|█████████▋| 2669/2767 [05:25<00:09, 10.42it/s]

1361130130110 has no available data


 97%|█████████▋| 2693/2767 [05:28<00:07, 10.49it/s]

1361130478050 has no available data


 98%|█████████▊| 2700/2767 [05:28<00:06, 10.91it/s]

1361150931780 has no available data
1361150931990 has no available data


 99%|█████████▊| 2728/2767 [05:32<00:03, 11.37it/s]

1361160473380 has no available data


 99%|█████████▉| 2738/2767 [05:32<00:02, 13.15it/s]

1361160935050 has no available data
1361160935090 has no available data
1361180361040 has no available data


 99%|█████████▉| 2740/2767 [05:32<00:01, 14.08it/s]

1361180730340 has no available data
1361180730360 has no available data
1361180730380 has no available data
1361180730400 has no available data
1361180775130 has no available data


 99%|█████████▉| 2753/2767 [05:33<00:00, 19.66it/s]

1361191465021 has no available data
1361191465061 has no available data
1361191465070 has no available data
1361191565041 has no available data
2041110332690 has no available data


100%|██████████| 2767/2767 [05:35<00:00,  8.25it/s]


2706

#### Step 3: Download all data

In [None]:
# Download every (start, end) period of each rain gauge and store one pickle
# per gauge. Ids whose download or parsing raises are collected in `errors` so
# that the remaining gauges are still processed.
# `rain_data_folder` is the rain output directory imported from config;
# exist_ok=True lets the cell be re-run without raising.
os.makedirs(rain_data_folder, exist_ok=True)
errors = []
for idx, period in periods.items():
    try:
        chunks = [get_rain_data(idx, start, end) for (start, end) in tqdm(period)]
        data = pd.concat(chunks)
        data["timestamp"] = parse_timestamp(data)
        data.to_pickle(pj(rain_data_folder, str(idx)))
    except Exception as e:
        # Best-effort scrape: report the failure once and carry on.
        print(f"Error {e} happening with {idx}")
        errors.append(idx)

# 3. Discharge data

In [None]:
# Selects the discharge gauge listing: `measure` indexes `n_pages` and is passed
# to `get_listing_page` in the cells of this section.
measure = "gauge"

#### Step 1: Get discharge station info

In [7]:
# Pass 1: walk every listing page for the current `measure` and gather the ids.
idxs = []
for page_idx in tqdm(range(n_pages[measure])):
    page = get_listing_page(page_idx, measure=measure)
    idxs += parse_idx(page)

# Pass 2: fetch and parse the detail page of each id, keyed by that id.
details = {}
for idx in tqdm(idxs):
    html = get_detail_page(idx)
    details[idx] = parse_details(html)

# Persist to the discharge details path imported from config (the name
# `detail_path` used previously is not defined in the visible cells).
with open(discharge_details_path, "wb") as f:
    pickle.dump(details, f)

100%|██████████| 220/220 [02:14<00:00,  1.63it/s]


#### Step 2: Availability

In [14]:
# Build `periods` (id -> parsed availability) for every entry whose "info"
# field contains 1; the remaining ids are only reported.
periods = {}
for entry_id, detail in tqdm(details.items()):
    if 1 not in detail["info"]:
        print(f"{entry_id} has no available data")
        continue
    raw_availability = get_gauge_availability(entry_id)
    periods[entry_id] = parse_availability(raw_availability)

  0%|          | 9/2121 [00:00<01:02, 33.89it/s]

1042110624080 has no available data
1042110624090 has no available data
1042110724060 has no available data
1042110724070 has no available data
1042110931140 has no available data
1042110931910 has no available data


  3%|▎         | 58/2121 [00:05<02:54, 11.81it/s]

1362140331010 has no available data


  3%|▎         | 60/2121 [00:06<03:10, 10.79it/s]

1362160341290 has no available data
1362160341300 has no available data
1362160341320 has no available data


  4%|▎         | 75/2121 [00:07<02:52, 11.89it/s]

1362160700010 has no available data


 21%|██        | 441/2121 [00:53<02:30, 11.16it/s]

302021282206040 has no available data


 21%|██        | 445/2121 [00:53<02:15, 12.39it/s]

302021282206990 has no available data


 26%|██▌       | 555/2121 [01:06<02:24, 10.86it/s]

30204128222030 has no available data


 27%|██▋       | 576/2121 [01:08<01:39, 15.46it/s]

302051282201960 has no available data
302051282201970 has no available data
302051282201980 has no available data


 29%|██▊       | 609/2121 [01:11<00:58, 25.94it/s]

302071282201920 has no available data
302071282201930 has no available data
302071282201940 has no available data
302071282201950 has no available data
302071282201960 has no available data
302071282201970 has no available data
302071282201980 has no available data
302071282201990 has no available data


 31%|███       | 651/2121 [01:16<02:06, 11.61it/s]

302091282210910 has no available data


 34%|███▍      | 728/2121 [01:26<02:10, 10.68it/s]

302111282221080 has no available data


 54%|█████▍    | 1146/2121 [02:18<01:23, 11.66it/s]

305031285523030 has no available data


 63%|██████▎   | 1327/2121 [02:38<01:09, 11.51it/s]

306011286622030 has no available data


 63%|██████▎   | 1329/2121 [02:38<01:00, 13.04it/s]

306011286622060 has no available data


 63%|██████▎   | 1333/2121 [02:39<01:02, 12.52it/s]

306021286614035 has no available data


 65%|██████▍   | 1377/2121 [02:44<01:07, 11.07it/s]

306041286603010 has no available data


 66%|██████▌   | 1393/2121 [02:45<00:54, 13.28it/s]

306041286603160 has no available data
306041286603170 has no available data


 66%|██████▌   | 1403/2121 [02:46<00:32, 22.30it/s]

306041286603220 has no available data
306041286603230 has no available data
306041286603240 has no available data
306041286603250 has no available data
306041286603260 has no available data
306041286603270 has no available data


 68%|██████▊   | 1442/2121 [02:50<01:05, 10.34it/s]

306041286608035 has no available data


 69%|██████▉   | 1459/2121 [02:52<01:01, 10.83it/s]

306041286617050 has no available data


 69%|██████▉   | 1461/2121 [02:52<00:55, 11.99it/s]

306041286617080 has no available data


 69%|██████▉   | 1472/2121 [02:53<00:49, 13.00it/s]

306041286617180 has no available data
306041286617260 has no available data


 70%|███████   | 1490/2121 [02:55<00:55, 11.40it/s]

306051286611140 has no available data


 73%|███████▎  | 1545/2121 [03:02<00:39, 14.40it/s]

306091286605210 has no available data
306091286605220 has no available data
306091286605230 has no available data
306091286605250 has no available data
306091286605260 has no available data
306091286605270 has no available data
306091286605280 has no available data


 84%|████████▎ | 1774/2121 [03:27<00:17, 19.50it/s]

308011288805903 has no available data
308011288805904 has no available data
308011288805905 has no available data
308011288805906 has no available data
308011288805907 has no available data


 87%|████████▋ | 1851/2121 [03:35<00:18, 14.29it/s]

308061288803102 has no available data
308061288803103 has no available data


 89%|████████▉ | 1891/2121 [03:39<00:16, 13.80it/s]

308081288804009 has no available data
308081288804010 has no available data


100%|██████████| 2121/2121 [04:07<00:00,  8.56it/s]


#### Step 3: download

In [None]:
# Download every (start, end) period of each discharge gauge and store one
# pickle per gauge. Ids whose download or parsing raises are collected in
# `errors` so that the remaining gauges are still processed.
errors = []
# `discharge_data_folder` is the discharge output directory imported from
# config; exist_ok=True lets the cell be re-run without raising.
os.makedirs(discharge_data_folder, exist_ok=True)

for idx, period in tqdm(periods.items()):
    try:
        chunks = [get_gauge_data(idx, start, end) for (start, end) in period]
        data = pd.concat(chunks)
        data["timestamp"] = parse_timestamp(data)
        data.to_pickle(pj(discharge_data_folder, str(idx)))
    except Exception as e:
        # Best-effort scrape: report the failure and carry on with the next id.
        print(f"Error {e} happening with {idx}")
        errors.append(idx)

In [None]:
# Number of ids that the download loop appended to `errors`.
print(len(errors))

# 4. Wikipedia meta data

In [None]:
# Pickle path that the Wikipedia metadata frame is written to.
# NOTE(review): ROOT is not defined in the visible cells -- presumably it is
# provided by `from river_helpers import *`; confirm.
wikipedia_detail_path = f"{ROOT}/wiki_details.pkl"

In [4]:
# Reload the dam details written in section 1, using the path name imported
# from config. pickle.load executes arbitrary code from the file, so only use
# it on pickles this notebook produced itself.
with open(dam_details_path, "rb") as f:
    dams = pickle.load(f)

# Keep only the part of each "観測所名" value before the first full-width "（".
dam_names = {k: v["観測所名"].split("（")[0] for k, v in dams.items()}
# One `query` call per dam name; results become one DataFrame row per dam id.
# NOTE(review): `query` presumably looks the name up on Wikipedia -- confirm
# in river_helpers.
res = {k: query(v) for k, v in dam_names.items()}
df = pd.DataFrame(res).T
# True when "利用目的" is a list with at least one item containing "発電";
# non-list values count as False.
df["水力発電"] = df["利用目的"].apply(
    lambda x: any("発電" in y for y in x) if isinstance(x, list) else False
)
df.to_pickle(wikipedia_detail_path)
# Kept as the last expression so the notebook actually displays it: the share
# of rows flagged True.
df["水力発電"].mean()