# Extract exactly which files were unable to load during extraction of CAPPI Data

In [49]:
import numpy as np
import pandas as pd
from glob import glob
import datetime
from pyproj import Transformer
import matplotlib.pyplot as plt
import wradlib as wrl
import re
import geopandas as gpd

In [68]:
# Drainage and raingauge url
URL_gauge_data = 'C:/Users/sverrirhd/Google Drive/Skóli/DTU/Thesis/Data/Rain gauges/'
URL_gauge_2020_file = URL_gauge_data + 'clean2020data1475.csv'
URL_saved_sensor_data = URL_gauge_data + 'hbs_lysigogn.csv'
DIR_data = 'C:/Users/sverrirhd/vedurgogn/'
DIR_rain_gauges = glob(URL_gauge_data + '*cleaned*')

# Radar urls
# Alternative location: DIR_CAPPI = 'C:/Users/sverrirhd/OneDrive/CAPPI/'
DIR_CAPPI = 'F:/CAPPI/'


def split_out_2020(paths):
    """Sort paths and split them on whether the path text contains '2020'.

    glob() does not promise any particular ordering, and the table built
    below pairs the five file families purely by position, so every family
    is sorted before it is split.

    Parameters
    ----------
    paths : iterable of str
        File paths of one family (e.g. everything matching 'urls*').

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The sorted paths without '2020' and the sorted paths with '2020'.
    """
    paths = np.array(sorted(paths))
    is_2020 = np.array(['2020' in path for path in paths], dtype=bool)
    return paths[~is_2020], paths[is_2020]


urls_cappi, urls_cappi_2020 = split_out_2020(
    glob(DIR_CAPPI + 'CAPPI_[0-9][0-9][0-9][0-9][0-9][0-9]*'))
urls_cappi_mask, urls_cappi_mask_2020 = split_out_2020(glob(DIR_CAPPI + 'CAPPI_MASK*'))
urls_descr, urls_descr_2020 = split_out_2020(glob(DIR_CAPPI + 'descriptive_data*'))
urls_probl, urls_probl_2020 = split_out_2020(glob(DIR_CAPPI + 'problematic_indexes*'))
urls_urls, urls_urls_2020 = split_out_2020(glob(DIR_CAPPI + 'urls*'))

# One row per CAPPI file, with the companion files of the same position next to it.
# Building from a dict makes pandas raise right away when the families do not
# all have the same number of files, instead of padding the short ones.
df_file_urls = pd.DataFrame(
    {
        'CAPPI': urls_cappi,
        'CAPPI_mask': urls_cappi_mask,
        'descriptive_data': urls_descr,
        'problematic_indexes': urls_probl,
        'urls': urls_urls,
    },
    dtype=object,
)

# First run of six digits in the CAPPI path, e.g. '201501'.
df_file_urls['yearmonth'] = (
    df_file_urls['CAPPI']
    .str.findall('[0-9][0-9][0-9][0-9][0-9][0-9]')
    .apply(lambda found: found[0])
)

# 'hdf5' when the CAPPI path mentions it, otherwise 'h5'.
df_file_urls['filetype'] = (
    df_file_urls['CAPPI']
    .str.contains('hdf5', regex=False)
    .map({True: 'hdf5', False: 'h5'})
)


In [None]:
# Maps a (yearmonth, filetype) key to the problematic indexes loaded for it.
problematic_urls = dict()

In [75]:
# Load the saved problematic indexes of every row of df_file_urls and store
# them under the row's (yearmonth, filetype) key.
# Takes about 90s (on hard drive, but more like 20 minutes from external hard drive)
prob_columns = zip(
    df_file_urls['problematic_indexes'],
    df_file_urls['yearmonth'],
    df_file_urls['filetype'],
)
for prob_path, year_month, filetype in prob_columns:
    key = (year_month, filetype)
    print(key)
    problematic_urls[key] = np.load(prob_path, allow_pickle=True)

('201501', 'h5')
('201502', 'h5')
('201503', 'h5')
('201504', 'h5')
('201505', 'h5')
('201506', 'h5')
('201507', 'h5')
('201508', 'h5')
('201509', 'h5')
('201510', 'h5')
('201511', 'h5')
('201512', 'h5')
('201601', 'h5')
('201602', 'h5')
('201603', 'h5')
('201604', 'h5')
('201605', 'h5')
('201606', 'h5')
('201607', 'h5')
('201608', 'h5')
('201609', 'h5')
('201610', 'h5')
('201611', 'h5')
('201612', 'h5')
('201701', 'h5')
('201702', 'h5')
('201703', 'h5')
('201704', 'h5')
('201705', 'h5')
('201706', 'h5')
('201707', 'h5')
('201708', 'h5')
('201709', 'h5')
('201710', 'h5')
('201711', 'h5')
('201712', 'h5')
('201801', 'h5')
('201801', 'hdf5')
('201802', 'h5')
('201802', 'hdf5')
('201803', 'h5')
('201803', 'hdf5')
('201804', 'hdf5')
('201805', 'hdf5')
('201806', 'hdf5')
('201807', 'hdf5')
('201808', 'hdf5')
('201809', 'hdf5')
('201810', 'hdf5')
('201811', 'hdf5')
('201812', 'hdf5')
('201901', 'hdf5')
('201902', 'hdf5')
('201903', 'hdf5')
('201904', 'hdf5')
('201905', 'hdf5')
('201906', 'hd

# Recreate the index dataframe

In [80]:
# Load the per-file metadata table and derive a 'yyyymm' label from each ctime.
DIR_vedur = 'F:/'
df_meta = pd.read_csv('./Analysis/file_metadata.csv', index_col=0)

# Point the stored urls at the current drive. The old prefix is a plain string,
# so the replacement is requested as literal text explicitly rather than relying
# on the pandas default for `regex`, which differs between pandas versions.
df_meta['url'] = df_meta['url'].str.replace(
    'C:/Users/sverrirhd/vedurgogn//', DIR_vedur, regex=False)

df_meta['ctime'] = pd.to_datetime(df_meta['ctime'])

# Four-digit year followed by the zero-padded month, e.g. '201501'.
df_meta['year and month'] = df_meta['ctime'].apply(
    lambda ts: '%d%02d' % (ts.year, ts.month))

# Labels ordered by the table's index.
months = df_meta['year and month'].sort_index().values

In [81]:
# Keep only the rows whose extension is '.H5' ...
is_h5 = df_meta['extension'] == '.H5'
df_meta_h5 = df_meta[is_h5]

# ... and collect, for every distinct ctime, the list of df_meta row labels sharing it.
df_meta_h5_group_lists = df_meta_h5.groupby('ctime').apply(
    lambda same_ctime: same_ctime.index.tolist())


In [108]:
# For every logged problematic entry, find the ctime group(s) whose list of row
# labels matches it in at least one position, and remember those timestamps.
bad_timestamps = []
for bad_index_list in problematic_urls.values():
    for bi in bad_index_list:
        # Number of matching positions per ctime group.
        hits_per_group = df_meta_h5_group_lists.apply(
            lambda members: members == bi).apply(sum)
        matches = hits_per_group > 0
        bad_timestamps.append(df_meta_h5_group_lists.loc[matches].index)

DatetimeIndex(['2015-01-24 18:45:00'], dtype='datetime64[ns]', name='ctime', freq=None)
DatetimeIndex(['2015-03-06 09:30:00'], dtype='datetime64[ns]', name='ctime', freq=None)
DatetimeIndex(['2015-04-04 18:00:00'], dtype='datetime64[ns]', name='ctime', freq=None)
DatetimeIndex(['2015-05-02 22:30:00'], dtype='datetime64[ns]', name='ctime', freq=None)
DatetimeIndex(['2015-06-04 19:45:00'], dtype='datetime64[ns]', name='ctime', freq=None)
DatetimeIndex(['2015-07-10 15:30:00'], dtype='datetime64[ns]', name='ctime', freq=None)
DatetimeIndex(['2015-08-24 08:30:00'], dtype='datetime64[ns]', name='ctime', freq=None)
DatetimeIndex(['2015-08-25 22:15:00'], dtype='datetime64[ns]', name='ctime', freq=None)
DatetimeIndex(['2016-01-05 05:30:00'], dtype='datetime64[ns]', name='ctime', freq=None)
DatetimeIndex(['2016-03-04 19:00:00'], dtype='datetime64[ns]', name='ctime', freq=None)
DatetimeIndex(['2016-06-25 11:15:00'], dtype='datetime64[ns]', name='ctime', freq=None)
DatetimeIndex(['2016-07-02 19:00

In [114]:
# Persist the collected timestamps next to the CAPPI files
# (read back elsewhere in this notebook as 'missing_dates_all.npy').
DIR_save = 'F:/CAPPI/'
np.save(file=DIR_save + 'missing_dates_all', arr=bad_timestamps, allow_pickle=True)


# Go through each urls file, remove the entries whose timestamp is in the missing dates, and then resave the file

In [154]:
# Timestamps that have to be dropped from the saved urls files. Built here from
# bad_timestamps (first timestamp of every entry) so that this cell also works
# when the notebook is run from top to bottom.
dt_missing_dates = pd.to_datetime(np.array(bad_timestamps)[:,0])

# yyyymmddHHMM. NOTE: the year part '20[1-2][5-9]' only matches 2015-2019 and
# 2025-2029, so names dated 2020-2024 are not recognised by this pattern.
_DATESTRING_PATTERN = re.compile('20[1-2][5-9][0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9]')


def extract_datestring(x):
    """Return all 12-digit yyyymmddHHMM runs found in x after removing underscores."""
    return _DATESTRING_PATTERN.findall(x.replace('_',''))


def extract_date(x):
    """Turn a yyyymmddHHMM string into a datetime.datetime."""
    return datetime.datetime(int(x[:4]),int(x[4:6]),int(x[6:8]),int(x[8:10]),int(x[10:12]))


for index,row in df_file_urls.iterrows():
    # Load the saved urls of this row
    urls = row['urls']
    raw_urls = np.load(urls,allow_pickle=True)

    # Derive one timestamp per entry from the first url of that entry
    datestrings = [extract_datestring(url[0]) for url in raw_urls]
    datetimes = [extract_date(datestr[0]) for datestr in datestrings]

    # Positions whose timestamp is one of the missing dates
    removeable_index = np.flatnonzero([i in dt_missing_dates for i in datetimes])
    if removeable_index.size == 0:
        # Nothing to drop: leave the file on disk as it is
        continue

    # Drop those entries and overwrite the urls file in place
    urls_new = np.delete(raw_urls,removeable_index,0)
    np.save(urls, urls_new,allow_pickle=True)

In [140]:
# First timestamp of every entry in bad_timestamps, converted with pd.to_datetime.
first_timestamps = np.asarray(bad_timestamps)[:, 0]
dt_missing_dates = pd.to_datetime(first_timestamps)

In [152]:
# Scratch check of np.delete: remove position 0 of `a` and keep the result in `b`.
a = np.array([1, 2, 3])
b = np.delete(arr=a, obj=0)

In [153]:
# Display both arrays: `a` still holds all three values, `b` is the shortened result.
a,b

(array([1, 2, 3]), array([2, 3]))

In [None]:
# NOTE(review): leftover scratch call in a cell that was never executed. It passes
# no array, so it presumably fails if run — confirm and remove if no longer needed.
np.argwhere()

Unnamed: 0,CAPPI,CAPPI_mask,descriptive_data,problematic_indexes,urls,yearmonth,filetype
0,F:/CAPPI\CAPPI_201501.npy,F:/CAPPI\CAPPI_MASK_201501.npy,F:/CAPPI\descriptive_data201501.npy,F:/CAPPI\problematic_indexes201501.npy,F:/CAPPI\urls201501.npy,201501,h5
1,F:/CAPPI\CAPPI_201502.npy,F:/CAPPI\CAPPI_MASK_201502.npy,F:/CAPPI\descriptive_data201502.npy,F:/CAPPI\problematic_indexes201502.npy,F:/CAPPI\urls201502.npy,201502,h5
2,F:/CAPPI\CAPPI_201503.npy,F:/CAPPI\CAPPI_MASK_201503.npy,F:/CAPPI\descriptive_data201503.npy,F:/CAPPI\problematic_indexes201503.npy,F:/CAPPI\urls201503.npy,201503,h5
3,F:/CAPPI\CAPPI_201504.npy,F:/CAPPI\CAPPI_MASK_201504.npy,F:/CAPPI\descriptive_data201504.npy,F:/CAPPI\problematic_indexes201504.npy,F:/CAPPI\urls201504.npy,201504,h5
4,F:/CAPPI\CAPPI_201505.npy,F:/CAPPI\CAPPI_MASK_201505.npy,F:/CAPPI\descriptive_data201505.npy,F:/CAPPI\problematic_indexes201505.npy,F:/CAPPI\urls201505.npy,201505,h5
...,...,...,...,...,...,...,...
58,F:/CAPPI\CAPPI_201908_hdf5.npy,F:/CAPPI\CAPPI_MASK_201908_hdf5.npy,F:/CAPPI\descriptive_data201908_hdf5.npy,F:/CAPPI\problematic_indexes201908_hdf5.npy,F:/CAPPI\urls201908_hdf5.npy,201908,hdf5
59,F:/CAPPI\CAPPI_201909_hdf5.npy,F:/CAPPI\CAPPI_MASK_201909_hdf5.npy,F:/CAPPI\descriptive_data201909_hdf5.npy,F:/CAPPI\problematic_indexes201909_hdf5.npy,F:/CAPPI\urls201909_hdf5.npy,201909,hdf5
60,F:/CAPPI\CAPPI_201910_hdf5.npy,F:/CAPPI\CAPPI_MASK_201910_hdf5.npy,F:/CAPPI\descriptive_data201910_hdf5.npy,F:/CAPPI\problematic_indexes201910_hdf5.npy,F:/CAPPI\urls201910_hdf5.npy,201910,hdf5
61,F:/CAPPI\CAPPI_201911_hdf5.npy,F:/CAPPI\CAPPI_MASK_201911_hdf5.npy,F:/CAPPI\descriptive_data201911_hdf5.npy,F:/CAPPI\problematic_indexes201911_hdf5.npy,F:/CAPPI\urls201911_hdf5.npy,201911,hdf5


In [None]:
# Reload the saved missing dates and filter them out of the current `datetimes`.
# NOTE(review): the last three lines were indented as if copied out of a loop
# body, which made the cell a syntax error; they are dedented here so the cell
# runs on whatever `datetimes` / `datestrings` currently hold — confirm intent.
missing_dates = np.load(DIR_save + 'missing_dates_all.npy', allow_pickle=True)
dt_missing_dates = pd.to_datetime(missing_dates[:,0])
any(i in dt_missing_dates for i in datetimes)
# Because of missing files that weren't logged originally
datetimes = [i for i in datetimes if i not in dt_missing_dates]
print(len(datestrings),len(datetimes))

In [76]:
# Flatten the problematic indexes stored under each key and join them into one 1-D array.
flattened_indexes = [stored.ravel() for stored in problematic_urls.values()]
all_indxes = np.concatenate(flattened_indexes)

In [46]:
# Display the concatenated problematic indexes.
all_indxes

array([  27393.,   27394.,   27395.,   27396.,   27397.,   27398.,
         27399.,   27400.,   27401.,   27402.,   27403.,   27404.,
         73611.,   73612.,   73613.,   73614.,   73615.,   73616.,
         73617.,   73618.,   73619.,   73620.,   73621.,   73622.,
        107369.,  107370.,  107371.,  107372.,  107373.,  107374.,
        107375.,  107376.,  107377.,  107378.,  107379.,  107380.,
        139757.,  139758.,  139759.,  139760.,  139761.,  139762.,
        139763.,  139764.,  139765.,  139766.,  139767.,  139768.,
        177041.,  177042.,  177043.,  177044.,  177045.,  177046.,
        177047.,  177048.,  177049.,  177050.,  177051.,  177052.,
        218311.,  218312.,  218313.,  218314.,  218315.,  218316.,
        218317.,  218318.,  218319.,  218320.,  218321.,  218322.,
        257571.,  257572.,  257573.,  257574.,  257575.,  257576.,
        257577.,  257578.,  257579.,  257580.,  257581.,  257582.,
        259359.,  259360.,  259361.,  259362.,  259363.,  2593

In [45]:
# Count how often each problematic index occurs (a count above 1 would mean a duplicate).
index_counts = pd.Series(all_indxes).value_counts()
index_counts

27393.0      1
864675.0     1
864683.0     1
864682.0     1
864681.0     1
            ..
479720.0     1
479719.0     1
479718.0     1
479717.0     1
1321344.0    1
Length: 348, dtype: int64