In [4]:
import pandas as pd
import numpy as np
import re

import sys
import os
import  argparse
import datetime
import numpy as np

import matplotlib as mpl 
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.ticker as mticker

import warnings
warnings.simplefilter("ignore")

#import matplotlib as mpl
import pandas as pd
import h5py

# Channel names understood by get_data(); 'insitu' is served from the
# per-site pickle files, every other name from a SolarSat HDF file.
TYPES = [
    'vis047', 'vis086', 'ir133',
    'ssr', 'sza', 'insitu',
]

# Root folder of the satellite tile files and the tables derived from them.
DEFAULT_DATA_HOME = '/gpfs/data1/lianggp/lir/solar_data/data/geonex_sat'
data_path = '/gpfs/data1/lianggp/lir/solar_data/data/geonex_sat'

# HDF file name template; get_data() fills the slots with
# (path, year, tile_id, type) in that order.
file_path = '{}/SolarSat_{}_{}_{}.hdf'

# Folder holding one '<site id>.pkl' ground-measurement file per station.
insitu_path = '/gpfs/data1/lianggp/lir/2018/surfrad/insitu/'

# Default locations of the site list and in-situ tables.
DEFAULT_TILELIST = f'{DEFAULT_DATA_HOME}/solarsat_sitelist.csv'
DEFAULT_INSITU = f'{DEFAULT_DATA_HOME}/solarsat_insitu.csv'

In [2]:
def generate_time_all(year=2018, interval=15):
    """Build the regular time axis covering one whole calendar year.

    Parameters
    ----------
    year : int
        Calendar year to cover (leap years yield 366 days of stamps).
    interval : int or float
        Spacing between consecutive stamps, in minutes. Must be positive.

    Returns
    -------
    pandas.DatetimeIndex
        Timezone-naive stamps from ``<year>-01-01 00:00`` up to the last
        stamp that is at least one ``interval`` before ``<year+1>-01-01``.

    Raises
    ------
    ValueError
        If ``interval`` is not positive.
    """
    if interval <= 0:
        raise ValueError(f"interval must be a positive number of minutes, got {interval!r}")

    # Use Timedelta/Timestamp arithmetic instead of formatted strings: this
    # avoids the deprecated 'T' frequency alias and also works for intervals
    # longer than 60 minutes, where "23:<60-interval>" is not a valid time.
    step = pd.Timedelta(minutes=interval)
    first = pd.Timestamp(year=int(year), month=1, day=1)
    last = pd.Timestamp(year=int(year) + 1, month=1, day=1) - step

    return pd.date_range(start=first, end=last, freq=step)

def genertate_time(tile_id, start, end, year=2018, interval=15):
    """Return the (utc_time, local_time) stamps for a slice of a tile's year.

    The full-year axis from ``generate_time_all(year, interval)`` is sliced
    with ``[start:end]``. The local axis is that slice shifted by the
    ``time_offset`` (interpreted as hours) of the first row of the
    module-level ``sitelist`` whose ``tile_id`` matches.

    Raises ``KeyError`` (from ``get_group``) when ``tile_id`` is not present
    in ``sitelist``.
    """
    # Window of the year-long axis requested by the caller.
    utc_time = generate_time_all(year, interval)[start:end]

    # The first site registered on this tile supplies the offset.
    tile_sites = sitelist.groupby('tile_id').get_group(tile_id)
    first_site = tile_sites.iloc[0]
    local_time = utc_time + pd.Timedelta(hours=first_site['time_offset'])

    return utc_time, local_time
    
def genertate_insitu(site_id, window_size=15, interval = 15):
    """Load a site's in-situ ``swin`` series and reduce it to windowed means.

    The series is read from ``insitu_path + str(site_id) + '.pkl'``,
    smoothed with a centred moving average of ``window_size`` samples
    (edges are padded by repeating the first/last value), and then every
    ``interval``-th smoothed sample is kept, starting with the first.

    Parameters
    ----------
    site_id : int or str
        Identifier used to build the pickle file name.
    window_size : int
        Number of samples in the averaging window (>= 1).
    interval : int
        Stride, in samples, between the returned means (>= 1).
        NOTE(review): presumably one sample per minute -- confirm against
        the pickle contents.

    Returns
    -------
    numpy.ndarray
        1-D array of window means; empty when the series is empty.

    Raises
    ------
    ValueError
        If ``window_size`` or ``interval`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size!r}")
    if interval < 1:
        raise ValueError(f"interval must be >= 1, got {interval!r}")

    # NOTE(review): unpickling can execute arbitrary code; only point
    # insitu_path at files from a trusted source.
    df_insitu = pd.read_pickle(insitu_path + str(site_id) + '.pkl')
    swin = np.asarray(df_insitu.swin)
    if swin.size == 0:
        # np.pad(mode='edge') cannot extend an empty axis.
        return np.empty(0, dtype=float)

    window = np.ones(window_size) / window_size

    # Pad so that the 'valid' convolution has exactly len(swin) samples.
    # For odd windows this is the symmetric half-window padding; for even
    # windows the trailing pad is one shorter, which avoids an extra sample.
    lead = window_size // 2
    trail = window_size - 1 - lead
    padded_swin = np.pad(swin, (lead, trail), mode='edge')

    convolution = np.convolve(padded_swin, window, mode='valid')

    # Honour the requested stride instead of a hard-coded 15.
    return convolution[::interval]

In [3]:
def get_data(tile_id, types, path, start=0, end=365*24*4, year=2018):
    """Load the requested channels for one tile over a slice of the year.

    Parameters
    ----------
    tile_id : str
        Value of the ``tile_id`` column in the module-level ``sitelist``.
    types : iterable of str
        Channel names. ``'insitu'`` is read through ``genertate_insitu``;
        any other name is read from the dataset of the same name in the HDF
        file ``file_path.format(path, year, tile_id, typ)``.
    path : str
        Directory containing the HDF files.
    start, end : int
        Slice bounds applied to the time axis of every channel.
    year : int
        Year used both for the time axis and for the HDF file name.

    Returns
    -------
    dict
        ``{'meta': <site row with 'local_time' and 'utc_time' added>,
        <type>: {'data': <array>}, ...}``. For HDF channels the values are
        multiplied by the dataset's ``scale_factor`` attribute (when
        present) and samples equal to its ``fillvalue`` attribute (when
        present) are replaced by NaN.

    Raises
    ------
    KeyError
        If ``tile_id`` is not present in ``sitelist``.
    """
    sites = sitelist.groupby('tile_id').get_group(tile_id)

    # Work on a copy of the first site row so the shared table is not touched.
    meta = sites.iloc[0].copy()

    # Build the time axes once; both come from the same call.
    utc_time, local_time = genertate_time(tile_id, start, end, year=year, interval=15)
    meta['local_time'] = local_time
    meta['utc_time'] = utc_time

    data = {'meta': meta}
    for typ in types:
        if typ == 'insitu':
            swin = genertate_insitu(meta['id'], window_size=15, interval=15)
            data[typ] = {'data': swin[start:end]}
            continue

        file_name = file_path.format(path, year, tile_id, typ)
        with h5py.File(file_name, 'r') as hf:
            dataset = hf[typ]
            fillvalue = dataset.attrs.get('fillvalue', None)
            scale_factor = dataset.attrs.get('scale_factor', None)
            raw = dataset[start:end]

        # Locate fill samples on the stored values, before any scaling.
        missing = (raw == fillvalue) if fillvalue is not None else None

        array = raw if scale_factor is None else raw * scale_factor
        if missing is not None:
            # NaN can only be stored in a floating-point array.
            if not np.issubdtype(array.dtype, np.floating):
                array = array.astype(np.float64)
            array[missing] = np.nan

        data[typ] = {'data': array}
    return data

In [5]:
# Load the station table. `sitelist` is a module-level name that
# genertate_time() and get_data() read directly, so this cell must have run
# before either of them is called.
sitelist = pd.read_csv('/gpfs/data1/lianggp/lir/solar_data/code/15min_station.csv',low_memory=False)
sitelist

Unnamed: 0,id,name,lats,lons,network,iH,iV,fLine,fCol,elev,timezone,tile_id,lat_ulcnr,lon_ulcnr,lat_lrcnr,lon_lrcnr,time_offset,test
0,1,bon,40.05,-88.37,SURFRAD,15,3,163,195,213.0,America/Chicago,h15v03,42.0,-90.0,36.0,-84.0,-6.0,0
1,2,fpk,48.31,-105.1,SURFRAD,12,1,290,568,623.3125,America/Denver,h12v01,54.0,-108.0,48.0,-102.0,-7.0,0
2,3,gwn,34.25,-89.87,SURFRAD,15,4,13,175,101.0625,America/Chicago,h15v04,36.0,-90.0,30.0,-84.0,-6.0,0
3,4,dra,36.62,-116.02,SURFRAD,10,3,398,538,998.0625,America/Los_Angeles,h10v03,42.0,-120.0,36.0,-114.0,-8.0,0
4,5,psu,40.72,-77.93,SURFRAD,17,3,7,128,375.5625,America/New_York,h17v03,42.0,-78.0,36.0,-72.0,-5.0,0
5,6,sxf,43.73,-96.62,SURFRAD,13,2,538,427,476.3125,America/Chicago,h13v02,48.0,-102.0,42.0,-96.0,-7.0,1
6,7,tbl,40.12,-105.24,SURFRAD,12,3,276,188,1651.5625,America/Denver,h12v03,42.0,-108.0,36.0,-102.0,-7.0,1
7,19,FLO,-27.533,-48.517,BSRN,21,14,548,353,55.0,America/Sao_Paulo,h21v14,-24.0,-54.0,-30.0,-48.0,-4.0,0
8,31,LRC,37.104,-76.387,BSRN,17,3,161,489,4.25,America/New_York,h17v03,42.0,-78.0,36.0,-72.0,-5.0,1
9,8,ASP,-23.798,133.888,BSRN,52,13,188,579,548.0625,Australia/Darwin,h52v13,-18.0,132.0,-24.0,138.0,9.0,0


In [6]:
# Gather every station's in-situ `swin` series into one wide frame whose
# columns are named '1', '2', ... in sitelist row order.
df = pd.DataFrame()

# Kept as a module-level name; other cells may read it.
year = 2018

# NOTE(review): unpickling can execute arbitrary code; only read trusted files.
# Iterate the id column directly: no builtin `id` is shadowed, no manual
# counter is needed, and the ids keep their integer dtype.
for column_no, site_id in enumerate(sitelist['id'], start=1):
    print(site_id)

    df_insitu = pd.read_pickle(insitu_path + str(site_id) + '.pkl')
    # Column assignment aligns each series on the frame's index, which is
    # taken from the first series assigned.
    df[str(column_no)] = df_insitu.swin



1
2
3
4
5
6
7
19
31
8
14
18
21
27
28
30
33
37
41


In [10]:
# Persist the wide in-situ table (one column per station, no index column).
# NOTE(review): DEFAULT_INSITU names 'solarsat_insitu.csv' while this writes
# 'solar_insitu.csv' -- confirm which file name downstream code expects.
df.to_csv(f"{data_path}/solar_insitu.csv", index=False)

In [20]:
# Take the first station's series and give it a leading axis of length 1.
swin = df['1'].to_numpy()
swin2 = np.expand_dims(swin, axis=0)
print(swin2.shape)

(1, 525600)


In [13]:
# Renumber the stations 1..N in row order. N is taken from the table itself
# rather than hard-coded, so the cell keeps working if rows are added or
# removed.
# NOTE(review): the in-situ pickle files are looked up as '<id>.pkl'. After
# this renumbering, rows whose id changed (e.g. FLO 19 -> 8) no longer point
# at their own pickle -- confirm nothing reads pickles by id after this cell.
sitelist['id'] = np.arange(1, len(sitelist) + 1)
sitelist

Unnamed: 0,id,name,lats,lons,network,iH,iV,fLine,fCol,elev,timezone,tile_id,lat_ulcnr,lon_ulcnr,lat_lrcnr,lon_lrcnr,time_offset,test
0,1,bon,40.05,-88.37,SURFRAD,15,3,163,195,213.0,America/Chicago,h15v03,42.0,-90.0,36.0,-84.0,-6.0,0
1,2,fpk,48.31,-105.1,SURFRAD,12,1,290,568,623.3125,America/Denver,h12v01,54.0,-108.0,48.0,-102.0,-7.0,0
2,3,gwn,34.25,-89.87,SURFRAD,15,4,13,175,101.0625,America/Chicago,h15v04,36.0,-90.0,30.0,-84.0,-6.0,0
3,4,dra,36.62,-116.02,SURFRAD,10,3,398,538,998.0625,America/Los_Angeles,h10v03,42.0,-120.0,36.0,-114.0,-8.0,0
4,5,psu,40.72,-77.93,SURFRAD,17,3,7,128,375.5625,America/New_York,h17v03,42.0,-78.0,36.0,-72.0,-5.0,0
5,6,sxf,43.73,-96.62,SURFRAD,13,2,538,427,476.3125,America/Chicago,h13v02,48.0,-102.0,42.0,-96.0,-7.0,1
6,7,tbl,40.12,-105.24,SURFRAD,12,3,276,188,1651.5625,America/Denver,h12v03,42.0,-108.0,36.0,-102.0,-7.0,1
7,8,FLO,-27.533,-48.517,BSRN,21,14,548,353,55.0,America/Sao_Paulo,h21v14,-24.0,-54.0,-30.0,-48.0,-4.0,0
8,9,LRC,37.104,-76.387,BSRN,17,3,161,489,4.25,America/New_York,h17v03,42.0,-78.0,36.0,-72.0,-5.0,1
9,10,ASP,-23.798,133.888,BSRN,52,13,188,579,548.0625,Australia/Darwin,h52v13,-18.0,132.0,-24.0,138.0,9.0,0


In [14]:
# Save the station table without the index column.
# NOTE(review): DEFAULT_TILELIST names 'solarsat_sitelist.csv' while this
# writes 'solar_sitelist.csv' -- confirm which file name is intended.
sitelist.to_csv(f"{data_path}/solar_sitelist.csv", index=False)

In [16]:
# Show the current state of the station table.
sitelist

Unnamed: 0,id,name,lats,lons,network,iH,iV,fLine,fCol,elev,timezone,tile_id,lat_ulcnr,lon_ulcnr,lat_lrcnr,lon_lrcnr,time_offset,test
0,1,bon,40.05,-88.37,SURFRAD,15,3,163,195,213.0,America/Chicago,h15v03,42.0,-90.0,36.0,-84.0,-6.0,0
1,2,fpk,48.31,-105.1,SURFRAD,12,1,290,568,623.3125,America/Denver,h12v01,54.0,-108.0,48.0,-102.0,-7.0,0
2,3,gwn,34.25,-89.87,SURFRAD,15,4,13,175,101.0625,America/Chicago,h15v04,36.0,-90.0,30.0,-84.0,-6.0,0
3,4,dra,36.62,-116.02,SURFRAD,10,3,398,538,998.0625,America/Los_Angeles,h10v03,42.0,-120.0,36.0,-114.0,-8.0,0
4,5,psu,40.72,-77.93,SURFRAD,17,3,7,128,375.5625,America/New_York,h17v03,42.0,-78.0,36.0,-72.0,-5.0,0
5,6,sxf,43.73,-96.62,SURFRAD,13,2,538,427,476.3125,America/Chicago,h13v02,48.0,-102.0,42.0,-96.0,-7.0,1
6,7,tbl,40.12,-105.24,SURFRAD,12,3,276,188,1651.5625,America/Denver,h12v03,42.0,-108.0,36.0,-102.0,-7.0,1
7,8,FLO,-27.533,-48.517,BSRN,21,14,548,353,55.0,America/Sao_Paulo,h21v14,-24.0,-54.0,-30.0,-48.0,-4.0,0
8,9,LRC,37.104,-76.387,BSRN,17,3,161,489,4.25,America/New_York,h17v03,42.0,-78.0,36.0,-72.0,-5.0,1
9,10,ASP,-23.798,133.888,BSRN,52,13,188,579,548.0625,Australia/Darwin,h52v13,-18.0,132.0,-24.0,138.0,9.0,0
