In [1]:
import os
import pandas as pd
import datetime
import numpy as np
import netCDF4 as nc
import xarray as xr
from tqdm import tqdm

from configs import path_config, db_config
from funcs import data_io, db_io, nc_ops
from funcs.grib_support import calc_haversine_dist

ModuleNotFoundError: No module named 'funcs.grib_support'

In [None]:
def read_data_from_nc(nc_file_path, location_dict):
    """Extract the rows of a NetCDF file at the grid point nearest to each site.

    Parameters
    ----------
    nc_file_path : str or path-like
        NetCDF file to open with xarray.
    location_dict : dict
        Site name -> (lat, lon) of the actual site; passed on to
        ``get_closest_nc_distances``.

    Returns
    -------
    file_df : pandas.DataFrame
        Rows of the flattened dataset whose ``XLAT``/``XLONG`` equal each
        site's nearest grid point, with ``site_name``, ``site_lat`` and
        ``site_lon`` columns added and the ``x``/``y`` columns dropped.
        An empty frame is returned when no site produced any rows.
    locations_dict_nc : dict
        Site name -> ``{'NC': nearest grid point, 'SITE': actual location}``.
    """
    # Flatten the dataset into a DataFrame; the context manager releases the
    # file handle as soon as the data has been materialised.
    with xr.open_dataset(nc_file_path) as ds:
        xarray_df = ds.to_dataframe().reset_index(drop=False)

    # Nearest available grid point for every requested site.
    locations_dict_nc = get_closest_nc_distances(locations_dict=location_dict,
                                                 xarray_df=xarray_df)

    # Collect per-site frames and concatenate once at the end instead of
    # re-copying the accumulated frame on every iteration.
    site_frames = []
    for location_name, location_meta in locations_dict_nc.items():
        actual_site_loc = location_meta['SITE']
        nc_site_loc = location_meta['NC']

        try:
            at_grid_point = ((xarray_df['XLAT'] == nc_site_loc[0]) &
                             (xarray_df['XLONG'] == nc_site_loc[1]))
            loc_df = xarray_df[at_grid_point].copy().reset_index(drop=True)
            loc_df['site_name'] = location_name
            loc_df['site_lat'] = actual_site_loc[0]
            # Longitude is the second element of the (lat, lon) pair.
            loc_df['site_lon'] = actual_site_loc[1]
        except Exception as e:
            # Best effort: report the failing site and skip it, so a stale or
            # undefined frame is never appended on its behalf.
            print(location_name, e)
            continue

        site_frames.append(loc_df)

    if not site_frames:
        return pd.DataFrame(), locations_dict_nc

    # OUTPUT FORMAT: the x/y grid-index columns are not needed downstream.
    file_df = pd.concat(site_frames, axis=0).drop(columns=['x', 'y'])
    return file_df, locations_dict_nc

In [None]:
def get_closest_nc_distances(locations_dict, xarray_df):
    """Find, for every site, the nearest (XLAT, XLONG) grid point in the data.

    Parameters
    ----------
    locations_dict : dict
        Site name -> site location, passed as ``loc1`` to
        ``calc_haversine_dist``.
    xarray_df : pandas.DataFrame
        Flattened dataset containing ``XLAT`` and ``XLONG`` columns.

    Returns
    -------
    dict
        Site name -> ``{'NC': (lat, lon) of the nearest grid point,
        'SITE': the site's own location}``.

    Raises
    ------
    ValueError
        If sites were requested but ``xarray_df`` has no grid points.
    """
    # Unique grid points available in the file.
    lat_lon_df = xarray_df[['XLAT', 'XLONG']].drop_duplicates().reset_index(drop=True)
    avail_lats = lat_lon_df['XLAT'].values
    avail_lons = lat_lon_df['XLONG'].values

    if locations_dict and len(lat_lon_df) == 0:
        raise ValueError("xarray_df contains no (XLAT, XLONG) grid points")

    output_dict = {}
    # Iterate the mapping that was passed in, not a notebook-level global.
    for location, site_actual_location in locations_dict.items():
        distances = {
            (t1, n1): calc_haversine_dist(loc1=site_actual_location, loc2=(t1, n1))
            for t1, n1 in zip(avail_lats, avail_lons)
        }
        min_dist_at = min(distances, key=distances.get)
        output_dict[location] = {'NC': min_dist_at, 'SITE': site_actual_location}
    return output_dict

In [14]:
def make_timelike_from_float(timefloats):
    """Convert ``YYYYMMDD.<fraction of day>`` floats to IST datetimes.

    The integer part of each value encodes the calendar date (e.g.
    ``20220115``) and the fractional part the time of day as a fraction of
    24 hours. A fixed +05:30 offset (IST) is then applied.
    NOTE(review): the input is presumably UTC — confirm against the data source.

    Parameters
    ----------
    timefloats : iterable of float
        Encoded timestamps.

    Returns
    -------
    list of datetime.datetime
        One naive datetime per input value, shifted by +05:30 and rounded to
        the nearest second.
    """
    ist_offset = datetime.timedelta(hours=5, minutes=30)

    timepoints = []
    for timeval in timefloats:
        timeval = float(timeval)
        date_part = int(timeval // 1)
        day_fraction = timeval % 1

        year, month_day = divmod(date_part, 10000)
        month, day = divmod(month_day, 100)
        seconds = int(round(day_fraction * 24 * 3600))

        # Apply the offset with timedelta arithmetic so that crossing midnight
        # rolls over month and year boundaries correctly; adding the offset to
        # the encoded float would produce invalid dates such as day 32.
        timepoints.append(datetime.datetime(year, month, day)
                          + datetime.timedelta(seconds=seconds)
                          + ist_offset)
    return timepoints

In [2]:
# Load the requested site locations from site_locations.csv in the resources directory.
locations = data_io.get_locations_by_type(filedir=path_config.resources_path,
                                          filename='site_locations.csv')

In [4]:
# MAIN
# Per-file frames, keyed by the file name with its '.nc' suffix removed.
nc_data = {}

# Sub-folders of the data directory, in sorted order.
folders_in_wrf = sorted(
    entry for entry in os.listdir(path_config.data_path)
    if os.path.isdir(os.path.join(path_config.data_path, entry))
)

for date_folder in tqdm(folders_in_wrf):
    fold_path = os.path.join(path_config.data_path, date_folder)

    # Regular files only, skipping names that contain 'd' or '.DS'.
    files_in_fold = sorted(
        name for name in os.listdir(fold_path)
        if os.path.isfile(os.path.join(fold_path, name))
        and 'd' not in name
        and '.DS' not in name
    )

    for file in files_in_fold:
        file_path = os.path.join(fold_path, file)
        file_date = file.split('.nc')[0]

        file_data, loc_dict_nc = nc_ops.read_data_from_nc(
            nc_file_path=file_path,
            location_dict=locations,
        )

        # Tag every row with the file and folder it came from.
        file_data = file_data.reset_index(drop=True)
        file_data['file_date'] = file_date
        file_data['folder_date'] = date_folder

        # NOTE(review): a file name repeated in a later folder overwrites the
        # earlier entry here — confirm file names are unique across folders.
        nc_data[file_date] = file_data

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:41<00:00, 20.33s/it]


In [5]:
# Stack the per-file frames in sorted key order with a single concat; growing
# a DataFrame inside a loop re-copies all accumulated rows on every iteration.
nc_frames = [nc_data[init_date] for init_date in sorted(nc_data)]
nc_df = pd.concat(nc_frames, axis=0) if nc_frames else pd.DataFrame()

In [6]:
# Order rows by folder, then file, then site, then time, and renumber the index.
nc_df = nc_df.sort_values(by=['folder_date', 'file_date', 'site_name', 'Times'],
                         ascending=True).reset_index(drop=True)

In [7]:
# Size of the combined frame before de-duplication.
nc_df.shape

(32172, 18)

In [8]:
# Keep one row per (Times, site_name). With the frame sorted by folder_date and
# file_date, keep='last' retains the row from the latest folder/file.
nc_df = nc_df.drop_duplicates(subset=['Times' ,'site_name'], keep='last')

In [10]:
# Size of the frame after de-duplication.
nc_df.shape

(12660, 18)

In [11]:
# Replace the column labels with the names returned by
# db_io.remove_special_chars_from_df_names.
nc_df.columns = db_io.remove_special_chars_from_df_names(data_frame=nc_df)

In [16]:
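# NOTE(review): this append was executed once (the output below is from that run)
# and then commented out. Presumably re-running it would append the same rows
# again — confirm before re-enabling.
# db_io.append_data_to_table(data=nc_df, db_url=db_io.tensor_aws_db1_url(), 
#                            table_name='td_wrf_stg', schema=db_config.wrf_schema)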

Appending data (12660 rows) -> DB table -> td_wrf.td_wrf_stg.
Successfully appended data (12660 rows)  ->  DB table -> td_wrf.td_wrf_stg


In [11]:
# Open a database connection using the settings held in db_config.
db_connection = db_io.create_db_connection(dbname=db_config.dbname,
                                     host=db_config.host,
                                     port=db_config.port,
                                     user=db_config.user,
                                     password=db_config.password)

In [12]:
# Look up the site/file mapping for the WRF staging table. Judging by the output
# below, the result maps each site name to a list of date strings.
db_io.get_site_files_map(con=db_connection, 
                         table_name=db_config.wrf_stg_table,
                         schema=db_config.wrf_schema)

{'Sadla': ['2022-01-13',
  '2022-01-15',
  '2022-01-17',
  '2022-01-12',
  '2022-01-16',
  '2022-01-18',
  '2022-01-14'],
 'Bitta': ['2022-01-13',
  '2022-01-12',
  '2022-01-18',
  '2022-01-15',
  '2022-01-17',
  '2022-01-16',
  '2022-01-14'],
 'Lahori': ['2022-01-14',
  '2022-01-17',
  '2022-01-12',
  '2022-01-15',
  '2022-01-18',
  '2022-01-13',
  '2022-01-16'],
 'SITE4': ['2022-01-12',
  '2022-01-13',
  '2022-01-18',
  '2022-01-14',
  '2022-01-15',
  '2022-01-17',
  '2022-01-16'],
 'Kanasar': ['2022-01-13',
  '2022-01-18',
  '2022-01-16',
  '2022-01-12',
  '2022-01-17',
  '2022-01-14',
  '2022-01-15'],
 'SECI-1': ['2022-01-18',
  '2022-01-13',
  '2022-01-15',
  '2022-01-12',
  '2022-01-14',
  '2022-01-16',
  '2022-01-17'],
 'MSEDCL': ['2022-01-13',
  '2022-01-18',
  '2022-01-15',
  '2022-01-16',
  '2022-01-14',
  '2022-01-17',
  '2022-01-12'],
 'SITE2': ['2022-01-18',
  '2022-01-15',
  '2022-01-13',
  '2022-01-12',
  '2022-01-14',
  '2022-01-17',
  '2022-01-16'],
 'SITE3': ['2022-01

In [3]:
####

In [1]:
import os
import pandas as pd
from funcs.data_io import get_locations_by_type
from funcs.db_io import create_db_connection
from funcs import db_io
from funcs import nc_ops
from configs import db_config
from configs import path_config

In [2]:
# Collect every sub-directory of the data path. os.listdir() returns entries in
# arbitrary order, so iterate them sorted: this keeps read_paths (and any slice
# of it, such as read_paths[:1]) deterministic across runs and machines.
read_paths = []
for path in sorted(os.listdir(path_config.data_path)):
    path_def = os.path.join(path_config.data_path, path)
    if os.path.isdir(path_def):
        read_paths.append(path_def)

In [3]:
# Load the requested site locations from site_locations.csv in the resources directory.
requested_locs = get_locations_by_type(filedir=path_config.resources_path,
                                           filename='site_locations.csv')

In [8]:
# Display the loaded site name -> location mapping.
requested_locs

{'Lahori': (23.37, 76.25),
 'Sadla': (22.7308, 71.3685),
 'MSEDCL': (23.63123778, 69.01812444),
 'Kanasar': (27.482418, 72.093802),
 'Jhansi': (25.36943, 78.30572),
 'Bitta': (23.28, 69.08),
 'NJV_Delhi': (28.5335, 77.1996),
 'SECI-1': (23.5099, 69.1298),
 'SITE1': (16.450472, 77.866028),
 'SITE2': (16.376111, 74.773611),
 'SITE3': (23.921321, 71.186291),
 'SITE4': (24.142247, 71.280255)}

In [4]:
# Read folders via nc_ops.read_nc_from_folder and prepend each result to data_frame.
# NOTE(review): read_paths[:1] restricts this to the first folder only — looks
# like a leftover debugging limit; confirm before relying on the result.
data_frame = pd.DataFrame()

for p in read_paths[:1]:
    dat = nc_ops.read_nc_from_folder(folder_path=p, location_dict=requested_locs)
    data_frame = pd.concat([dat, data_frame], axis=0)

In [7]:
# Show the parsed timestamps (Times_time) next to the raw float-encoded Times values.
data_frame[['Times_time', 'Times']].values

array([[Timestamp('2022-01-16 01:45:00'), 20220115.84375],
       [Timestamp('2022-01-16 01:30:00'), 20220115.833333332],
       [Timestamp('2022-01-16 01:15:00'), 20220115.822916668],
       ...,
       [Timestamp('2022-01-19 00:30:00'), 20220118.791666668],
       [Timestamp('2022-01-22 22:45:00'), 20220122.71875],
       [Timestamp('2022-01-22 23:30:00'), 20220122.75]], dtype=object)

In [8]:
# Order rows by init_date so that, for each (Times, site_name) pair, keep='last'
# retains the row with the greatest init_date.
# A stable sort is requested explicitly: the default quicksort used for a
# single-column sort may reorder rows that share an init_date, which would make
# the surviving duplicate arbitrary.
df = data_frame.sort_values(by='init_date', ascending=True, kind='mergesort')
df = df.drop_duplicates(subset=['Times', 'site_name'], keep='last').reset_index(drop=True)

In [9]:
# Open a database connection using the settings held in db_config.
db_connection = create_db_connection(dbname=db_config.dbname,
                                         host=db_config.host,
                                         port=db_config.port,
                                         user=db_config.user,
                                         password=db_config.password)

In [10]:
# Replace the column labels with the names returned by
# db_io.remove_special_chars_from_df_names.
df.columns = db_io.remove_special_chars_from_df_names(data_frame=df)

In [11]:
# Append the de-duplicated frame to the WRF staging table.
# NOTE(review): presumably re-running this cell appends the same rows again —
# confirm how append_data_to_table handles duplicates.
db_io.append_data_to_table(data=df, db_url=db_io.tensor_aws_db1_url(),
                           table_name=db_config.wrf_stg_table,
                           schema=db_config.wrf_schema,
                           verbose=True, logger_obj=None)

Appending data (12660 rows) -> DB table -> td_wrf.td_wrf_stg.
Successfully appended data (12660 rows)  ->  DB table -> td_wrf.td_wrf_stg
