In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
import geopandas as gpd

from multiprocessing import Pool

import cartopy.feature as cf
import cartopy.crs as ccrs

from joblib import Parallel, delayed

from matplotlib import patches
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

## Export CPTEC rain-gauge records to per-station CSV files in the `clear` directory

In [None]:
# Year of observations to process; it selects the <dir_base>/<year>/raw input
# folder and the <dir_base>/<year>/clear output folder used further down.
year = 2023

# Number of worker processes handed to multiprocessing.Pool.
Npros = 15

In [3]:
# Longitude/latitude limits of the study domain, in the order
# lon_min, lon_max, lat_min, lat_max
# (presumably degrees over South America, judging by the "SA" prefix — confirm).
SA_lon_min, SA_lon_max, SA_lat_min, SA_lat_max = -83,-33,-55,6

In [4]:
# Root folder of the hourly CPTEC observations; the year and the 'raw'/'clear'
# subfolders are appended to it further down.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
dir_base = os.path.join('/','media','arturo','Arturo','Data','Brazil','OBS_CPTEC','1hr')

In [5]:
# Every raw .txt file of the selected year, in sorted (lexicographic) order.
list_files = sorted(glob.glob(os.path.join(dir_base, str(year), 'raw', '*.txt')))

# Number of raw files found.
list_len = len(list_files)
print(f'Number of files: {list_len}')

Number of files: 12


In [6]:
# Reference file: the last raw file in sorted order. The station list is
# derived from it. glob already returns paths that include dir_base, so the
# path is used as-is instead of being joined with dir_base a second time.
filename = list_files[-1]

# The reader skips the first 2 lines and the last 3 lines of the file;
# skipfooter requires the python engine.
DATA = pd.read_csv(filename,skiprows=2,skipfooter=3,sep=' ',engine='python')

# Keep only rows inside the configured domain (SA_* limits) and renumber the
# index. The previous reset_index(..., inplace=False) call discarded its
# result, so the index was never actually reset.
# NOTE(review): longitude limits are exclusive while latitude limits are
# inclusive, exactly as in the original filter — confirm this is intended.
DATA = DATA[
    (DATA.lon > SA_lon_min) & (DATA.lon < SA_lon_max)
    & (DATA.lat >= SA_lat_min) & (DATA.lat <= SA_lat_max)
].reset_index(drop=True)
DATA.head(3)

Unnamed: 0,code,lon,lat,alt,date,r
0,14540000,-60.4706,4.6306,0,2024120100,0.0
1,140070401A,-60.163,4.5945,-9999,2024120100,0.0
2,14530000,-60.7939,4.1961,0,2024120100,0.0


In [7]:
# Station codes found in the reference file (one entry per row), followed by
# the sorted set of distinct codes that drives the export.
code_list = DATA.code.values
code_unique = np.unique(code_list)
print(f'Year    : {year}', f'Stations: {len(code_unique)}', sep='\n')

Year    : 2024
Stations: 6022


In [8]:
def search_concat(code_input, DATA=None):
    """Build and save the cleaned series of one station for the selected year.

    Every raw file in the module-level ``list_files`` is read, the rows whose
    ``code`` equals ``code_input`` are gathered, and the result is written to
    ``<dir_base>/<year>/clear/obs_cptec_<code_input>.csv``. Before writing,
    ``date`` is parsed with the ``%Y%m%d%H`` format, ``lat``/``lon`` are
    rounded to 2 decimals and, for repeated timestamps, only the row with the
    largest ``r`` is kept. The rows are finally ordered by ``date``.

    Relies on the module-level names ``dir_base``, ``year`` and ``list_files``.

    Parameters
    ----------
    code_input : str or int
        Station code. It is compared as text against the ``code`` column.
    DATA : object, optional
        Unused. Kept only so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame or None
        The concatenated rows of the station with parsed dates and rounded
        coordinates (duplicated timestamps are NOT removed in the returned
        frame, only in the file written to disk). ``None`` when the output
        file already exists or when no raw file contains the station.
    """
    print(f'Station: {code_input}')

    nameout = f'obs_cptec_{code_input}.csv'
    dir_clear = os.path.join(dir_base, str(year), 'clear')
    dir_out = os.path.join(dir_clear, nameout)

    # Stations whose output file is already on disk are skipped.
    if os.path.exists(dir_out):
        return None

    code_text = str(code_input)
    df_list = []
    for filename in list_files:
        # glob already returned full paths, so they are used as-is.
        # Reading 'code' as text stops pandas from inferring an integer
        # column in files whose codes are all numeric, which would make the
        # comparison below fail silently and drop leading zeros.
        data_file = pd.read_csv(
            filename, skiprows=2, skipfooter=3, sep=' ', engine='python',
            dtype={'code': str},
        )
        data_ref = data_file[data_file['code'] == code_text]
        if not data_ref.empty:
            df_list.append(data_ref)

    # pd.concat raises ValueError on an empty list; nothing to export here.
    if not df_list:
        return None

    DATA_all = pd.concat(df_list, ignore_index=True)
    DATA_all['date'] = pd.to_datetime(DATA_all['date'].astype(str), format='%Y%m%d%H')

    DATA_all['lat'] = DATA_all['lat'].round(2)
    DATA_all['lon'] = DATA_all['lon'].round(2)

    # Remove duplicated timestamps: sorting by 'r' in descending order first
    # makes keep='first' retain the largest 'r' of each timestamp.
    df_final = (
        DATA_all
        .sort_values(by='r', ascending=False)
        .drop_duplicates(subset=['date'], keep='first')
        .sort_values(by='date')
        .reset_index(drop=True)
    )

    # Create the output folder on demand; exist_ok tolerates several workers
    # reaching this line at the same time.
    os.makedirs(dir_clear, exist_ok=True)
    df_final.to_csv(dir_out, header=True, index=False)

    return DATA_all

In [None]:
def compute_for_point(args):
    """Pool worker: export a single station.

    ``args`` is one station code. It is forwarded to ``search_concat``
    together with the module-level ``DATA`` frame, and the result of that
    call is returned unchanged.
    """
    return search_concat(args, DATA)

# Author's note: an earlier variant based on
#     pool.map(compute_for_point, [(code_input) for code_input in code_unique])
# ran, but was reported as not being parallelized.

# Current variant: one task per station code (chunksize=1). imap_unordered
# yields results in arbitrary order, not in the order of code_unique.
with Pool(processes=Npros) as pool:
    results = [
        station_frame
        for station_frame in pool.imap_unordered(compute_for_point, code_unique, chunksize=1)
    ]

Station: -9999Station: 02142096Station: 02042051Station: 10Station: 1Station: 10200000
Station: 10910000Station: 10500000Station: 10100000



Station: 11
Station: 110018901A
Station: 110020501A


Station: 11400000Station: 110020502A
Station: 11500000
Station: 120Station: 120010401A
Station: 120040101H
Station: 12100000Station: 121Station: 120070801A
Station: 122
Station: 12240000

Station: 123



Station: 12370000Station: 12351000
Station: 12390000Station: 124Station: 12500000
Station: 125Station: 12520000Station: 12510500

Station: 12540000

Station: 12550000




Station: 12557000Station: 126Station: 12590000
Station: 12640000Station: 12650000Station: 12680000Station: 127Station: 128


Station: 12700000Station: 12840000





Station: 12842000Station: 12845000Station: 12850000
Station: 12880000Station: 129Station: 13Station: 12900001Station: 130

Station: 130002901A
Station: 130006001A


Station: 130008601A


Station: 130014401A
Station: 130063101AStation: 130050801AStation: 130060701A

## Old code: serial version that processes one station at a time

In [None]:
# for pos in range(len(code_unique)):
#     print(f'Station: {code_unique[pos]}')
#     df_list = []

#     nameout = f'obs_cptec_{code_unique[pos]}.csv'
#     dir_out = os.path.join(dir_base,str(year),'clear',nameout)

#     if os.path.exists(dir_out):
#             continue
#     else:
#         for nn in range(list_len):
#             filename = os.path.join(dir_base,list_files[nn])
#             DATA = pd.read_csv(filename,skiprows=2,skipfooter=3,sep=' ',engine='python')
#             DATA_ref = DATA[DATA['code']==code_unique[pos]]
#             if not DATA_ref.empty:
#                 df_list.append(DATA_ref)

#     DATA_all = pd.concat(df_list, ignore_index=True)
#     DATA_all['date'] = pd.to_datetime(DATA_all['date'].astype(str), format='%Y%m%d%H')
#     DATA_all.to_csv(dir_out, header=True, index=False)