In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt


#### Read the CSV file with pandas

In [None]:
# Name fragments used to pick columns: a column is selected when its name
# contains at least one of these fragments.
substrings = [
    'SID',
    'SEASON',
    'NUMBER',
    'BASIN',
    'SUBBASIN',
    'NAME',
    'ISO_TIME',
    'BOM',
]

In [None]:
# `df` is only created by the full read further down, so on a fresh kernel its
# columns are not available yet.  Read just the header row (nrows=0) of the
# same CSV file to obtain the column names without loading any data.
ibtracs_csv = '/g/data/v45/xw6141/PhD/data/ibtracs.SP.csv'
header_columns = pd.read_csv(ibtracs_csv, nrows=0).columns

# Keep every column whose name contains one of the fragments in `substrings`.
bom_columns = [col for col in header_columns
               if any(sub in col for sub in substrings)]

# Load all selected columns as text; numeric conversion is done explicitly later.
dtype_dict = {col: str for col in bom_columns}

In [None]:
# Load only the selected columns, each with the dtype given in dtype_dict.
# (Option left disabled by the author: keep_default_na=False, na_values=[" "])
csv_read_options = {'usecols': bom_columns, 'dtype': dtype_dict}
df = pd.read_csv('/g/data/v45/xw6141/PhD/data/ibtracs.SP.csv', **csv_read_options)
# Author's note: BOM_WIND comes back as object dtype -- print(df['BOM_WIND'].dtype)

In [None]:
import pandas as pd
import numpy as np

def concat_func(x):
    """Collapse the rows of one group into a single summary record.

    Meant to be passed to ``groupby(...).apply`` so that ``x`` is the
    sub-frame belonging to one group.

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the columns ``ISO_TIME``, ``SEASON``, ``BOM_LAT``,
        ``BOM_LON``, ``BOM_WIND`` and ``BOM_PRES``.  Values may be text;
        pressure and wind are coerced to numbers here and anything that
        cannot be parsed counts as missing.

    Returns
    -------
    pandas.Series
        ``Start``/``End`` (earliest/latest ``ISO_TIME``),
        ``StartYear``/``EndYear`` (min/max ``SEASON``),
        ``StLAT``/``StLONG`` and ``EdLAT``/``EdLONG`` (position at the
        earliest/latest time), ``WND`` (numeric maximum of ``BOM_WIND``),
        ``PRS`` (minimum ``BOM_PRES``) and ``MPLAT``/``MPLONG``/``MPTIME``
        (position and time of the first row holding that minimum).
        Every value is NaN when the group has no usable pressure reading.
    """
    fields = ['Start', 'End', 'StartYear', 'EndYear', 'StLAT', 'EdLAT',
              'StLONG', 'EdLONG', 'WND', 'PRS', 'MPLAT', 'MPLONG', 'MPTIME']

    # Explicit "no pressure data" check instead of catching ValueError around
    # the whole body, so unrelated errors are no longer silently swallowed.
    pressure = pd.to_numeric(x['BOM_PRES'], errors='coerce').to_numpy(
        dtype=float, na_value=np.nan)
    if pressure.size == 0 or np.isnan(pressure).all():
        return pd.Series(np.nan, index=fields)

    # nanargmin returns the position of the first minimum, ignoring NaN.
    min_row = x.iloc[int(np.nanargmin(pressure))]

    # idxmin/idxmax are not supported on text (object) columns, so order the
    # rows by parsed timestamps.  The index is reset so the returned labels
    # are row positions usable with .iloc.
    times = pd.to_datetime(x['ISO_TIME'], errors='coerce').reset_index(drop=True)
    times = times.dropna()
    if times.empty:
        # No parseable timestamp: every time-dependent field becomes NaN.
        first_row = last_row = pd.Series(np.nan, index=x.columns)
    else:
        first_row = x.iloc[times.idxmin()]
        last_row = x.iloc[times.idxmax()]

    return pd.Series({
        'Start': first_row['ISO_TIME'],
        'End': last_row['ISO_TIME'],
        'StartYear': x['SEASON'].min(),
        'EndYear': x['SEASON'].max(),
        'StLAT': first_row['BOM_LAT'],
        'EdLAT': last_row['BOM_LAT'],
        'StLONG': first_row['BOM_LON'],
        'EdLONG': last_row['BOM_LON'],
        # Compare wind speeds as numbers; a text max would rank '95' above '100'.
        'WND': pd.to_numeric(x['BOM_WIND'], errors='coerce').max(),
        'PRS': np.nanmin(pressure),
        'MPLAT': min_row['BOM_LAT'],
        'MPLONG': min_row['BOM_LON'],
        'MPTIME': min_row['ISO_TIME'],
    })

# BOM_PRES must be numeric for the minimum search; values that cannot be
# parsed become NaN.
pressure_numeric = pd.to_numeric(df['BOM_PRES'], errors='coerce')
df['BOM_PRES'] = pressure_numeric

# Summarise each SID group with concat_func; reset_index turns the group key
# back into an ordinary column.
groups_by_sid = df.groupby(df['SID'])
weather = groups_by_sid.apply(concat_func).reset_index()

print(weather)

In [None]:
# Show the per-SID summary table as the cell output.
weather

In [None]:
def concat_func(x):
    """Summarise one group using only its rows that have a pressure reading.

    Meant to be passed to ``groupby(...).apply`` so that ``x`` is the
    sub-frame belonging to one group.  Rows whose ``BOM_PRES`` is missing or
    not numeric are discarded first, and empty strings in the remaining rows
    are treated as NaN.

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the columns ``ISO_TIME``, ``SEASON``, ``BOM_LAT``,
        ``BOM_LON``, ``BOM_WIND`` and ``BOM_PRES``.

    Returns
    -------
    pandas.Series
        ``Start``/``End`` (earliest/latest ``ISO_TIME``),
        ``StartYear``/``EndYear`` (min/max ``SEASON``),
        ``StLAT``/``StLONG`` and ``EdLAT``/``EdLONG`` (position at the
        earliest/latest time), ``WND`` (numeric maximum of ``BOM_WIND``),
        ``PRS`` (minimum ``BOM_PRES``) and ``MPLAT``/``MPLONG``/``MPTIME``
        (position and time of the first row holding that minimum).
        Every value is NaN when no row of the group has a pressure reading.
    """
    fields = ['Start', 'End', 'StartYear', 'EndYear', 'StLAT', 'EdLAT',
              'StLONG', 'EdLONG', 'WND', 'PRS', 'MPLAT', 'MPLONG', 'MPTIME']

    # Coerce here so the function does not rely on an earlier cell having
    # already converted BOM_PRES to numbers.
    pressure = pd.to_numeric(x['BOM_PRES'], errors='coerce')
    has_pressure = pressure.notna().to_numpy()
    x = x[has_pressure].replace('', np.nan)

    # A group without any pressure reading is empty at this point; return a
    # NaN record instead of letting idxmin/idxmax fail on an empty selection.
    if len(x) == 0:
        return pd.Series(np.nan, index=fields)

    pressure = pressure.to_numpy(dtype=float, na_value=np.nan)[has_pressure]
    # argmin returns the first occurrence when the minimum is tied.
    min_pos = int(np.argmin(pressure))
    min_pres_row = x.iloc[min_pos]

    # idxmin/idxmax are not supported on text (object) columns, so order the
    # rows by parsed timestamps.  The index is reset so the returned labels
    # are row positions usable with .iloc.
    times = pd.to_datetime(x['ISO_TIME'], errors='coerce').reset_index(drop=True)
    times = times.dropna()
    if times.empty:
        # No parseable timestamp: every time-dependent field becomes NaN.
        first_row = last_row = pd.Series(np.nan, index=x.columns)
    else:
        first_row = x.iloc[times.idxmin()]
        last_row = x.iloc[times.idxmax()]

    return pd.Series({
        'Start': first_row['ISO_TIME'],
        'End': last_row['ISO_TIME'],
        'StartYear': x['SEASON'].min(),
        'EndYear': x['SEASON'].max(),
        'StLAT': first_row['BOM_LAT'],
        'EdLAT': last_row['BOM_LAT'],
        'StLONG': first_row['BOM_LON'],
        'EdLONG': last_row['BOM_LON'],
        # Compare wind speeds as numbers; a text max would rank '95' above '100'.
        'WND': pd.to_numeric(x['BOM_WIND'], errors='coerce').max(),
        'PRS': pressure[min_pos],
        'MPLAT': min_pres_row['BOM_LAT'],
        'MPLONG': min_pres_row['BOM_LON'],
        'MPTIME': min_pres_row['ISO_TIME'],
    })
    
# Rebuild the per-SID summary using the concat_func defined in this cell;
# reset_index turns the group key back into an ordinary column.
groups_by_sid = df.groupby(df['SID'])
weather = groups_by_sid.apply(concat_func).reset_index()

In [None]:
#df = df.dropna(subset=['BOM_PRES']).replace('', np.nan)
#a = df.groupby(df['SID'])['BOM_PRES'].max()

In [None]:
# Write the per-SID summary table to CSV, without the index column.
output_file = '/g/data/v45/xw6141/PhD/weather_pressuremin.csv'
weather_ = pd.DataFrame(weather)
weather_.to_csv(output_file, index=False)
print(f"DataFrame saved to {output_file}")


In [None]:
# Write `df` to CSV, without the index column.
output_file = '/g/data/v45/xw6141/PhD/weather_df.csv'
df = pd.DataFrame(df)
df.to_csv(output_file, index=False)

print(f"DataFrame saved to {output_file}")

In [None]:
# Pull the PRS column of the summary table out as a plain Python list.
column_values = weather.loc[:, 'PRS'].tolist()

In [None]:
# Print the list of PRS values collected above.
print(column_values)