In [1]:
import pandas as pd
import numpy as np
import math
from datetime import datetime, timedelta
from plotnine import *
pd.options.mode.chained_assignment = None  # default='warn'
import pyproj
import warnings
warnings.filterwarnings("ignore")

# Investigating Rapid Intensification Events in Atlantic Storms

## What is <span style="color:darkred">Rapid Intensification?</span>

In meteorology, rapid intensification is a situation where a tropical cyclone intensifies dramatically in a short period of time. The National Hurricane Center defines rapid intensification as an **increase in the maximum sustained winds of at least 30 knots in a 24-hour period.**

[From Wiki] In order for rapid intensification to occur, several conditions must be in place. **Water temperatures must be extremely warm** (near or above 30 °C, 86 °F), and water of this temperature must be sufficiently deep such that waves do not churn deeper cooler waters up to the surface. **Wind shear must be low;** when wind shear is high, the convection and circulation in the cyclone will be disrupted. Dry air can also limit the strengthening of tropical cyclones.

### Rapid Deepening

[From Wiki] The NHC previously defined **rapid deepening** of a tropical cyclone as a decrease in the minimum central pressure of 42 millibars (1.240 inHg) over a 24-hour period. Recent research suggests that mean sea level pressure is a better predictor of damage from hurricanes making landfall in the continental United States.

## Part 1: Data Acquisition, Cleaning, and Indicator Creation

### Note 1: The format of the NHC HURDAT2.csv file
The format of the HURDAT file looks like this:

Storm Identifier Row
Storm Observation Row
Storm Observation Row
Storm Observation Row
Storm Observation Row

Within an observation row, there is nothing that identifies the storm the row belongs to, so we need to add the storm ID code to each row for ease of use later. See an example of the unchanged data from one storm below:

AL092011,  IRENE,  39,  
20110821, 0000,  , TS, 15.0N,  59.0W,  45, 1006,  105, 0, 0,45, 0, 0, 0, 0, 0, 0, 0, 0,  
20110821, 0600,  , TS, 16.0N,  60.6W,  45, 1006,  130, 0, 0,80, 0, 0, 0, 0, 0, 0, 0, 0,  
20110821, 1200,  , TS, 16.8N,  62.2W,  45, 1005,  130, 0, 0,70, 0, 0, 0, 0, 0, 0, 0, 0,  
20110821, 1800,  , TS, 17.5N,  63.7W,  50,  999,  130,20, 0,70,30, 0, 0, 0, 0, 0, 0, 0,  
20110822, 0000,  , TS, 17.9N,  65.0W,  60,  993,  130,30,30,90,30, 0, 0,30, 0, 0, 0, 0,  
20110822, 0600,  , HU, 18.2N,  65.9W,  65,  990,  130,60,60,90,40,25,20,35,25, 0, 0, 0,  
20110822, 1200,  , HU, 18.9N,  67.0W,  70,  989,  160,60,60,90,40,25,20,35,25, 0, 0, 0,  
20110822, 1800,  , HU, 19.3N,  68.0W,  75,  988,  160,60,40,90,40,30,20,35,25, 0, 0, 0, 

To learn more about the format of the HURDAT2 file, see the description at https://www.nhc.noaa.gov/data/hurdat/hurdat2-format-nov2019.pdf

### Read in data from official HURDAT2.csv file

In [39]:
def read_hurdat_lines(path="../data/HURDAT2.csv"):
    """
    read in official NHC HURDAT2.csv data file as a list of raw text lines;
    args: path (str): location of the HURDAT2 file, defaults to the project's data folder;
    returns: list of str, one entry per line of the file (trailing newlines are kept)
    """
    # 'with' guarantees the handle is closed even if reading raises part-way through
    with open(path, "r") as f:
        return f.readlines()

In [40]:
def hurdat_lines_to_df(lines):
    """
    convert HURDAT lines object to dataframe, accounting for necessary formatting of HURDAT data file, see note (1);
    every observation row is prefixed with the code and name taken from the most recent storm header row;
    args: lines (iterable of str): raw lines of the HURDAT2 file;
    returns: df with one row per storm observation; columns are unnamed (0, 1, 2, ...) and all values are
             still the raw comma-separated strings (only the storm code and name are stripped);
    raises: ValueError if an observation row appears before any storm header row
    """
    hurdat = []        # all observations, one list of fields per row
    storm_info = None  # [code, name] of the storm currently being read
    for line in lines:
        # blank lines (e.g. a trailing newline at end of file) carry no data
        if not line.strip():
            continue
        arr = line.split(",")
        # A storm header row starts with the basin prefix 'AL' (Atlantic);
        # it only updates the current storm and is not itself an observation
        if arr[0].strip().startswith("AL"):
            storm_info = [arr[0].strip(), arr[1].strip()]
        else:
            if storm_info is None:
                raise ValueError("observation row found before any storm header row: " + repr(line))
            # same storm as the previous row: prepend code and name, keep every raw field
            hurdat.append(storm_info + arr)
    return pd.DataFrame(hurdat)

### Prettify the Dataframe (Cut, Rename, Strip, Retype)

In [41]:
def cut_and_rename_columns(df):
    """
    give the first ten positional columns of the raw HURDAT frame readable names, see note (1);
    columns beyond the tenth keep their existing labels;
    returns a renamed copy of the given dataframe
    """
    new_labels = ['Code', 'Name', 'Date', 'Time', 'Record', 'Status', 'Lat', 'Lon', 'Wind', 'Pressure']
    # map whatever label currently sits at each position to its readable name
    mapping = {}
    for position, label in enumerate(new_labels):
        mapping[df.columns[position]] = label
    return df.rename(columns=mapping)

In [42]:
def strip_string_columns(df):
    """
    Remove leading/trailing whitespace from every object-dtype column of the raw HURDAT frame;
    the frame is modified in place and also returned
    """
    def _trim(column):
        # .str.strip() only touches the string values of the column
        return column.str.strip()

    text_columns = df.select_dtypes(['object']).columns
    df[text_columns] = df[text_columns].apply(_trim)
    return df

In [43]:
def create_datetime_column(df):
    """
    HURDAT stores the date (YYYYMMDD) and time (HHMM) as two text columns; merge them into a single
    'DateTime' column of real timestamps;
    the 'DateTime' column is added to the given frame, and a copy without 'Date' and 'Time' is returned
    """
    combined_text = df["Date"] + ' ' + df["Time"]  # e.g. '20110821 0600'
    df["DateTime"] = pd.to_datetime(combined_text, format='%Y%m%d %H%M')
    return df.drop(columns=['Date', 'Time'])  # the separate text columns are no longer needed

### Convert coordinates from string (19.7W) to float (-19.7)

In [44]:
def convert_coordinates(df):
    """
    convert HURDAT string coordinates to something that visualizations can understand (signed float);
    string coords are a number followed by a hemisphere letter (e.g. '19.7W'); the number is negated
    for the southern ('S') and western ('W') hemispheres, so '19.7W' becomes -19.7;
    args: df with string columns 'Lat' and 'Lon';
    returns: a new dataframe with float 'Lat' and 'Lon' columns; the given dataframe is left untouched
    """
    # work on a copy so the caller's frame is never left half-converted
    df = df.copy()
    for direc in ['Lat', 'Lon']:  # same treatment for both coordinates
        raw = df[direc].str.strip()
        magnitude = raw.str[:-1].astype(float)                  # numeric part
        is_negative = raw.str[-1:].str.upper().isin(['S', 'W'])  # hemisphere letter (N,E,S,W)
        # vectorised sign flip instead of a row-by-row apply
        df[direc] = magnitude.where(~is_negative, -magnitude)
    return df

### Create 'Rapid_NHC24' column denoting if storm was 'Rapidly Intensifying' (per NHC definition)

In [45]:
def within_24_hours(time, series):
    """
    flag the entries of a datetime series that fall in the 24 hours leading up to (and including) 'time';
    args: time (datetime): end of the window; series (series): datetimes of HURDAT observations;
    return: boolean series, True where  time - 24h <= entry <= time
    """
    window_start = time - timedelta(hours=24)
    not_too_old = series >= window_start
    not_in_future = series <= time
    return not_too_old & not_in_future

In [12]:
def get_progress(i,n,t):
    """ 
    print progress of loops to make sure they are running, not for use in final project;
    args: i: current position; n: total positions; t: how often to print progress (10 = print progress 10 times)
    prints the completed fraction i/n rounded to one decimal; prints nothing when n or t is not positive
    """
    if n <= 0 or t <= 0:
        return
    # n//t is 0 whenever n < t, which would make the modulo below divide by zero;
    # in that case simply report on every step
    step = max(1, n // t)
    if i % step == 0:
        print(round(i/n,1))

In [13]:
def create_rapid_NHC_column(df):
    """
    create boolean column if the storm was 'Rapidly Intensifying' (per NHC defn.) at the time of the observation;
    recall NHC says RI is an increase of at least 30 knots in 24 hours, so an observation is flagged when
    its wind exceeds the lowest wind of the same storm in the preceding 24 hours (inclusive) by 30 kt or more;
    args: df with columns 'Code', 'DateTime' (datetime) and 'Wind' (knots);
    returns given dataframe (modified in place) with boolean column 'Rapid_NHC24'
    """
    window = np.timedelta64(24, 'h')
    min_increase = 30  # knots; the definition is "at least 30", hence >= below
    all_times = df["DateTime"].to_numpy()
    all_winds = df["Wind"].to_numpy()
    flags = np.zeros(len(df), dtype=bool)
    # .indices gives row POSITIONS per storm, so the result does not depend on the index labels
    for positions in df.groupby("Code", sort=False).indices.values():
        times = all_times[positions]
        winds = all_winds[positions]
        for position, current_time, current_wind in zip(positions, times, winds):
            in_window = (times >= current_time - window) & (times <= current_time)
            if not in_window.any():  # e.g. missing timestamp: nothing to compare against
                continue
            flags[position] = (current_wind - winds[in_window].min()) >= min_increase
    df["Rapid_NHC24"] = flags
    return df

### Create 'Rapid_NHC{k}' column denoting if storm was Rapidly Intensifying, defined as an increase of 30 knots in maximum sustained wind speed over the previous k hours

In [14]:
def within_k_hours(time, series, k):
    """
    flag the entries of a datetime series that fall in the k hours leading up to (and including) 'time';
    args: time (datetime): end of the window; series (series): datetimes of HURDAT observations;
          k: length of the look-back window in hours;
    return: boolean series, True where  time - k hours <= entry <= time
    """
    window_start = time - timedelta(hours=k)
    not_too_old = series >= window_start
    not_in_future = series <= time
    return not_too_old & not_in_future

In [15]:
def create_k_rapid_NHC_column(df, K):
    """
    exactly the same as 'create_rapid_NHC_column' but with variable amount of hours instead of only 24; 
    create boolean columns if storm was 'Rapidly Intensifying' per modified NHC defn. at the time of the observation;
    we define modified defn. as RI if there is an increase of at least 30 knots in k hours, i.e. the current
    wind exceeds the lowest wind of the same storm in the preceding k hours (inclusive) by 30 kt or more;
    args: df of storm observations with columns 'Code', 'DateTime' (datetime) and 'Wind' (knots);
          K (list): hours to look back; one column 'Rapid_NHC{k}' is created for each k in K;
    returns given dataframe (modified in place) with the new boolean columns
    """
    min_increase = 30  # knots; the definition is "at least 30", hence >= below
    all_times = df["DateTime"].to_numpy()
    all_winds = df["Wind"].to_numpy()
    # one full-length boolean array per k, so every column exists (all False) even for an empty frame
    flags = {k: np.zeros(len(df), dtype=bool) for k in K}
    # .indices gives row POSITIONS per storm, so the result does not depend on the index labels
    for positions in df.groupby("Code", sort=False).indices.values():
        times = all_times[positions]
        winds = all_winds[positions]
        for k in K:  # iterate through multiple new modified RI columns
            window = pd.Timedelta(hours=k).to_timedelta64()
            for position, current_time, current_wind in zip(positions, times, winds):
                # all obs. of this storm within k hours before (and including) the current one
                in_window = (times >= current_time - window) & (times <= current_time)
                if not in_window.any():  # e.g. missing timestamp: nothing to compare against
                    continue
                flags[k][position] = (current_wind - winds[in_window].min()) >= min_increase
    for k in K:
        df[f"Rapid_NHC{k}"] = flags[k]
    return df

### Acceleration

In [16]:
def difference_in_hours(dt1,dt2):
    """
    calculate the difference between two datetimes in hours; 
    args: dt1 (datetime), dt2 (datetime); 
    returns: hours from dt2 to dt1 as a float (negative when dt1 is earlier than dt2);
    fractional hours are kept, so observations 90 minutes apart give 1.5 rather than 1, and
    observations less than an hour apart no longer collapse to 0
    """
    difference = dt1 - dt2 # timedelta (or pandas Timedelta)
    return difference.total_seconds() / 3600

In [17]:
def calculate_acceleration(delta_wind, delta_time, max_abs_accel=25):
    """
    calculate the acceleration per hour for a single storm interval;
    args: 
        delta_wind (float): the change in wind in an interval
        delta_time (float): the length in hours of the interval
        max_abs_accel (float): magnitude at or above which a result is treated as implausible (default 25)
    returns: acceleration per hour (float) of wind during the interval, or 0 when the interval has
             zero length, the inputs cannot be divided, or the result is implausibly large
    """
    try:
        accel = delta_wind/delta_time
    except (ZeroDivisionError, TypeError):
        # zero-length interval (or missing values): there is no meaningful rate, report 0
        return 0
    # a change of max_abs_accel or more per hour is far too high to be plausible, so return 0 because
    # something went wrong; inf/nan (e.g. numpy division by zero) also fail this comparison and give 0
    return accel if abs(accel) < max_abs_accel else 0

In [18]:
def create_acceleration_column(df):
    """
    storm observations usually come every 6 hours, though not always. this method calculates the change in 
    wind per hour since the previous observation of the same storm.
    The first observation of every storm gets 0, as does any row whose interval has zero length or whose
    rate is 25 or more per hour in magnitude (the same plausibility rule as 'calculate_acceleration').
    args: df with columns 'Code', 'DateTime' (datetime) and 'Wind';
    returns the given dataframe (modified in place), with a new float column 'Accel'
    """
    max_abs_accel = 25
    # group-wise differences compare each row with the previous row OF THE SAME STORM, independent of
    # how the index is labelled (the row-by-row version only worked on a 0..n-1 index)
    by_storm = df.groupby("Code", sort=False)
    delta_wind = by_storm["Wind"].diff()
    delta_hours = by_storm["DateTime"].diff().dt.total_seconds() / 3600
    accel = delta_wind / delta_hours
    # NaN (first row of a storm, 0/0) and inf (x/0) both fail this comparison and fall back to 0
    plausible = accel.abs() < max_abs_accel
    df["Accel"] = accel.where(plausible, 0.0)
    return df

### Calculate radii of high intensity

In [19]:
def calculate_wind_radii(df):
    """
    collapse the twelve raw wind-extent columns into three averages;
    HURDAT data has the extent of 34, 50 and 64kt winds in four quadrants each (positional columns 10-21);
    the four quadrant values of each threshold are averaged into a single radius;
    the given dataframe is modified in place (raw columns 10-22 are dropped) and returned with the new
    columns '34kt_radius', '50kt_radius', '64kt_radius'
    """
    # NOTE(review): values are averaged exactly as stored; if the file marks missing radii with a
    # sentinel number, that sentinel flows into the average - confirm against the HURDAT2 format notes
    quadrant_columns = {
        "34kt_radius": [10, 11, 12, 13],
        "50kt_radius": [14, 15, 16, 17],
        "64kt_radius": [18, 19, 20, 21],
    }
    for radius_name, columns in quadrant_columns.items():
        as_numbers = df[columns].astype(str).astype(int)  # raw fields are text
        df[radius_name] = as_numbers.mean(axis=1)
    # columns 10-21 are the raw quadrant values, 22 is the leftover field after the last comma
    df.drop(columns=list(range(10, 23)), inplace=True)
    return df

### Remove storms that do not reach Category X

In [20]:
def remove_storms_below_cat_X(df, X):
    """
    keep only the storms whose strongest observation reaches hurricane category X;
    args: df: full storm df with columns 'Code' and 'Wind'; X (int): required category, 1 to 5;
    returns: the rows of every storm whose maximum wind meets the category minimum;
             if X is not a valid category the full df is returned unchanged
    """
    # minimum wind speed for each hurricane category
    category_floor = {1: 64, 2: 83, 3: 96, 4: 113, 5: 137}
    if X not in range(1, 6):
        return df
    # broadcast each storm's peak wind onto all of its rows, then filter on it
    peak_wind = df.groupby('Code')['Wind'].transform('max')
    reaches_category = peak_wind >= category_floor[X]
    return df[reaches_category]

### Calculate bearing and distance between two observations

In [52]:
def create_bearing_column(df):
    """
    add the movement of each storm between consecutive observations;
    for every observation the previous observation OF THE SAME STORM is used to compute:
        'Bearing':  forward azimuth from the previous to the current position, as returned by
                    pyproj's Geod.inv on the WGS84 ellipsoid
        'Distance': geodesic distance between the two positions, converted from metres to miles
        'Speed':    Distance divided by the elapsed hours (miles per hour)
    the first observation of every storm has no predecessor and gets 0 for all three; 'Speed' is also 0
    when no time elapsed between the two observations;
    args: df with columns 'Code', 'DateTime' (datetime), 'Lat' and 'Lon' (signed floats);
    returns the given dataframe (modified in place) with the three new float columns
    """
    meters_per_mile = 1609.34
    geodesic = pyproj.Geod(ellps='WGS84')

    # shift/diff inside each storm pairs every row with its predecessor regardless of how the index
    # is labelled (the row-by-row version only worked on a 0..n-1 index)
    by_storm = df.groupby("Code", sort=False)
    previous_lat = by_storm["Lat"].shift().to_numpy(dtype=float)
    previous_lon = by_storm["Lon"].shift().to_numpy(dtype=float)
    current_lat = df["Lat"].to_numpy(dtype=float)
    current_lon = df["Lon"].to_numpy(dtype=float)
    delta_hours = (by_storm["DateTime"].diff().dt.total_seconds() / 3600).to_numpy(dtype=float)

    n_rows = len(df)
    bearing = np.zeros(n_rows)
    distance = np.zeros(n_rows)
    speed = np.zeros(n_rows)

    # rows that have a usable previous position (i.e. not the first row of a storm)
    has_previous = (
        np.isfinite(previous_lat) & np.isfinite(previous_lon)
        & np.isfinite(current_lat) & np.isfinite(current_lon)
    )
    if has_previous.any():
        # Geod.inv takes (lon1, lat1, lon2, lat2) and accepts arrays
        fwd_bearing, _back_bearing, meters = geodesic.inv(
            previous_lon[has_previous], previous_lat[has_previous],
            current_lon[has_previous], current_lat[has_previous])
        bearing[has_previous] = fwd_bearing
        distance[has_previous] = np.asarray(meters) / meters_per_mile

    # speed is only defined when some time actually elapsed
    has_duration = has_previous & np.isfinite(delta_hours) & (delta_hours != 0)
    speed[has_duration] = distance[has_duration] / delta_hours[has_duration]

    df["Bearing"] = bearing
    df["Distance"] = distance
    df["Speed"] = speed
    return df

# Main

In [53]:
# Read the raw HURDAT2 text and turn it into one row per observation
# (columns are still unnamed and every value is still a string at this point)
lines = read_hurdat_lines()
df = hurdat_lines_to_df(lines)

In [54]:
# Name the leading columns, trim padding, and merge Date/Time into a single DateTime column
df = cut_and_rename_columns(df)
df = strip_string_columns(df)
df = create_datetime_column(df)
# Wind and Pressure were parsed as text; cast them to integers
df[['Wind','Pressure']] = df[['Wind','Pressure']].astype(str).astype(int)

# Signed float coordinates, averaged wind radii, then per-observation bearing/distance/speed
df = convert_coordinates(df)
df = calculate_wind_radii(df)
df = create_bearing_column(df)

# Keep only storms whose peak wind reaches Category 1, then renumber the remaining rows 0..n-1
df = remove_storms_below_cat_X(df,1)
df = df.reset_index(drop=True)

In [55]:
# Flag rapid intensification over 6-, 12-, 18- and 24-hour look-back windows, then add wind acceleration
df = create_k_rapid_NHC_column(df,[6,12,18,24])
df = create_acceleration_column(df)

In [56]:
# Cache the fully processed frame on disk so it can be reloaded without recomputing the steps above
df.to_pickle("11_18_21.pkl")

In [2]:
# Reload the cached frame; unpickling can run arbitrary code, so only load pickle files you created yourself
df = pd.read_pickle("11_18_21.pkl")

In [4]:
# Spot-check the processed columns for a single storm (AL092017, shown as HARVEY in the output below)
df[df.Code == 'AL092017']

Unnamed: 0,Code,Name,Record,Status,Lat,Lon,Wind,Pressure,DateTime,34kt_radius,50kt_radius,64kt_radius,Bearing,Distance,Speed,Rapid_NHC6,Rapid_NHC12,Rapid_NHC18,Rapid_NHC24,Accel
32970,AL092017,HARVEY,,LO,13.7,-45.8,25,1013,2017-08-16 06:00:00,0.0,0.0,0.0,0.000000,0.000000,0.000000,False,False,False,False,0.000000
32971,AL092017,HARVEY,,LO,13.7,-47.4,25,1010,2017-08-16 12:00:00,0.0,0.0,0.0,-89.810518,107.544676,17.924113,False,False,False,False,0.000000
32972,AL092017,HARVEY,,LO,13.6,-49.0,25,1009,2017-08-16 18:00:00,0.0,0.0,0.0,-93.467749,107.786809,17.964468,False,False,False,False,0.000000
32973,AL092017,HARVEY,,LO,13.6,-50.6,25,1010,2017-08-17 00:00:00,0.0,0.0,0.0,-89.811875,107.589983,17.931664,False,False,False,False,0.000000
32974,AL092017,HARVEY,,TD,13.4,-52.0,25,1008,2017-08-17 06:00:00,0.0,0.0,0.0,-98.141887,95.178876,15.863146,False,False,False,False,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33039,AL092017,HARVEY,,EX,36.0,-87.1,25,1002,2017-09-01 12:00:00,0.0,0.0,0.0,38.974537,97.985267,16.330878,False,False,False,False,-0.833333
33040,AL092017,HARVEY,,EX,36.5,-86.4,20,1004,2017-09-01 18:00:00,0.0,0.0,0.0,48.385803,52.122835,8.687139,False,False,False,False,-0.833333
33041,AL092017,HARVEY,,EX,37.2,-85.6,20,1007,2017-09-02 00:00:00,0.0,0.0,0.0,42.327297,65.539447,10.923241,False,False,False,False,0.000000
33042,AL092017,HARVEY,,EX,37.9,-84.9,20,1009,2017-09-02 06:00:00,0.0,0.0,0.0,38.313317,61.707229,10.284538,False,False,False,False,0.000000


### Write to .csv file for R visualization

In [58]:
# Export one CSV line per storm SEGMENT: the values of observation i plus the position of
# observation i+1 (Lat2/Lon2), so the last observation of each storm only appears as an end point.
# The trailing comma on the header and on every row is kept so the file layout is unchanged.
storm_codes = df.Code.unique()
# 'with' closes the file even if a row fails to format part-way through the export
with open('R_viz.csv',"w") as f:
    f.write("Code,Name,Year,Month,Hour,Lat1,Lon1,Lat2,Lon2,Wind,Pressure,Accel,Bearing,Speed,Distance,Rapid_NHC24,Rapid_NHC18,Rapid_NHC12,Rapid_NHC6,kt34,kt50,kt64,\n")
    for storm_index, storm in enumerate(storm_codes):
        # get_progress(storm_index,len(storm_codes),10) 
        df_storm = df[df.Code == storm ]
        for i in range(len(df_storm)-1):
            # fetch each row once instead of once per field
            row = df_storm.iloc[i]
            next_row = df_storm.iloc[i+1]
            observed_at = row['DateTime']
            values = [
                row['Code'], row['Name'],
                observed_at.year, observed_at.month, observed_at.hour,
                row['Lat'], row['Lon'],            # start of the segment
                next_row['Lat'], next_row['Lon'],  # end of the segment
                row['Wind'], row['Pressure'], row['Accel'],
                row['Bearing'], row['Speed'], row['Distance'],
                row['Rapid_NHC24'], row['Rapid_NHC18'], row['Rapid_NHC12'], row['Rapid_NHC6'],
                row['34kt_radius'], row['50kt_radius'], row['64kt_radius'],
            ]
            f.write(",".join(f"{value}" for value in values) + ",\n")