# Analysis of caribbean hurricanes
Inspired by https://arxiv.org/abs/1802.02548, I also wanted to try to predict the tracks of hurricanes based on data from past storms.
I found this similar-looking dataset and will attempt similar predictions, but experiment with different methods (rather than starting with RNNs).

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import copy
warnings.simplefilter("ignore")
plt.style.use("bmh")

In [None]:
# Read the storm records from the Kaggle input directory into a DataFrame.
atlantic_csv_path = "/kaggle/input/hurricane-database/atlantic.csv"
df = pd.read_csv(atlantic_csv_path)

In [None]:
# Rewrite the "Time" column as "HH:MM:00" strings.
# Each value is converted to text, left-padded with zeros to four characters
# and split into a two-character hour part and a two-character minute part
# (presumably the raw values are HHMM integers -- verify against the CSV).
# Vectorised string operations replace the previous loop that issued one
# boolean-mask assignment per distinct time value.
padded_time = df["Time"].astype(str).str.rjust(4, "0")
df["Time"] = padded_time.str[0:2] + ":" + padded_time.str[2:4] + ":00"

In [None]:
# Rewrite the "Date" column as "YYYY-MM-DD" strings by slicing the text form
# of each value: characters 0-3 are the year, 4-5 the month, the rest the day.
# Vectorised string slicing replaces the previous loop that scanned the whole
# column once per distinct date.
date_text = df["Date"].astype(str)
df["Date"] = date_text.str[0:4] + "-" + date_text.str[4:6] + "-" + date_text.str[6:]

In [None]:
# Join the formatted date and time strings, parse them into a single
# "Datetime" column, remove the two source columns and sort all records
# by timestamp.
df["Datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"])
del df["Date"], df["Time"]
df.sort_values("Datetime", inplace=True)

In [None]:
# Remove leading and trailing whitespace from the textual columns.
for text_column in ("Name", "Status", "Event"):
    df[text_column] = df[text_column].str.strip()

In [None]:
# Report how many distinct storm IDs the data holds and which years it spans.
storm_count = len(df["ID"].unique())
observation_years = df["Datetime"].dt.year
print(f"Dataset contains data of {storm_count} individual storms from {observation_years.min()} to {observation_years.max()}.")

In [None]:
def coordinate_mapping(x):
    """Convert a hemisphere-suffixed coordinate string to a signed float.

    The last character of ``x`` is read as the hemisphere letter and
    everything before it is parsed with ``float``. A trailing "W" or "S"
    negates the value; any other trailing character leaves it unchanged.
    """
    magnitude, hemisphere = float(x[:-1]), x[-1]
    return -magnitude if hemisphere in ("W", "S") else magnitude

In [None]:
# Turn the hemisphere-suffixed coordinate strings into signed floats.
for coordinate_column in ("Latitude", "Longitude"):
    df[coordinate_column] = df[coordinate_column].apply(coordinate_mapping)

In [None]:
# Show the extreme values of both coordinates as a sanity check.
longitude_low, longitude_high = df.Longitude.min(), df.Longitude.max()
latitude_low, latitude_high = df.Latitude.min(), df.Latitude.max()
print("Min. Long.:", longitude_low, "Max. Long.:", longitude_high, "Min. Lat.:", latitude_low, "Max. Lat.:", latitude_high)

A minimum longitude of -359.1 makes no sense, as the scale runs from -180 to 180, so we have to wrap these values back into range by adding 360.

In [None]:
# Shift every longitude below -180 up by 360 so it lands inside [-180, 180].
below_range = df.Longitude < -180
df.loc[below_range, "Longitude"] = df.loc[below_range, "Longitude"] + 360

In [None]:
# Build a GeoDataFrame with one point geometry per record
# (x = longitude, y = latitude) in WGS 84.
# The CRS is given as the authority string "EPSG:4326"; the former
# {'init': 'epsg:4326'} dictionary form is deprecated by pyproj/GeoPandas
# and only kept working behind a warning.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.Longitude, df.Latitude),
    crs="EPSG:4326",
)

In [None]:
# Display the coordinate reference system attached to gdf.
gdf.crs

In [None]:
# Load the "naturalearth_lowres" dataset via GeoPandas' dataset helper; it is
# used below as the background map for the storm plots.
# NOTE(review): `gpd.datasets` is reported as deprecated in GeoPandas 0.14 and
# removed in 1.0 -- confirm against the installed version before upgrading.
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))

In [None]:
# Draw the track of storm AL092004 on top of the world map; the marker size
# follows the "Maximum Wind" column and the title shows the storm's name and year.
example_storm = gdf[gdf["ID"]=="AL092004"]
example_name = example_storm.Name.unique()[0]
example_year = example_storm.Datetime.dt.year.unique()[0]
ax = world.plot(color="white", edgecolor="black", figsize=(25,12))
_ = example_storm.plot(ax=ax, c="r", markersize="Maximum Wind", alpha=0.5)
_ = ax.set_xlim(-120, 20)
_ = ax.set_ylim(-10, 60)
_ = ax.set_title(f'{example_name} ({example_year})')

To make predictions I'll construct a simplified coordinate system in which the latitude is the x axis and the longitude is the y axis, and the positions of all storms lie between 0 and 1.

In [None]:
# Min-max scale the positions: x is the latitude and y the longitude, each
# shifted by its minimum and divided by the resulting maximum.
lat_min = gdf.Latitude.min()
long_min = gdf.Longitude.min()
latitude_offset = gdf.Latitude - lat_min
longitude_offset = gdf.Longitude - long_min
gdf["x"] = latitude_offset / latitude_offset.max()
gdf["y"] = longitude_offset / longitude_offset.max()

The month and season are important features because they carry implicit information such as the water temperature. A continuous number is much easier to work with than months/days etc., so I'll create a feature where the 1st of January corresponds to 0 and the 31st of December corresponds to 1.<br>Additionally I'll create the features "Month of year" and "Hour of day".

In [None]:
# Position of each observation within the year as a fraction:
# a day is 1/366 of a year (366 so that the 31st of December of a leap year
# is covered), an hour is 1/24 of a day and a minute is 1/(60*24) of a day.
# The minute term used to be written `minute/60.*24`, which evaluates as
# (minute/60)*24 and therefore added up to ~24 days instead of a fraction
# of one day; the divisor is now parenthesised.
gdf["relative_time"] = (
    gdf.Datetime.dt.dayofyear
    + gdf.Datetime.dt.hour / 24.0
    + gdf.Datetime.dt.minute / (60.0 * 24.0)
) / 366.0

In [None]:
# Calendar month and hour of day of every observation as separate features.
timestamp_parts = gdf.Datetime.dt
gdf["month"] = timestamp_parts.month
gdf["hour"] = timestamp_parts.hour

I remember from some news articles/documentaries that there is something like a "hurricane season"; let's see if this is reflected in the dataset.

In [None]:
# Scatter the maximum wind speed against the position within the year.
season_ax = gdf.plot(
    kind="scatter",
    x="relative_time",
    y="Maximum Wind",
    figsize=(20,12),
    title="Max. Wind vs. relative_time",
)
_ = season_ax.set_ylim(0,180)
_ = season_ax.set_xlim(0,1)

We'll try to predict the next position in the storm trajectory. To be able to do this we need to calculate the vectors for each positional change.

In [None]:
# Per-storm displacement between consecutive observations in the normalised
# (x, y) coordinates. The first observation of every storm has no predecessor
# and therefore gets NaN.
#
# groupby(...).diff() replaces the previous row-by-row loop, which used
# `last_x == 0` as its "no previous row yet" marker: an observation lying
# exactly at x == 0 was mistaken for a fresh start and the step following it
# was silently lost. It also avoids re-filtering the whole frame per storm.
#
# Rows are differenced in their current order within each "ID" group, so the
# frame is expected to be in chronological order at this point.
position_steps = gdf.groupby("ID", sort=False)[["x", "y"]].diff()
gdf["vec_x"] = position_steps["x"]
gdf["vec_y"] = position_steps["y"]

Because each model can only predict one value, we need two models to predict the trajectory. As independent predictions of the x and y position probably won't work well, I'll use one model to predict the direction of each step (angle) and a second one to predict its distance (length).

In [None]:
# Euclidean length of every step vector.
step_x = gdf["vec_x"]
step_y = gdf["vec_y"]
gdf["vec_len"] = np.sqrt((step_x**2)+(step_y**2))

In [None]:
def calculate_direction(vec_x, vec_y):
    """Return the heading of the vector (vec_x, vec_y) in radians.

    The angle is measured from the reference vector (0, 1) and grows towards
    (1, 0): (0, 1) -> 0, (1, 0) -> pi/2, (0, -1) -> pi, (-1, 0) -> 3*pi/2.
    The result lies in [0, 2*pi].

    Parameters
    ----------
    vec_x, vec_y : float
        Components of the vector.

    Returns
    -------
    float
        The heading, or NaN when a component is NaN or the vector has zero
        length (a zero vector has no direction).
    """
    # Explicit guard: the earlier dot-product/arccos formulation reached NaN
    # for these inputs only through a division by zero.
    if np.isnan(vec_x) or np.isnan(vec_y) or (vec_x == 0 and vec_y == 0):
        return np.nan
    # arctan2(x, y) is the signed angle from (0, 1) towards (1, 0); the modulo
    # maps the negative half (vec_x < 0) onto (pi, 2*pi).
    return np.arctan2(vec_x, vec_y) % (2 * np.pi)

In [None]:
# Heading of every step: calculate_direction is called once per row with that
# row's vec_x and vec_y.
gdf["vec_direction"] = [
    calculate_direction(step_x, step_y)
    for step_x, step_y in zip(gdf["vec_x"], gdf["vec_y"])
]

In [None]:
# Seconds elapsed since the previous observation of the same storm; the first
# observation of every storm gets NaN.
#
# total_seconds() is used on purpose: the former `.seconds` attribute only
# holds the sub-day remainder of a timedelta, so a gap of e.g. 1 day 6 hours
# was recorded as 21600 and looked like a regular 6-hour step.
#
# Rows are differenced in their current order within each "ID" group, so the
# frame is expected to be in chronological order at this point.
gdf["tdelta"] = gdf.groupby("ID", sort=False)["Datetime"].diff().dt.total_seconds()

In [None]:
# Keep only observations taken exactly 21600 seconds (6 hours) after the
# previous one, then display the remaining shape.
six_hour_step = gdf["tdelta"].eq(21600)
gdf = gdf[six_hour_step]
gdf.shape

In [None]:
# Length and direction of the preceding step of the same storm, taken from the
# previous remaining row; the first remaining row of every storm gets NaN.
#
# groupby(...).shift(1) replaces the previous loop, which tested
# `last_len == False` to detect the first row. Because 0.0 == False in Python,
# a step of length 0 (storm not moving) reset that marker and the following
# row lost its "previous" values.
previous_step = gdf.groupby("ID", sort=False)[["vec_len", "vec_direction"]].shift(1)
gdf["prev_len"] = previous_step["vec_len"]
gdf["prev_direction"] = previous_step["vec_direction"]

In [None]:
# Discard every row that has a missing value in any column, then display the
# remaining shape.
gdf = gdf.dropna(axis=0, how="any")
gdf.shape

## Prediction
Now I'll try to predict the direction and the distance for each time step.

### Model
The hypothesis for both models (distance traveled, direction) depends on the following features:
* Maximum Wind: The highest measured windspeed
* Position (x, y): Probably the most important feature, as storms behave similarly in the gulf etc. (my hope is that the models create some kind of artificial grid, e.g. if x>0.1 and x<0.2 etc., even though that requires A LOT of splits)
* Month of year: A latent variable for the general climatic conditions (temperature, jet stream etc.)
* Hour of day: A latent variable for temperature (usually the temperature at 12am is different than at 2am etc.)
* Previous distance/direction

In [None]:
# Both models share the same seven input features; only the target differs.
# The target is placed in the last column of each frame and split off from
# the feature matrix afterwards.
shared_features = ["Maximum Wind", "x", "y", "month", "hour", "prev_len", "prev_direction"]
gdf_prediction_direction = gdf[shared_features + ["vec_direction"]]
gdf_prediction_length = gdf[shared_features + ["vec_len"]]
direction_table = gdf_prediction_direction.values
length_table = gdf_prediction_length.values
X_direction, y_direction = direction_table[:,:-1], direction_table[:,-1]
X_length, y_length = length_table[:,:-1], length_table[:,-1]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20 % of the rows of each data set; both splits use the same
# random_state.
split_options = {"test_size": 0.2, "random_state": 42}
(
    X_direction_train,
    X_direction_test,
    y_direction_train,
    y_direction_test,
) = train_test_split(X_direction, y_direction, **split_options)
(
    X_length_train,
    X_length_test,
    y_length_train,
    y_length_test,
) = train_test_split(X_length, y_length, **split_options)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Gradient-boosted regressor for the step direction, scored with the mean
# squared error on the held-out rows.
# `loss` is left at its default (squared error): the explicit 'ls' alias was
# deprecated in scikit-learn 1.0 and removed in 1.2, whereas the default
# selects the same loss in old and new releases.
model_direction = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=0,
).fit(X_direction_train, y_direction_train)
mean_squared_error(y_direction_test, model_direction.predict(X_direction_test))

In [None]:
# Gradient-boosted regressor for the step length, scored with the mean
# squared error on the held-out rows.
# `loss` is left at its default (squared error): the explicit 'ls' alias was
# deprecated in scikit-learn 1.0 and removed in 1.2, whereas the default
# selects the same loss in old and new releases.
model_length = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=0,
).fit(X_length_train, y_length_train)
mean_squared_error(y_length_test, model_length.predict(X_length_test))

**That's not bad!**<br>As the direction is expressed in radians (multiples of PI), we can convert the error to degrees, which gives about 22.9°.<br>The error of the length/distance is very low.<br><br>Let's look at the feature importances with the help of the shap module.

In [None]:
import shap

In [None]:
# SHAP values of the direction model on the test rows, summarised as a bar
# chart of per-feature importance.
direction_feature_names = ["Maximum Wind", "x", "y", "month", "hour", "prev_len", "prev_direction"]
explainer_direction = shap.TreeExplainer(model_direction)
shap_values_direction = explainer_direction.shap_values(X_direction_test)
shap.summary_plot(
    shap_values_direction,
    X_direction_test,
    feature_names=direction_feature_names,
    plot_type="bar",
    title="Feature importances model_direction",
    plot_size=(20,10),
)

As expected the previous direction has a huge influence, followed by the latitude and the previous distance.

In [None]:
# SHAP values of the length model on the test rows, summarised as a bar
# chart of per-feature importance.
length_feature_names = ["Maximum Wind", "x", "y", "month", "hour", "prev_len", "prev_direction"]
explainer_length = shap.TreeExplainer(model_length)
shap_values_length = explainer_length.shap_values(X_length_test)
shap.summary_plot(
    shap_values_length,
    X_length_test,
    feature_names=length_feature_names,
    plot_type="bar",
    title="Feature importances model_length",
    plot_size=(20,10),
)

For the distance/length prediction the previous value is the most important one, too.

## Visualization of the results
In the last part of this notebook I'll predict the trajectory of a storm and plot the real (red) and predicted (blue) trajectories in the same plot.<br>
This involves some rather complicated transformations, similar to the calculation of the features above (only in the opposite direction).

In [None]:
# Offsets and scale factors needed to map normalised (x, y) positions back to
# latitude/longitude, computed from the full data set in `df`.
lat_min, long_min = df.Latitude.min(), df.Longitude.min()
temp_x, temp_y = df.Latitude - lat_min, df.Longitude - long_min
x_max, y_max = temp_x.max(), temp_y.max()

def coords_to_latlong(x, y, lat_min, long_min, x_max, y_max):
    """Map normalised (x, y) coordinates back to (latitude, longitude).

    ``x`` is multiplied by ``x_max`` and shifted by ``lat_min``; ``y`` is
    multiplied by ``y_max`` and shifted by ``long_min``.

    Returns
    -------
    tuple
        ``(latitude, longitude)``.
    """
    latitude = lat_min + (x * x_max)
    longitude = long_min + (y * y_max)
    return latitude, longitude

In [None]:
# List the distinct storm IDs recorded under the name "KATRINA".
gdf.loc[gdf["Name"] == "KATRINA", "ID"].unique()

In [None]:
# Pick one storm and assemble the model inputs for it. The first row is left
# out of the feature matrices with iloc[1:]; it serves as the starting
# position when the track is reconstructed further below.
storm_id = "AL122005"
model_inputs = ["Maximum Wind", "x", "y", "month", "hour", "prev_len", "prev_direction"]
storm_columns = ["Latitude", "Longitude"] + model_inputs + ["vec_len", "vec_direction"]
is_selected_storm = gdf.ID==storm_id
gdf_pred = gdf[storm_columns][is_selected_storm]
X_pred_direction = gdf_pred[model_inputs].iloc[1:].values
X_pred_len = gdf_pred[model_inputs].iloc[1:].values

In [None]:
# Predicted length and direction of every step of the selected storm.
predicted_lengths = model_length.predict(X_pred_len)
predicted_directions = model_direction.predict(X_pred_direction)
df_pred = pd.DataFrame({"pred_len": predicted_lengths, "pred_direction": predicted_directions})

In [None]:
# Placeholders for the predicted positions plus the observed positions of the
# same rows (everything after the storm's first row).
observed_positions = gdf_pred[["x", "y"]].iloc[1:]
df_pred["pred_x"] = np.nan
df_pred["pred_y"] = np.nan
df_pred["real_x"] = observed_positions["x"].values
df_pred["real_y"] = observed_positions["y"].values

In [None]:
# One-step-ahead positions: for every row the predicted step is added to the
# previous *observed* position, so prediction errors do not accumulate along
# the track.
reference_vector = np.array([0,1])
coords = {"x": [], "y": [], "id": []}
last_x, last_y = gdf_pred["x"].iloc[0], gdf_pred["y"].iloc[0]
for step_id, step in df_pred.iterrows():
    # Rotate the reference vector (0, 1) by the predicted angle ...
    cos_angle = np.cos(step.pred_direction)
    sin_angle = np.sin(step.pred_direction)
    rotation = np.array([[cos_angle, -sin_angle], [sin_angle, cos_angle]])
    heading = np.matmul(reference_vector, rotation)
    # ... then normalise it and scale it to the predicted step length.
    displacement = (heading/np.linalg.norm(heading)) * step.pred_len
    coords["x"].append(last_x+displacement[0])
    coords["y"].append(last_y+displacement[1])
    coords["id"].append(step_id)
    # The next step starts from the observed position of this row.
    last_x, last_y = step.real_x, step.real_y
df_pred.loc[coords["id"], "pred_x"] = coords["x"]
df_pred.loc[coords["id"], "pred_y"] = coords["y"]

In [None]:
# Convert the observed and the predicted normalised positions back to
# latitude/longitude. coords_to_latlong consists of plain arithmetic, so it
# is applied to whole columns at once instead of row by row through
# iterrows() with manual list bookkeeping.
real_latitude, real_longitude = coords_to_latlong(
    df_pred["real_x"], df_pred["real_y"], lat_min, long_min, x_max, y_max
)
pred_latitude, pred_longitude = coords_to_latlong(
    df_pred["pred_x"], df_pred["pred_y"], lat_min, long_min, x_max, y_max
)
# Assigned in this order to keep the column layout of df_pred unchanged.
df_pred["pred_Longitude"] = pred_longitude
df_pred["real_Longitude"] = real_longitude
df_pred["pred_Latitude"] = pred_latitude
df_pred["real_Latitude"] = real_latitude

In [None]:
# Display the table of predicted and observed steps/positions.
df_pred

In [None]:
# Two GeoDataFrames over the same table: one with the observed positions as
# geometry, one with the predicted positions.
# - The CRS is given as "EPSG:4326"; the former {'init': 'epsg:4326'}
#   dictionary form is deprecated by pyproj/GeoPandas.
# - Each frame is built from its own copy of df_pred so that the two
#   geometry columns cannot end up sharing storage with df_pred or with each
#   other.
gdf_real = gpd.GeoDataFrame(
    df_pred.copy(),
    geometry=gpd.points_from_xy(df_pred.real_Longitude, df_pred.real_Latitude),
    crs="EPSG:4326",
)
gdf_pred = gpd.GeoDataFrame(
    df_pred.copy(),
    geometry=gpd.points_from_xy(df_pred.pred_Longitude, df_pred.pred_Latitude),
    crs="EPSG:4326",
)

In [None]:
# Observed (red) and predicted (blue) positions on top of the world map.
ax = world.plot(color="white", edgecolor="black", figsize=(25,12))
for track, colour in ((gdf_real, "r"), (gdf_pred, "b")):
    _ = track.plot(ax=ax, c=colour, marker="x", alpha=0.5)
_ = ax.set_xlim(-120, 20)
_ = ax.set_ylim(-10, 60)