In [1]:
# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

# basic tools
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.datasets import make_moons, make_circles, make_classification

# # classifiers from sklearns
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# # from xgboost import XGBClassifier
# from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score 

In [2]:
PATH = '/home/disk/orca/adaley17/MLGEO2024_TC_Tracks_Intensity/data/'
bt_data = 'clean/cleaned_best_track_data.csv'
ships_data = 'ai_ready/ai_ready_SHIPS_data.csv'

df_SHIPS = pd.read_csv(os.path.join(PATH, ships_data))
df_BT = pd.read_csv(os.path.join(PATH, bt_data))

In [3]:
df_SHIPS


Unnamed: 0,Original_Times,Code,Times,Daily_SST_Avg,Mid_Level_RH,Vert_Vel,Vshear,MSLP,Vmax,Latitude,Longitude
0,1982-06-02 12:00:00,AL011982,-12.0,,,,,,,,
1,1982-06-02 12:00:00,AL011982,-6.0,,,,,,,,
2,1982-06-02 12:00:00,AL011982,0.0,27.1,57.0,6.631183,15.227542,1005.0,10.28888,21.7,87.1
3,1982-06-02 12:00:00,AL011982,6.0,27.8,53.0,3.966363,15.536209,1004.0,12.86110,22.2,86.5
4,1982-06-02 12:00:00,AL011982,12.0,27.9,56.0,4.403641,15.330431,1003.0,15.43332,22.6,85.8
...,...,...,...,...,...,...,...,...,...,...,...
318545,2022-11-11 12:00:00,AL172022,96.0,,,,,,,,
318546,2022-11-11 12:00:00,AL172022,102.0,,,,,,,,
318547,2022-11-11 12:00:00,AL172022,108.0,,,,,,,,
318548,2022-11-11 12:00:00,AL172022,114.0,,,,,,,,


In [4]:
df_BT.head()

Unnamed: 0,ISO_TIME,USA_ATCF_ID,NAME,DIST2LAND,LANDFALL,STORM_SPEED,STORM_DIR,USA_LAT,USA_LON,USA_WIND,WMO_WIND,WMO_PRES,USA_PRES,USA_RMW,USA_WSPD
0,1982-06-02 12:00:00,AL011982,ALBERTO,22,40744.0,3.601108,47,21.7,-87.1,10.28888,10.28888,1005.0,1005.0,,10.28888
1,1982-06-02 15:00:00,AL011982,ALBERTO,59,109268.0,4.115552,48,21.9551,-86.8077,11.317768,,,1004.0,,11.317768
2,1982-06-02 18:00:00,AL011982,ALBERTO,93,172236.0,4.115552,52,22.2,-86.5,12.8611,12.8611,1004.0,1004.0,,12.8611
3,1982-06-02 21:00:00,AL011982,ALBERTO,130,209276.0,4.115552,58,22.4224,-86.165,13.889988,,,1003.0,,13.889988
4,1982-06-03 00:00:00,AL011982,ALBERTO,113,181496.0,4.115552,68,22.6,-85.8,15.43332,15.43332,1003.0,1003.0,,15.43332


## Attempting to use BT Data as verification

### Subsetting the first 30 hours of BT and SHIPS Data

In [5]:
# Subsampling the data so that we only have the first 24 hours of each storm
df_BT_24 = df_BT.groupby('USA_ATCF_ID').head(30) #This will be our Obseravtion data


# Filter the DataFrame for Times from 0 to 24 for each Code
filtered_df = df_SHIPS[(df_SHIPS['Times'] >= 0) & (df_SHIPS['Times'] <= 30)]

# Group by Code and extract the relevant columns
df_SHIPS_24 = filtered_df.groupby('Code').apply(lambda x: x[['Original_Times', 'Code', 'Times', 'Latitude', 'Longitude', 'Vmax', 'MSLP', 'Daily_SST_Avg', 'Mid_Level_RH', 'Vshear', 'Vert_Vel']]).reset_index(drop=True)

# df_SHIPS_24


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_SHIPS_24 = filtered_df.groupby('Code').apply(lambda x: x[['Original_Times', 'Code', 'Times', 'Latitude', 'Longitude', 'Vmax', 'MSLP', 'Daily_SST_Avg', 'Mid_Level_RH', 'Vshear', 'Vert_Vel']]).reset_index(drop=True)


### Identifying common codes across datasets

In [6]:
# Set 'Code' as an index level in df_SHIPS_24
df_SHIPS_24 = df_SHIPS_24.set_index('Code')

# Extract the unique IDs from both dataframes
bt_ids = df_BT_24['USA_ATCF_ID'].unique()
ships_ids = df_SHIPS_24.index.get_level_values('Code').unique()

# Find common IDs
common_ids = np.intersect1d(bt_ids, ships_ids)

# Find IDs only in df_BT_24
bt_only_ids = np.setdiff1d(bt_ids, ships_ids)

# Find IDs only in df_SHIPS_24
ships_only_ids = np.setdiff1d(ships_ids, bt_ids)

print(f"Common IDs: {len(common_ids)}")
print(f"IDs only in df_BT_24: {len(bt_only_ids)}")
print(f"IDs only in df_SHIPS_24: {len(ships_only_ids)}")



Common IDs: 539
IDs only in df_BT_24: 23
IDs only in df_SHIPS_24: 71


### Keeping Common IDs

In [7]:
# Filter df_BT_24 to keep only common IDs
df_BT_24_common = df_BT_24[df_BT_24['USA_ATCF_ID'].isin(common_ids)]

# Filter df_SHIPS_24 to keep only common IDs
df_SHIPS_24_common = df_SHIPS_24[df_SHIPS_24.index.get_level_values('Code').isin(common_ids)]

# Check if the lengths of the filtered dataframes are the same
print(f"Length of df_BT_24_common: {len(df_BT_24_common)}")
print(f"Length of df_SHIPS_24_common: {len(df_SHIPS_24_common)}")

Length of df_BT_24_common: 15287
Length of df_SHIPS_24_common: 79440


### Matching Times

In [8]:
new_times = [None] * len(df_SHIPS_24_common['Original_Times']) #Empty list to store the new times
for i in range(len(df_SHIPS_24_common['Original_Times'])):
    
    original_time = pd.to_datetime(df_SHIPS_24_common['Original_Times'].iloc[i]) #Time when prediction was made

    # print(original_time, original_time + pd.Timedelta(hours=df_SHIPS_24_common['Times'].iloc[i]))
    new_times[i] = original_time + pd.Timedelta(hours=df_SHIPS_24_common['Times'].iloc[i]) #Add the time of prediction to the time when prediction was made

    
df_SHIPS_24_common.loc[:, 'New_Times'] = new_times #Add the new times to the DataFrame

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_SHIPS_24_common.loc[:, 'New_Times'] = new_times #Add the new times to the DataFrame


In [9]:
df_SHIPS_24_common

# Convert ISO_TIME to datetime format
df_BT_24_common['ISO_TIME'] = pd.to_datetime(df_BT_24_common['ISO_TIME'])

df_merged = pd.merge(df_SHIPS_24_common.reset_index(), 
df_BT_24_common[['ISO_TIME', 'USA_ATCF_ID', 'USA_WSPD', 'USA_PRES', 
                    'WMO_WIND', 'USA_WIND', 'STORM_DIR', 'WMO_PRES']], 
            left_on=['New_Times', 'Code'], right_on=['ISO_TIME', 'USA_ATCF_ID'], how='inner')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_BT_24_common['ISO_TIME'] = pd.to_datetime(df_BT_24_common['ISO_TIME'])


In [10]:
df_merged_clean = df_merged.dropna()
df_merged_clean = df_merged_clean.drop(columns=['Times', 'Original_Times', 'ISO_TIME', 'New_Times'])
df_merged_clean

Unnamed: 0,Code,Latitude,Longitude,Vmax,MSLP,Daily_SST_Avg,Mid_Level_RH,Vshear,Vert_Vel,USA_ATCF_ID,USA_WSPD,USA_PRES,WMO_WIND,USA_WIND,STORM_DIR,WMO_PRES
0,AL011982,21.7,87.1,10.28888,1005.0,27.1,57.0,15.227542,6.631183,AL011982,10.28888,1005.0,10.28888,10.28888,47,1005
1,AL011982,22.2,86.5,12.86110,1004.0,27.8,53.0,15.536209,3.966363,AL011982,12.86110,1004.0,12.86110,12.86110,52,1004
2,AL011982,22.2,86.5,12.86110,1004.0,27.8,53.0,15.536209,3.966363,AL011982,12.86110,1004.0,12.86110,12.86110,52,1004
3,AL011982,22.6,85.8,15.43332,1003.0,27.9,56.0,15.330431,4.403641,AL011982,15.43332,1003.0,15.43332,15.43332,68,1003
4,AL011982,22.6,85.8,15.43332,1003.0,27.9,56.0,15.330431,4.403641,AL011982,15.43332,1003.0,15.43332,15.43332,68,1003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32379,AL172022,26.5,77.3,30.86664,985.0,27.6,49.0,18.571428,5.730906,AL172022,30.86664,985.0,30.86664,30.86664,270,985
32380,AL172022,26.5,77.3,30.86664,985.0,27.5,49.0,18.571428,5.730906,AL172022,30.86664,985.0,30.86664,30.86664,270,985
32381,AL172022,26.5,77.3,30.86664,985.0,27.5,49.0,18.571428,5.730906,AL172022,30.86664,985.0,30.86664,30.86664,270,985
32382,AL172022,26.5,77.3,30.86664,985.0,27.5,49.0,18.571428,5.730906,AL172022,30.86664,985.0,30.86664,30.86664,270,985


# Pycaret

In [11]:
import pycaret
import joblib
from pycaret.regression import *
from pycaret.regression import setup


In [12]:
df_merged_clean

Unnamed: 0,Code,Latitude,Longitude,Vmax,MSLP,Daily_SST_Avg,Mid_Level_RH,Vshear,Vert_Vel,USA_ATCF_ID,USA_WSPD,USA_PRES,WMO_WIND,USA_WIND,STORM_DIR,WMO_PRES
0,AL011982,21.7,87.1,10.28888,1005.0,27.1,57.0,15.227542,6.631183,AL011982,10.28888,1005.0,10.28888,10.28888,47,1005
1,AL011982,22.2,86.5,12.86110,1004.0,27.8,53.0,15.536209,3.966363,AL011982,12.86110,1004.0,12.86110,12.86110,52,1004
2,AL011982,22.2,86.5,12.86110,1004.0,27.8,53.0,15.536209,3.966363,AL011982,12.86110,1004.0,12.86110,12.86110,52,1004
3,AL011982,22.6,85.8,15.43332,1003.0,27.9,56.0,15.330431,4.403641,AL011982,15.43332,1003.0,15.43332,15.43332,68,1003
4,AL011982,22.6,85.8,15.43332,1003.0,27.9,56.0,15.330431,4.403641,AL011982,15.43332,1003.0,15.43332,15.43332,68,1003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32379,AL172022,26.5,77.3,30.86664,985.0,27.6,49.0,18.571428,5.730906,AL172022,30.86664,985.0,30.86664,30.86664,270,985
32380,AL172022,26.5,77.3,30.86664,985.0,27.5,49.0,18.571428,5.730906,AL172022,30.86664,985.0,30.86664,30.86664,270,985
32381,AL172022,26.5,77.3,30.86664,985.0,27.5,49.0,18.571428,5.730906,AL172022,30.86664,985.0,30.86664,30.86664,270,985
32382,AL172022,26.5,77.3,30.86664,985.0,27.5,49.0,18.571428,5.730906,AL172022,30.86664,985.0,30.86664,30.86664,270,985


In [13]:
df_track_data = df_merged_clean[['Latitude', 'Longitude', 'STORM_DIR', 'Vshear', 'Daily_SST_Avg']]
df_intensity_data = df_merged_clean[['Vmax', 'USA_WSPD', 'MSLP',  'USA_PRES','Daily_SST_Avg', 'Mid_Level_RH', 'Vshear', 'Vert_Vel']]

In [14]:
df_track_data.Longitude.shape


(31316,)

# Building Track Model

## Compare Lat Models

In [46]:
s_latitude = setup(data=df_track_data, verbose=False,   
                   target='Latitude', session_id=42)

In [47]:
# compare best model
best_lat_model = compare_models(include=['et', 'rf', 'lightgbm', 'xgboost', 'catboost'], fold=5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.378,1.4952,1.2223,0.9765,0.0581,0.0196,0.922
rf,Random Forest Regressor,0.7046,2.0532,1.4328,0.9677,0.0675,0.0358,0.7
xgboost,Extreme Gradient Boosting,1.7945,6.2548,2.5003,0.9017,0.1199,0.0923,0.588
catboost,CatBoost Regressor,2.2946,9.3241,3.0532,0.8534,0.1435,0.1165,2.018
lightgbm,Light Gradient Boosting Machine,2.4951,10.7311,3.2756,0.8313,0.1527,0.1258,41.362


Processing:   0%|          | 0/25 [00:00<?, ?it/s]

The algorithms evaluated are shown above. The most promising algorithms are extra trees, random forrest and decision tree regressor along with Extreme Gradient Boosting. The Extra Trees Regressor Model seem to work best at predicting the lat position of the storm. We will save the model for hypertuning later. The important parameters are listed below. Now we will try to tune the model.

In [53]:
best_lat_model.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [None]:
ET_lat = create_model('et')
ET_lat.get_params()

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2784,1.102,1.0498,0.9822,0.0517,0.0147
1,0.2907,1.0336,1.0166,0.9836,0.0479,0.0148
2,0.34,1.3868,1.1776,0.978,0.0561,0.0175
3,0.298,1.0316,1.0157,0.9831,0.0477,0.0152
4,0.2594,0.9771,0.9885,0.985,0.046,0.0131
5,0.3474,1.5253,1.235,0.9761,0.0585,0.0178
6,0.3296,1.3134,1.146,0.98,0.0547,0.0174
7,0.3104,1.168,1.0808,0.9818,0.0531,0.0167
8,0.3098,1.3416,1.1583,0.9794,0.0546,0.0158
9,0.314,1.2666,1.1255,0.98,0.0526,0.016


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [48]:
tuned_best_lat_model = tune_model(best_lat_model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.9668,14.4084,3.7958,0.7679,0.1757,0.15
1,2.9911,14.9628,3.8682,0.7619,0.179,0.1514
2,3.1849,17.0392,4.1279,0.7299,0.189,0.1596
3,3.0308,14.9716,3.8693,0.7547,0.1797,0.1546
4,3.0558,15.4791,3.9344,0.7628,0.183,0.1571
5,3.0113,14.6259,3.8244,0.7712,0.179,0.1553
6,3.01,14.767,3.8428,0.7751,0.1779,0.1524
7,3.0744,15.4064,3.9251,0.7605,0.1849,0.1596
8,3.0622,15.4986,3.9368,0.7617,0.1822,0.1551
9,3.0903,15.4361,3.9289,0.7557,0.1806,0.1568


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Tuning the model appears to make the model perform worse. So we will revert to the previous model

In [51]:
tuned_best_lat_model.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

## Compare Lon Models

In [57]:
s_longitude = setup(data=df_track_data, verbose=False,   
                   target='Longitude', session_id=42)
best_lon_model = compare_models(include=['et', 'rf', 'lightgbm', 'xgboost', 'catboost'], fold=5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,1.5516,21.1199,4.5946,0.9505,0.0864,0.0298,0.964
rf,Random Forest Regressor,2.5752,24.993,4.9979,0.9414,0.0953,0.0492,0.702
xgboost,Extreme Gradient Boosting,5.7898,62.1583,7.8812,0.8542,0.145,0.1078,0.58
catboost,CatBoost Regressor,7.3206,89.3619,9.4527,0.7905,0.1721,0.1361,2.05
lightgbm,Light Gradient Boosting Machine,7.8852,101.4795,10.0734,0.7621,0.1829,0.1468,42.478


Processing:   0%|          | 0/25 [00:00<?, ?it/s]

The algorithms evaluated are shown above. The most promising algorithms are extra trees, random forrest and decision tree regressor along with Extreme Gradient Boosting. The Extra Trees Regressor Model seem to work best at predicting the lat position of the storm. We will save the model for hypertuning later. The important parameters are listed below. Now we will try to tune the model.

In [58]:
best_lat_model.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [59]:
ET_lon = create_model('et')
ET_lon.get_params()

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.3061,18.2374,4.2705,0.9573,0.0809,0.025
1,1.3576,20.5782,4.5363,0.9517,0.0876,0.0261
2,1.2542,17.7694,4.2154,0.9565,0.0712,0.0217
3,1.3346,17.7005,4.2072,0.9588,0.0823,0.0269
4,1.1285,14.0175,3.744,0.9674,0.0743,0.0235
5,1.3958,20.9995,4.5825,0.9502,0.0916,0.028
6,1.4082,17.266,4.1552,0.9603,0.0789,0.0273
7,1.2905,16.7846,4.0969,0.9612,0.0773,0.025
8,1.1427,13.8093,3.7161,0.9674,0.0653,0.0207
9,1.2795,19.1225,4.3729,0.9557,0.0804,0.0245


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [60]:
tuned_best_lon_model = tune_model(best_lon_model)
tuned_best_lon_model.get_params()

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,9.3428,135.9587,11.6601,0.6813,0.2175,0.1809
1,9.1895,130.5916,11.4277,0.6937,0.2122,0.1778
2,8.875,123.6303,11.1189,0.697,0.1999,0.1638
3,9.3294,131.2397,11.456,0.6948,0.2147,0.1812
4,9.0484,125.0979,11.1847,0.709,0.2122,0.1788
5,9.2002,134.0059,11.5761,0.6819,0.2132,0.1751
6,9.1698,130.2526,11.4128,0.7005,0.2094,0.1738
7,9.1087,128.9119,11.3539,0.7022,0.2094,0.1751
8,9.2561,132.4731,11.5097,0.6871,0.2087,0.1742
9,9.0197,126.6127,11.2522,0.7067,0.206,0.1706


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

Tuning the model appears to make the model perform worse. So we will revert to the previous model

# Building Intensity Model

In [69]:

s_int = setup(data=df_intensity_data.drop(columns=['USA_WSPD']), verbose=False,   
                   target = 'Vmax', session_id=42)
best_int_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.1958,0.6256,0.7864,0.9935,0.0397,0.0106,0.093
rf,Random Forest Regressor,0.4816,1.094,1.0427,0.9886,0.051,0.024,0.158
dt,Decision Tree Regressor,0.2552,1.6298,1.2721,0.983,0.0614,0.0135,0.016
xgboost,Extreme Gradient Boosting,1.4892,4.4518,2.109,0.9534,0.0952,0.0698,0.041
knn,K Neighbors Regressor,1.3069,4.4954,2.1184,0.953,0.0972,0.0622,0.021
catboost,CatBoost Regressor,1.868,6.4872,2.5455,0.9322,0.1132,0.0869,1.278
lightgbm,Light Gradient Boosting Machine,2.0656,7.8173,2.7946,0.9182,0.1239,0.0962,42.469
gbr,Gradient Boosting Regressor,2.4347,10.8192,3.2879,0.8868,0.1434,0.1127,0.17
ridge,Ridge Regression,2.7357,13.5259,3.6769,0.8585,0.1606,0.1279,0.276
br,Bayesian Ridge,2.7357,13.5259,3.6769,0.8585,0.1606,0.1279,0.011


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

The algorithms evaluated are shown above. The most promising algorithms are extra trees, random forrest and decision tree regressor along with Extreme Gradient Boosting. The Extra Trees Regressor Model seem to work best at predicting the lat position of the storm. We will save the model for hypertuning later. The important parameters are listed below. Now we will try to tune the model.

In [70]:
best_int_model.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [71]:
ET_int = create_model('et')
ET_int.get_params()

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1949,0.5347,0.7313,0.9942,0.0354,0.01
1,0.1749,0.5419,0.7361,0.9942,0.0386,0.01
2,0.1979,0.6906,0.831,0.9927,0.0399,0.0103
3,0.2012,0.7102,0.8428,0.9931,0.0422,0.0102
4,0.1434,0.3629,0.6024,0.9962,0.0307,0.0077
5,0.2274,0.8438,0.9186,0.991,0.0494,0.013
6,0.205,0.6624,0.8139,0.9932,0.0413,0.0114
7,0.2095,0.7484,0.8651,0.9921,0.042,0.0111
8,0.1872,0.5467,0.7394,0.9944,0.0375,0.0103
9,0.2168,0.614,0.7836,0.9935,0.0399,0.012


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

Tuning the model appears to make the model perform worse. So we will revert to the previous model