In [1]:
import pandas as pd


In [2]:
# Load your data
df = pd.read_csv("2001-2022storms.csv")
df

Unnamed: 0,ID,NAME,YEAR,MONTH,DAY,HOUR,LAT,LONG,STATUS,CATEGORY,WIND,PRESSURE,TROPICALSTORM_FORCE_DIAMETER,HURRICANE_FORCE_DIAMETER
0,ALLISON200106,Allison,2001,6,5,12,27.5,-95.0,Tropical Storm,,40,1007,,
1,ALLISON200106,Allison,2001,6,5,18,28.5,-95.3,Tropical Storm,,50,1002,,
2,ALLISON200106,Allison,2001,6,5,21,28.9,-95.3,Tropical Storm,,45,1003,,
3,ALLISON200106,Allison,2001,6,6,0,29.3,-95.3,Tropical Storm,,45,1003,,
4,ALLISON200106,Allison,2001,6,6,6,30.1,-95.2,Tropical Depression,,30,1006,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11582,NICOLE202211,Nicole,2022,11,10,19,29.2,-83.0,Tropical Storm,,40,989,300.0,0.0
11583,NICOLE202211,Nicole,2022,11,11,0,30.1,-84.0,Tropical Storm,,35,992,300.0,0.0
11584,NICOLE202211,Nicole,2022,11,11,6,31.2,-84.6,Tropical Depression,,30,996,0.0,0.0
11585,NICOLE202211,Nicole,2022,11,11,12,33.2,-84.6,Tropical Depression,,25,999,0.0,0.0


In [3]:
df.to_csv('updated_data.csv', index=False)

In [4]:
# Convert the year, month, day, and hour columns to a datetime format
df['DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY', 'HOUR']])

# Group by the storm ID and calculate the duration for each storm
df['DURATION'] = df.groupby('ID')['DATE'].transform(lambda x: x.max() - x.min())

# Convert the duration to hours
df['DURATION'] = df['DURATION'].dt.total_seconds() / 3600  # This will give you the duration in hours

# Now, you can get the unique storm durations
unique_storm_durations = df.drop_duplicates(subset='ID')[['ID', 'DURATION']]

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Select your features
features = ['YEAR', 'MONTH', 'DAY', 'HOUR', 'STATUS', 'CATEGORY', 'WIND', 'PRESSURE', 'LAT', 'LONG']
X = df[features]

# One-hot encode the 'status' and 'category' columns
X = pd.get_dummies(X, columns=['STATUS', 'CATEGORY'])

# Select your target
y = df['DURATION']

# Split your data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize your data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Define the model
model = RandomForestRegressor()

# Train the model
model.fit(X_train, y_train)

In [7]:
# Evaluate the model
print('Test Score for storm duration prediction model: ', model.score(X_test, y_test))

Test Score for storm duration prediction model:  0.9247555574070968


In [8]:
# Assume next_storm is a DataFrame containing the features for a new storm
next_storm = pd.DataFrame([[2001, 8, 2, 12, 'Tropical Depression', '', 30, 1011, 25.7, -84.8]], 
                          columns=['YEAR', 'MONTH', 'DAY', 'HOUR', 'STATUS', 'CATEGORY', 'WIND', 'PRESSURE', 'LAT', 'LONG'])

# One-hot encode the 'status' and 'category' columns to match the training data
next_storm = pd.get_dummies(next_storm, columns=['STATUS', 'CATEGORY'])

# Add missing columns filled with zeros to match the columns of the training data
missing_cols = set(X.columns) - set(next_storm.columns)
for c in missing_cols:
    next_storm[c] = 0

# Order the columns to match the order of the training data columns
next_storm = next_storm[X.columns]

# Normalize the data
next_storm_normalized = scaler.transform(next_storm)

# Predict the duration
duration_pred = model.predict(next_storm_normalized)

print('Predicted duration: ', duration_pred[0])

Predicted duration:  138.12
