In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, OneHotEncoder, StandardScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.linear_model import SGDRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

from xgboost import XGBRegressor


In [None]:
# Load the already-cleaned frame; the first CSV column becomes the index.
cleaned_csv = 'data/csv_ready/jma_final'
df = pd.read_csv(cleaned_csv, index_col=0)

In [None]:
# Convert the index (taken from the CSV's first column) to datetimes so that
# date accessors such as `df.index.year` can be used on it.
df.index = pd.to_datetime(df.index)

In [None]:
df

In [None]:
df[df.id == 200]

In [None]:
# Columns whose next observation (within the same `id`) is paired with the current row.
next_step_cols = ['max_wind_kn', 'min_pressure_mBar', 'velocity_kn', 'direction_sin', 'direction_cos', 'x', 'y', 'z']

# A grouped shift(-1) keeps the frame's row order and index, so current and
# next-step values can be paired positionally. Merging on the index instead
# would match every pair of rows that share an index value; if two ids have an
# observation at the same timestamp, that pairs one id's current row with
# another id's next row.
next_step = df.groupby('id')[next_step_cols].shift(-1).add_suffix('_y')
current = df.rename(columns={col: f'{col}_x' for col in next_step_cols})

# Same column layout as an index merge: shifted columns get `_x` (current) and
# `_y` (next); all other columns keep their names. The last row of each id has
# no successor and is removed by dropna().
df = pd.concat([current, next_step], axis=1).dropna()

In [None]:
df[(df.index.year == 2023)]

In [None]:
df.x_x.corr(df.x_y)

In [None]:
df.columns

In [None]:
# Predictors: the current-step (`_x`) columns plus `enso`.
attr_cols = ['max_wind_kn_x', 'min_pressure_mBar_x', 'enso', 'velocity_kn_x', 'direction_sin_x', 'direction_cos_x', 'x_x', 'y_x', 'z_x']
# Targets: the next-step (`_y`) columns.
tgt_cols = ['max_wind_kn_y', 'min_pressure_mBar_y', 'velocity_kn_y', 'direction_sin_y', 'direction_cos_y', 'x_y', 'y_y', 'z_y']

attrs = df[attr_cols]
tgts = df[tgt_cols]

In [None]:
# Split by `id` so that all rows of one id land on the same side of the split.
groups = df['id']
splitter = GroupShuffleSplit(n_splits=2, test_size=0.2, random_state=97)

# Two splits are generated; only the last one is kept, exactly as a loop that
# overwrites its results on every iteration would leave it.
splits = list(splitter.split(attrs, tgts, groups))
train_idx, test_idx = splits[-1]

X_train, X_test = attrs.iloc[train_idx], attrs.iloc[test_idx]
y_train, y_test = tgts.iloc[train_idx], tgts.iloc[test_idx]
    

In [None]:
# Strip the `_x` suffix from the non-coordinate predictors (x_x, y_x, z_x keep theirs).
x_suffixed = ['max_wind_kn', 'min_pressure_mBar', 'velocity_kn', 'direction_sin', 'direction_cos']
X_train = X_train.rename(columns={f'{name}_x': name for name in x_suffixed})

In [None]:
# Strip the `_y` suffix from the non-coordinate targets (x_y, y_y, z_y keep theirs).
y_suffixed = ['max_wind_kn', 'min_pressure_mBar', 'velocity_kn', 'direction_sin', 'direction_cos']
y_train = y_train.rename(columns={f'{name}_y': name for name in y_suffixed})

In [None]:
X_train

In [None]:
y_train

In [None]:
# # Define the parameter grid
# param_grid = {
#     'regressor__estimator__n_estimators': [100, 200, 300],         # Number of trees
#     'regressor__estimator__max_depth': [5, 10, 15],                # Depth of each tree
#     'regressor__estimator__min_samples_split': [2, 5, 10],         # Minimum samples for splitting
# }

# # Set up TimeSeriesSplit for time-based cross-validation
# tscv = TimeSeriesSplit(n_splits=5)

# # Define GridSearchCV
# grid_search = GridSearchCV(
#     pipeline, param_grid, cv=tscv, scoring='r2', n_jobs=-1, verbose=2
# )

# # Fit the GridSearchCV to find the best parameters
# grid_search.fit(X_train, y_train)

# # Get the best parameters and score
# print("Best Parameters:", grid_search.best_params_)
# print("Best R^2 Score:", grid_search.best_score_)

In [None]:
# NOTE(review): `pipeline` is not defined in any cell above this one; the only
# definition visible in this notebook is in a later cell, so this call relies on
# out-of-order execution and fails on a fresh Restart & Run All.
# NOTE(review): this fits on the full `attrs`/`tgts` frames, not on the
# X_train/y_train split created above — confirm that is intended.
pipeline.fit(attrs, tgts)

In [None]:
# Keep only the non-coordinate targets in the training labels; the coordinate
# columns are left out of the selection.
kept_targets = ['max_wind_kn', 'min_pressure_mBar', 'velocity_kn', 'direction_sin', 'direction_cos']
y_train = y_train.loc[:, kept_targets]

In [None]:
# Strip the `_x` suffix from the non-coordinate predictors (x_x, y_x, z_x keep theirs).
x_test_suffixed = ['max_wind_kn', 'min_pressure_mBar', 'velocity_kn', 'direction_sin', 'direction_cos']
X_test = X_test.rename(columns={f'{name}_x': name for name in x_test_suffixed})

In [None]:
# Strip the `_y` suffix from the non-coordinate targets (x_y, y_y, z_y keep theirs).
y_test_suffixed = ['max_wind_kn', 'min_pressure_mBar', 'velocity_kn', 'direction_sin', 'direction_cos']
y_test = y_test.rename(columns={f'{name}_y': name for name in y_test_suffixed})

In [None]:
# NOTE(review): this scores on the same `attrs`/`tgts` frames that were passed
# to `pipeline.fit` above, so it is an in-sample score; X_test/y_test are not
# used here.
pipeline.score(attrs, tgts)

In [None]:
X_test[:3]

In [None]:
y_test[:2]

In [None]:
pipeline.predict(X_test[2:3])

In [None]:
X_train


In [None]:
# Second copy of the cleaned CSV, with its first column parsed into a datetime index.
ds = pd.read_csv('data/csv_ready/jma_final', index_col=0)
ds = ds.set_axis(pd.to_datetime(ds.index), axis=0)

In [None]:
# Columns whose next observation (within the same `id`) is paired with the current row.
ds_next_cols = ['lat', 'lon', 'max_wind_kn', 'min_pressure_mBar', 'enso', 'velocity_kn', 'direction_deg']

# A grouped shift(-1) keeps the frame's row order and index, so current and
# next-step values can be paired positionally. Merging on the index instead
# would match every pair of rows that share an index value; if two ids have an
# observation at the same timestamp, that pairs one id's current row with
# another id's next row.
ds_next = ds.groupby('id')[ds_next_cols].shift(-1).add_suffix('_y')
ds_current = ds.rename(columns={col: f'{col}_x' for col in ds_next_cols})

# Same column layout as an index merge: shifted columns get `_x` (current) and
# `_y` (next); all other columns keep their names. The last row of each id has
# no successor and is removed by dropna().
ds = pd.concat([ds_current, ds_next], axis=1).dropna()

In [None]:
ds

In [None]:
# Predictors are the current-step (`_x`) columns; targets here are six next-step (`_y`) columns.
# NOTE(review): the next cell reassigns ds_attrs/ds_tgts with the same predictors
# and only the lat_y/lon_y targets, so this selection is superseded when the
# cells run in order.
ds_attrs, ds_tgts = ds[['lat_x', 'lon_x', 'max_wind_kn_x', 'min_pressure_mBar_x', 'velocity_kn_x', 'direction_deg_x', 'enso_x']], ds[['lat_y', 'lon_y', 'max_wind_kn_y', 'min_pressure_mBar_y', 'velocity_kn_y', 'direction_deg_y']]

In [None]:
# Predictors: the current-step (`_x`) columns. Targets: next-step latitude and longitude only.
ds_attr_cols = ['lat_x', 'lon_x', 'max_wind_kn_x', 'min_pressure_mBar_x', 'velocity_kn_x', 'direction_deg_x', 'enso_x']
ds_attrs = ds[ds_attr_cols]
ds_tgts = ds[['lat_y', 'lon_y']]

In [None]:
# Restore the unsuffixed `enso` name; the ColumnTransformer defined later in
# this notebook selects the column to one-hot encode by the name 'enso'.
ds_attrs = ds_attrs.rename(columns={'enso_x': 'enso'})

In [None]:
# Refit the same `pipeline` object on the lat/lon frames, replacing any earlier fit.
# NOTE(review): `pipeline` is only defined in a later cell of this notebook, and
# no train/test split is applied to ds_attrs/ds_tgts before fitting.
pipeline.fit(ds_attrs, ds_tgts)

In [None]:
# NOTE(review): scored on the same frames used for fitting in the previous
# cell, so this is an in-sample score.
pipeline.score(ds_attrs, ds_tgts)

In [None]:
ds_attrs[1:2]

In [None]:
ds_attrs[2:3]

In [None]:
pipeline.predict(ds_attrs[1:2])

In [None]:
attrs[1:3]

In [None]:
pipeline.predict(attrs[2:3])

In [4]:
def add_lags(df, features, n_lags, group_col):
    """Return a new frame: ``df`` plus per-group lagged copies of ``features``.

    For every ``lag`` in ``1..n_lags`` and every name in ``features`` a column
    named ``f"{feature}_lag_{lag}"`` is appended. It holds that feature shifted
    ``lag`` rows down within each ``group_col`` group, so the first ``lag``
    rows of each group are NaN. New columns are ordered lag by lag, and within
    a lag in the order of ``features``.

    The input frame is left untouched. All lag columns are attached in a
    single concat instead of being inserted one at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; rows of a group must already be in the desired order.
    features : iterable of str
        Column names to lag.
    n_lags : int
        Number of lags to create; values below 1 add no columns.
    group_col : str
        Column identifying the group within which values are shifted.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the lag columns appended.
    """
    features = list(features)
    by_group = df.groupby(group_col)

    lagged = [
        by_group[feature].shift(lag).rename(f"{feature}_lag_{lag}")
        for lag in range(1, n_lags + 1)
        for feature in features
    ]

    # If lag columns with these names already exist (the function is applied
    # twice), replace them rather than producing duplicate column labels.
    stale = [series.name for series in lagged if series.name in df.columns]
    base = df.drop(columns=stale)

    # The shifted series share df's index, so this is a positional column append.
    return pd.concat([base, *lagged], axis=1)


In [5]:
# Start again from the cleaned CSV, with its first column parsed into a datetime index.
df = pd.read_csv('data/csv_ready/jma_final', index_col=0)
df = df.set_axis(pd.to_datetime(df.index), axis=0)

In [None]:
df[:10]

In [6]:
# Features to lag within each `id`.
features = ['max_wind_kn', 'min_pressure_mBar', 'velocity_kn', 'direction_sin', 'direction_cos', 'x', 'y', 'z']

# Add lags 1-5, then discard rows where any value is missing (the leading rows
# of each id have no complete lag history).
df = add_lags(df, features, n_lags=5, group_col='id').dropna()


In [None]:
df.columns

In [None]:
pd.set_option('display.max_columns', None)
df[:10]

In [8]:
# Base features; their lagged copies are the predictors and their current values the targets.
lagged_features = ['max_wind_kn', 'min_pressure_mBar', 'velocity_kn', 'direction_sin', 'direction_cos', 'x', 'y', 'z']

# Predictor columns: `enso` (used unlagged) followed by lags 1-5 of every base
# feature, ordered lag by lag.
attr_cols = ['enso'] + [
    f'{name}_lag_{lag}'
    for lag in range(1, 6)
    for name in lagged_features
]

# Target columns: the unlagged base features.
tgt_cols = list(lagged_features)

# Splitting attributes and targets
attrs = df[attr_cols]
tgts = df[tgt_cols]

# Split by `id` so that all rows of one id land on the same side of the split.
groups = df['id']
splitter = GroupShuffleSplit(n_splits=2, test_size=0.2, random_state=97)

# Two splits are generated; only the last one is kept, exactly as a loop that
# overwrites its results on every iteration would leave it.
splits = list(splitter.split(attrs, tgts, groups))
train_idx, test_idx = splits[-1]

X_train, X_test = attrs.iloc[train_idx], attrs.iloc[test_idx]
y_train, y_test = tgts.iloc[train_idx], tgts.iloc[test_idx]
    

In [9]:
# One-hot encode `enso`; every other column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[('enso', OneHotEncoder(), ['enso'])],
    remainder='passthrough',
    force_int_remainder_cols=False,
)

# Rescale all columns produced by the preprocessor.
scaler = MinMaxScaler()

# Polynomial expansion is constructed but is not one of the pipeline steps below.
poly = PolynomialFeatures(degree=4, include_bias=False)

# Multi-target regressor wrapping an XGBoost base estimator.
regressor = MultiOutputRegressor(
    XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6)
)

pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('scaler', scaler),
    ('regressor', regressor),
])

In [10]:
X_train.shape, y_train.shape

((46766, 41), (46766, 8))

In [11]:
# Fit preprocessing, scaling and the regressor on the training rows of the group split.
pipeline.fit(X_train, y_train)

In [12]:
# In-sample score: evaluated on the same rows the pipeline was fitted on.
pipeline.score(X_train, y_train)

0.9427871704101562

In [13]:
# Held-out score: evaluated on the test rows of the group split, whose ids
# were not part of the training rows.
pipeline.score(X_test, y_test)

0.9010117053985596

In [14]:
X_test[6:7]

Unnamed: 0_level_0,enso,max_wind_kn_lag_1,min_pressure_mBar_lag_1,velocity_kn_lag_1,direction_sin_lag_1,direction_cos_lag_1,x_lag_1,y_lag_1,z_lag_1,max_wind_kn_lag_2,...,y_lag_4,z_lag_4,max_wind_kn_lag_5,min_pressure_mBar_lag_5,velocity_kn_lag_5,direction_sin_lag_5,direction_cos_lag_5,x_lag_5,y_lag_5,z_lag_5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1951-04-17 18:00:00,-1,35.0,998.0,10.988454,-0.363137,0.931736,-0.777589,0.583831,0.233445,35.0,...,0.568638,0.190809,35.0,1000.0,10.770351,-0.828177,0.560466,-0.811101,0.557455,0.177085


In [15]:
y_test[6:7]

Unnamed: 0_level_0,max_wind_kn,min_pressure_mBar,velocity_kn,direction_sin,direction_cos,x,y,z
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1951-04-17 18:00:00,35,996,9.119262,-0.436642,0.899636,-0.770826,0.587212,0.246999


In [None]:
pipeline.predict(X_test[6:7])

In [None]:
X_train[:10]

In [None]:
y_train[:10]

In [None]:
X_test[:1]

In [None]:
y_test[:1]

In [None]:
pipeline.predict(X_test[:1])

In [None]:
ds

In [16]:
# Fresh copy of the cleaned CSV, with its first column parsed into a datetime index.
ds = pd.read_csv('data/csv_ready/jma_final', index_col=0)
ds = ds.set_axis(pd.to_datetime(ds.index), axis=0)

In [17]:
# Features to lag within each `id` (latitude/longitude variant).
features = ['lat', 'lon', 'max_wind_kn', 'min_pressure_mBar', 'velocity_kn', 'direction_deg']

# Add lags 1-5, then discard rows where any value is missing (the leading rows
# of each id have no complete lag history).
ds = add_lags(ds, features, n_lags=5, group_col='id').dropna()


In [18]:
# Predictor column names: `enso` (used unlagged) followed by lags 1-5 of every
# entry in `features`, ordered lag by lag.
attrs = ['enso'] + [
    f'{feature}_lag_{i}'
    for i in range(1, 6)
    for feature in features
]

# attrs = [
#     'enso',  
#     'max_wind_kn_lag_1', 'min_pressure_mBar_lag_1', 'velocity_kn_lag_1', 'direction_sin_lag_1', 'direction_cos_lag_1', 'x_lag_1', 'y_lag_1', 'z_lag_1',
#     'max_wind_kn_lag_2', 'min_pressure_mBar_lag_2', 'velocity_kn_lag_2', 'direction_sin_lag_2', 'direction_cos_lag_2', 'x_lag_2', 'y_lag_2', 'z_lag_2',
#     'max_wind_kn_lag_3', 'min_pressure_mBar_lag_3', 'velocity_kn_lag_3', 'direction_sin_lag_3', 'direction_cos_lag_3', 'x_lag_3', 'y_lag_3', 'z_lag_3',
#     'max_wind_kn_lag_4', 'min_pressure_mBar_lag_4', 'velocity_kn_lag_4', 'direction_sin_lag_4', 'direction_cos_lag_4', 'x_lag_4', 'y_lag_4', 'z_lag_4',
#     'max_wind_kn_lag_5', 'min_pressure_mBar_lag_5', 'velocity_kn_lag_5', 'direction_sin_lag_5', 'direction_cos_lag_5', 'x_lag_5', 'y_lag_5', 'z_lag_5',
# ]

# Targets: the unlagged feature columns, minus the two that are not predicted here.
not_predicted = ['velocity_kn', 'direction_deg']
tgts = ds[list(features)].drop(columns=not_predicted)

# Predictors: select the columns named in the `attrs` list built above.
attrs = ds[attrs]

# Split by `id` so that all rows of one id land on the same side of the split.
groups = ds['id']
splitter = GroupShuffleSplit(n_splits=2, test_size=0.2, random_state=97)

# Two splits are generated; only the last one is kept, exactly as a loop that
# overwrites its results on every iteration would leave it.
splits = list(splitter.split(attrs, tgts, groups))
train_idx, test_idx = splits[-1]

X_train, X_test = attrs.iloc[train_idx], attrs.iloc[test_idx]
y_train, y_test = tgts.iloc[train_idx], tgts.iloc[test_idx]
    

In [19]:
# Refit the existing `pipeline` object on the lat/lon split; X_train/y_train
# were reassigned in the previous cell, so this replaces the earlier fit.
pipeline.fit(X_train, y_train)

In [20]:
# Held-out score for the lat/lon model, evaluated on the test rows of the group split.
pipeline.score(X_test, y_test)

0.9808290004730225

In [21]:
X_test[:2]

Unnamed: 0_level_0,enso,lat_lag_1,lon_lag_1,max_wind_kn_lag_1,min_pressure_mBar_lag_1,velocity_kn_lag_1,direction_deg_lag_1,lat_lag_2,lon_lag_2,max_wind_kn_lag_2,...,max_wind_kn_lag_4,min_pressure_mBar_lag_4,velocity_kn_lag_4,direction_deg_lag_4,lat_lag_5,lon_lag_5,max_wind_kn_lag_5,min_pressure_mBar_lag_5,velocity_kn_lag_5,direction_deg_lag_5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1951-04-16 06:00:00,-1,9.2,147.2,35.0,1000.0,11.608231,290.212189,8.8,148.3,35.0,...,35.0,1002.0,12.271852,284.173723,8.0,151.5,35.0,1002.0,0.0,0.0
1951-04-16 12:00:00,-1,9.6,146.4,35.0,1000.0,8.883069,296.876183,9.2,147.2,35.0,...,35.0,1002.0,10.356606,286.872171,8.3,150.3,35.0,1002.0,12.271852,284.173723


In [22]:
y_test[:1]

Unnamed: 0_level_0,lat,lon,max_wind_kn,min_pressure_mBar
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1951-04-16 06:00:00,9.6,146.4,35,1000


In [23]:
pipeline.predict(X_test[:1])

array([[  9.494847, 146.18164 ,  34.97659 , 998.8098  ]], dtype=float32)