In [85]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier, early_stopping
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost.callback import EarlyStopping

In [86]:
train_df = pd.read_csv('./dataset/train.csv')
test_df = pd.read_csv('./dataset/test.csv')

In [87]:
def create_time_features(df):
    # Convert time columns to datetime
    df['datetime'] = pd.to_datetime(df[['year', 'month', 'day', 'hour', 'minute']])
    # Extract time features
    df['day_of_week'] = df['datetime'].dt.dayofweek
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
    df['minute_sin'] = np.sin(2 * np.pi * df['minute']/24)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute']/24)
    return df

def create_spatial_features(df):
    # Avoid division by zero and handle size=0
    df['intensity_density'] = df['intensity'] / (df['size'].replace(1, np.nan))
    df['intensity_density'] = df['intensity_density'].fillna(0)
    df['storm_proximity'] = 1 / (df['distance'] + 1)

    return df

def create_storm_features(df):
    # Nosy Be Specific Cyclone Season (November to April)
    df['is_peak_cyclone_season'] = df['month'].apply(lambda x: 1 if x in [1, 2, 3] else 0)
    df['is_cyclone_season'] = df['month'].apply(lambda x: 1 if x in [11, 12, 1, 2, 3, 4] else 0)

    # Assign weights to months based on historical cyclone data
    cyclone_weights = {1: 0.9, 2: 0.8, 3: 0.7, 4: 0.4, 11: 0.6, 12: 0.7}
    df['cyclone_season_weight'] = df['month'].map(cyclone_weights).fillna(0)

    # Define day as 6 AM to 6 PM
    df['is_daytime'] = df['hour'].apply(lambda x: 1 if 6 <= x < 18 else 0)

    df['cyclone_daytime_interaction'] = df['is_cyclone_season'] * df['is_daytime']
    df['peak_cyclone_daytime_interaction'] = df['is_peak_cyclone_season'] * df['is_daytime']
    
    return df

def add_lag_features(df, lag_features, intervals):
    df = df.sort_values('datetime').reset_index(drop=True)
    for feat in lag_features:
        for lag_min, lag_steps in intervals.items():
            lag_col = f"{feat}_{lag_min}"
            df[lag_col] = df[feat].shift(lag_steps)
            df[lag_col] = df[lag_col].fillna(0)
    return df

def add_size_features(df):
    df['size_change_30'] = df['size'] - df['size_30']

    return df

def latlon_to_xy(df, lat_ref = -13.3 , lon_ref = 48.3 ):
    R = 6371.0  # Earth radius in kilometers
    rad = np.pi/180.0
    
    delta_lat = (df['lat'] - lat_ref) * rad
    delta_lon = (df['lon'] - lon_ref) * rad
    
    df['distance_y'] = delta_lat * R
    df['distance_x'] = delta_lon * R * np.cos(lat_ref * rad)

    df['radial_distance'] = np.sqrt(df['distance_x']**2 + df['distance_y']**2)
    df['bearing'] = np.arctan2(df['distance_y'], df['distance_x'])
    df['intensity_distance_interaction'] = df['radial_distance'] * df['intensity']
    return df


In [88]:
# Apply feature engineering
train_df = create_time_features(train_df)
test_df = create_time_features(test_df)

train_df = create_spatial_features(train_df)
test_df = create_spatial_features(test_df)

train_df = create_storm_features(train_df)
test_df = create_storm_features(test_df)

train_df = latlon_to_xy(train_df)
test_df = latlon_to_xy(test_df)

# Define lag features and intervals
lag_features = ['intensity', 'size', 'distance', 'intensity_density', 'minute_sin', 'minute_cos', 'bearing']
lag_intervals = {30: 2, 60: 4}  # 30min -> 2 steps, 60min -> 4 steps

train_df = add_lag_features(train_df, lag_features, lag_intervals)
test_df = add_lag_features(test_df, lag_features, lag_intervals)

train_df = add_size_features(train_df)
test_df = add_size_features(test_df)



# Prepare Training data
feature_cols = [
    'hour_sin', 'hour_cos', 
    'distance_x', 'distance_y', 'intensity', 'size', 'distance',
    'is_peak_cyclone_season', 
    'cyclone_season_weight', 'peak_cyclone_daytime_interaction', 'size_change_30', 'bearing'
]

# Add lag columns to feature_cols
for feat in lag_features:
    for lag_min in lag_intervals.keys():
        feature_cols.append(f"{feat}_{lag_min}")

X = train_df[feature_cols]
y_1h = train_df['Storm_NosyBe_1h']
y_3h = train_df['Storm_NosyBe_3h']

# Split training and validation sets
X_train, X_val, y1h_train, y1h_val = train_test_split(X, y_1h, test_size=0.2, random_state=42)
_, _, y3h_train, y3h_val = train_test_split(X, y_3h, test_size=0.2, random_state=42)

In [89]:
train_df['bearing']

0       -1.791004
1       -1.667355
2       -1.586907
3       -1.529838
4        0.015710
           ...   
51072   -2.380624
51073   -2.432236
51074   -2.492904
51075   -2.558417
51076   -2.592845
Name: bearing, Length: 51077, dtype: float64

In [93]:
# Prepare test features
X_test = test_df[feature_cols]

##############################################
# 2) XGBoost Models
##############################################
# We'll use eval_metric='logloss' for XGB

model_1h_xgb = XGBClassifier(n_estimators=1500, learning_rate=0.008, max_depth=4, random_state=21, use_label_encoder=False,
                 eval_metric='logloss', early_stopping_rounds=50,)
model_3h_xgb = XGBClassifier(n_estimators=1500, learning_rate=0.008, max_depth=4, random_state=21, use_label_encoder=False,
                 eval_metric='logloss', early_stopping_rounds=50,)

model_1h_xgb.fit(X_train, y1h_train,
                 eval_set=[(X_val, y1h_val)],             
                 verbose=1)
pred_1h = model_1h_xgb.predict_proba(X_train)[:, 1]
X_train_2 = X_train.copy()
X_train_2['pred_1h'] = pred_1h

model_3h_xgb.fit(X_train, y3h_train,
                 eval_set=[(X_val, y3h_val)],
                 verbose=1)

[0]	validation_0-logloss:0.27035
[1]	validation_0-logloss:0.26912
[2]	validation_0-logloss:0.26790
[3]	validation_0-logloss:0.26671
[4]	validation_0-logloss:0.26554
[5]	validation_0-logloss:0.26437
[6]	validation_0-logloss:0.26323
[7]	validation_0-logloss:0.26211
[8]	validation_0-logloss:0.26099
[9]	validation_0-logloss:0.25988
[10]	validation_0-logloss:0.25880
[11]	validation_0-logloss:0.25772
[12]	validation_0-logloss:0.25668
[13]	validation_0-logloss:0.25564
[14]	validation_0-logloss:0.25461
[15]	validation_0-logloss:0.25361
[16]	validation_0-logloss:0.25261
[17]	validation_0-logloss:0.25162
[18]	validation_0-logloss:0.25065
[19]	validation_0-logloss:0.24970
[20]	validation_0-logloss:0.24874
[21]	validation_0-logloss:0.24780
[22]	validation_0-logloss:0.24687
[23]	validation_0-logloss:0.24596
[24]	validation_0-logloss:0.24506
[25]	validation_0-logloss:0.24416
[26]	validation_0-logloss:0.24329
[27]	validation_0-logloss:0.24242
[28]	validation_0-logloss:0.24156
[29]	validation_0-loglos

Parameters: { "use_label_encoder" } are not used.



[57]	validation_0-logloss:0.22031
[58]	validation_0-logloss:0.21967
[59]	validation_0-logloss:0.21905
[60]	validation_0-logloss:0.21843
[61]	validation_0-logloss:0.21783
[62]	validation_0-logloss:0.21722
[63]	validation_0-logloss:0.21663
[64]	validation_0-logloss:0.21604
[65]	validation_0-logloss:0.21547
[66]	validation_0-logloss:0.21489
[67]	validation_0-logloss:0.21432
[68]	validation_0-logloss:0.21375
[69]	validation_0-logloss:0.21319
[70]	validation_0-logloss:0.21264
[71]	validation_0-logloss:0.21210
[72]	validation_0-logloss:0.21155
[73]	validation_0-logloss:0.21102
[74]	validation_0-logloss:0.21049
[75]	validation_0-logloss:0.20997
[76]	validation_0-logloss:0.20946
[77]	validation_0-logloss:0.20894
[78]	validation_0-logloss:0.20843
[79]	validation_0-logloss:0.20791
[80]	validation_0-logloss:0.20738
[81]	validation_0-logloss:0.20687
[82]	validation_0-logloss:0.20636
[83]	validation_0-logloss:0.20586
[84]	validation_0-logloss:0.20536
[85]	validation_0-logloss:0.20488
[86]	validatio

Parameters: { "use_label_encoder" } are not used.



[58]	validation_0-logloss:0.22684
[59]	validation_0-logloss:0.22645
[60]	validation_0-logloss:0.22605
[61]	validation_0-logloss:0.22566
[62]	validation_0-logloss:0.22528
[63]	validation_0-logloss:0.22489
[64]	validation_0-logloss:0.22452
[65]	validation_0-logloss:0.22415
[66]	validation_0-logloss:0.22377
[67]	validation_0-logloss:0.22340
[68]	validation_0-logloss:0.22304
[69]	validation_0-logloss:0.22268
[70]	validation_0-logloss:0.22233
[71]	validation_0-logloss:0.22198
[72]	validation_0-logloss:0.22163
[73]	validation_0-logloss:0.22129
[74]	validation_0-logloss:0.22095
[75]	validation_0-logloss:0.22061
[76]	validation_0-logloss:0.22027
[77]	validation_0-logloss:0.21994
[78]	validation_0-logloss:0.21961
[79]	validation_0-logloss:0.21927
[80]	validation_0-logloss:0.21896
[81]	validation_0-logloss:0.21865
[82]	validation_0-logloss:0.21834
[83]	validation_0-logloss:0.21803
[84]	validation_0-logloss:0.21771
[85]	validation_0-logloss:0.21739
[86]	validation_0-logloss:0.21708
[87]	validatio

In [94]:
pred_1h_xgb = model_1h_xgb.predict_proba(X_test)[:, 1]
pred_3h_xgb = model_3h_xgb.predict_proba(X_test)[:, 1]

submission_xgb = pd.DataFrame({
    'storm_id': test_df['storm_id'],
    'Storm_NosyBe_1h': pred_1h_xgb,
    'Storm_NosyBe_3h': pred_3h_xgb
})
submission_xgb.to_csv('./output/submission_xgb_102.csv', index=False)