In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath /kaggle/input,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e3/sample_submission.csv
/kaggle/input/playground-series-s5e3/train.csv
/kaggle/input/playground-series-s5e3/test.csv


In [None]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import RFE

# Handling class imbalance
from imblearn.over_sampling import SMOTE

# Models
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Hyperparameter optimization
import optuna

# Metrics
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Saving models
import joblib


In [None]:
# Load the train and test CSVs of the competition
df_train = pd.read_csv('/kaggle/input/playground-series-s5e3/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s5e3/test.csv')

# Quick look at train dataset
print("Train Dataset Head:")
display(df_train.head())

print("\nTrain Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() appended a stray "None".
df_train.info()

# Check for missing values
print("\nMissing values in train:")
print(df_train.isnull().sum())

print("\nTrain dataset shape:", df_train.shape)
print("Test dataset shape:", df_test.shape)

# Check target variable distribution (relative frequency of each class)
print("\nTarget distribution (rainfall):")
print(df_train['rainfall'].value_counts(normalize=True))

# Check test dataset columns
print("\nTest Dataset Head:")
display(df_test.head())

print("\nMissing values in test:")
print(df_test.isnull().sum())


In [None]:
# Correct the misspelled 'temparature' header, in place, on both frames
column_fixes = {'temparature': 'temperature'}
for frame in (df_train, df_test):
    frame.rename(columns=column_fixes, inplace=True)


In [None]:
# Fill missing winddirection values in the test set with that column's median.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on the df_test['winddirection'] selection: the chained
# in-place form raises a FutureWarning on pandas 2.x and leaves df_test
# unchanged once copy-on-write is active.
if df_test['winddirection'].isnull().any():
    median_wind_dir = df_test['winddirection'].median()
    df_test['winddirection'] = df_test['winddirection'].fillna(median_wind_dir)


In [None]:
# Relative frequency of each rainfall class, to see how imbalanced the target is
target_column = df_train['rainfall']
rainfall_dist = target_column.value_counts(normalize=True)
print(rainfall_dist)

# The imbalance is addressed with SMOTE in a later cell, once the features are built


In [None]:
# Encode 'day' as sine/cosine of an angle with a period of 365,
# applying the same transformation to the train and test frames
for frame in (df_train, df_test):
    day_angle = 2 * np.pi * frame['day'] / 365
    frame['day_sin'] = np.sin(day_angle)
    frame['day_cos'] = np.cos(day_angle)


In [None]:
# temp_diff: spread between the maximum and minimum temperature columns
for frame in (df_train, df_test):
    frame['temp_diff'] = frame['maxtemp'] - frame['mintemp']


In [None]:
# humidity_index: humidity divided by (temperature + 1); the +1 offset keeps the
# denominator away from zero when temperature is 0
for frame in (df_train, df_test):
    shifted_temperature = frame['temperature'] + 1
    frame['humidity_index'] = frame['humidity'] / shifted_temperature


In [None]:
# Bucket windspeed into three labelled bins: (-0.1, 20], (20, 40], (40, inf)
wind_bin_edges = [-0.1, 20, 40, np.inf]
wind_bin_labels = ['Low', 'Medium', 'High']
for frame in (df_train, df_test):
    frame['windspeed_category'] = pd.cut(frame['windspeed'], bins=wind_bin_edges, labels=wind_bin_labels)


In [None]:
# One-hot encode windspeed_category in both frames; the new columns are prefixed 'windspeed_cat'
df_train, df_test = (
    pd.get_dummies(frame, columns=['windspeed_category'], prefix='windspeed_cat')
    for frame in (df_train, df_test)
)


In [None]:
# Remove the 'id' and 'day' columns from both frames (in place)
for frame in (df_train, df_test):
    frame.drop(columns=['id', 'day'], inplace=True)


In [None]:
# Bring df_test's columns in line with the training features
train_columns = list(df_train.columns)
test_columns = set(df_test.columns)

# Training features absent from the test frame become all-zero columns;
# the target ('rainfall') is never added
for name in train_columns:
    if name != 'rainfall' and name not in test_columns:
        df_test[name] = 0

# Columns that the training frame does not have are removed from the test frame
surplus = [name for name in df_test.columns if name not in train_columns]
df_test.drop(columns=surplus, inplace=True)

# Finally put the test columns in the same order as the training features
X_train_full = df_train.drop(columns=['rainfall'])
df_test = df_test[X_train_full.columns]


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns that are rescaled with MinMaxScaler
continuous_features = [
    'pressure', 'maxtemp', 'temperature', 'mintemp', 'dewpoint',
    'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed',
    'temp_diff', 'humidity_index',
]

# The scaler is fitted on the training frame only ...
scaler = MinMaxScaler().fit(df_train[continuous_features])

# ... and the fitted scaler then transforms train and test alike
for frame in (df_train, df_test):
    frame[continuous_features] = scaler.transform(frame[continuous_features])


In [None]:
from imblearn.over_sampling import SMOTE

# Define features (X) and target (y)
X = df_train.drop(columns=['rainfall'])
y = df_train['rainfall']

# Apply SMOTE to create a balanced dataset
smote = SMOTE(random_state=42, sampling_strategy=0.7)  # Do not oversample to 1.0 to avoid overfitting
X_resampled, y_resampled = smote.fit_resample(X, y)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_resampled, y_resampled,
    test_size=0.2,
    stratify=y_resampled,
    random_state=42
)


In [None]:
# Show the relative class frequencies of the training and validation targets
for heading, target in (
    ("Training Set Distribution:\n", y_train),
    ("Validation Set Distribution:\n", y_val),
):
    print(heading, target.value_counts(normalize=True))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter values explored for the Random Forest
rf_param_grid = dict(
    n_estimators=[300, 500, 700],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Untuned estimator handed to the search
rf_base_model = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by ROC-AUC with 3-fold CV on all cores.
# fit() returns the fitted search object, so construction and fitting are chained.
rf_grid_search = GridSearchCV(
    rf_base_model,
    rf_param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

# Keep the best estimator found by the search
rf_model_tuned = rf_grid_search.best_estimator_
print("✅ Random Forest tuning complete. Best params: ", rf_grid_search.best_params_)


In [None]:
# Validation-set outputs of the tuned Random Forest:
# positive-class probabilities (column 1) and hard class labels
rf_val_proba = rf_model_tuned.predict_proba(X_val)[:, 1]
rf_val_preds = rf_model_tuned.predict(X_val)

# ROC-AUC is computed from the probabilities, accuracy from the labels
rf_roc_auc = roc_auc_score(y_true=y_val, y_score=rf_val_proba)
rf_accuracy = accuracy_score(y_true=y_val, y_pred=rf_val_preds)


In [None]:
import optuna
from xgboost import XGBClassifier

def objective(trial):
    """Optuna objective: fit an XGBoost classifier with sampled hyperparameters and score it.

    Args:
        trial: the Optuna trial used to sample each hyperparameter.

    Returns:
        ROC-AUC of the positive-class probabilities on (X_val, y_val) for a
        model fitted on (X_train, y_train).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and samples the same log-uniform range.
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5)
    }
    # use_label_encoder is no longer passed: current XGBoost releases ignore it
    # and only report it as an unused parameter.
    model = XGBClassifier(**params, objective='binary:logistic',
                          eval_metric='logloss', random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, preds)

# Seed the sampler so repeated runs propose the same trials and arrive at the
# same best parameters. TPESampler is the sampler create_study uses by default,
# so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=30)

best_params_xgb = study.best_params
print("✅ XGBoost Optuna best params: ", best_params_xgb)

# Train final tuned XGBoost model with the best parameters found.
# use_label_encoder is no longer passed: current XGBoost releases ignore it
# and only report it as an unused parameter.
xgb_model_tuned = XGBClassifier(**best_params_xgb, objective='binary:logistic',
                                eval_metric='logloss', random_state=42)
xgb_model_tuned.fit(X_train, y_train)


In [None]:
# Validation-set outputs of the tuned XGBoost model:
# positive-class probabilities (column 1) and hard class labels
xgb_val_proba = xgb_model_tuned.predict_proba(X_val)[:, 1]
xgb_val_preds = xgb_model_tuned.predict(X_val)

# ROC-AUC is computed from the probabilities, accuracy from the labels
xgb_roc_auc = roc_auc_score(y_true=y_val, y_score=xgb_val_proba)
xgb_accuracy = accuracy_score(y_true=y_val, y_pred=xgb_val_preds)


In [None]:
from lightgbm import LGBMClassifier

# Define parameter grid for LightGBM
lgbm_param_grid = {
    'n_estimators': [300, 500, 800],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# subsample_freq=1 turns row bagging on at every iteration. With LightGBM's
# default subsample_freq=0 bagging is disabled, so the 'subsample' values in the
# grid had no effect and only tripled the number of fits.
lgbm_base_model = LGBMClassifier(random_state=42, subsample_freq=1)

# Grid search for LightGBM: 3-fold CV scored by ROC-AUC on all cores
lgbm_grid_search = GridSearchCV(estimator=lgbm_base_model, param_grid=lgbm_param_grid,
                                cv=3, n_jobs=-1, scoring='roc_auc', verbose=2)

lgbm_grid_search.fit(X_train, y_train)

# Save tuned LightGBM model
lgbm_model_tuned = lgbm_grid_search.best_estimator_
print("✅ LightGBM tuning complete. Best params: ", lgbm_grid_search.best_params_)


In [None]:
# Validation-set outputs of the tuned LightGBM model:
# positive-class probabilities (column 1) and hard class labels
lgbm_val_proba = lgbm_model_tuned.predict_proba(X_val)[:, 1]
lgbm_val_preds = lgbm_model_tuned.predict(X_val)

# ROC-AUC is computed from the probabilities, accuracy from the labels
lgbm_roc_auc = roc_auc_score(y_true=y_val, y_score=lgbm_val_proba)
lgbm_accuracy = accuracy_score(y_true=y_val, y_pred=lgbm_val_preds)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# The three tuned models act as base learners, given as (name, estimator) pairs
base_learners = [
    ('rf', rf_model_tuned),
    ('xgb', xgb_model_tuned),
    ('lgbm', lgbm_model_tuned),
]

# A logistic regression combines the base learners' outputs
meta_learner = LogisticRegression(max_iter=1000)

# Build the stack (5-fold CV, all cores) and fit it; fit() returns the fitted estimator
stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)
print("✅ Stacking model training complete.")


In [None]:
# Validation-set outputs of the stacking model:
# positive-class probabilities (column 1) and hard class labels
stack_val_proba = stacking_model.predict_proba(X_val)[:, 1]
stack_val_preds = stacking_model.predict(X_val)

# ROC-AUC is computed from the probabilities, accuracy from the labels
stack_roc_auc = roc_auc_score(y_true=y_val, y_score=stack_val_proba)
stack_accuracy = accuracy_score(y_true=y_val, y_pred=stack_val_preds)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Score the tuned Random Forest on the validation set
rf_val_proba = rf_model_tuned.predict_proba(X_val)[:, 1]  # positive-class probability
rf_val_preds = rf_model_tuned.predict(X_val)              # hard class labels
rf_roc_auc = roc_auc_score(y_true=y_val, y_score=rf_val_proba)
rf_accuracy = accuracy_score(y_true=y_val, y_pred=rf_val_preds)

# Headline metrics followed by the per-class report
print(f"✅ Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"✅ Random Forest ROC-AUC: {rf_roc_auc:.4f}")
print("Random Forest Classification Report:")
rf_report_text = classification_report(y_val, rf_val_preds)
print(rf_report_text)


In [None]:
# Collect the validation accuracy and ROC-AUC of every model, keyed by model name
models_comparison = {
    model_name: {'Accuracy': accuracy, 'ROC_AUC': roc_auc}
    for model_name, accuracy, roc_auc in [
        ('Random Forest', rf_accuracy, rf_roc_auc),
        ('XGBoost', xgb_accuracy, xgb_roc_auc),
        ('LightGBM', lgbm_accuracy, lgbm_roc_auc),
        ('Stacking Model', stack_accuracy, stack_roc_auc),
    ]
}

# One row per model, one column per metric
comparison_df = pd.DataFrame(models_comparison).transpose()
print("✅ Models Performance Comparison:")
display(comparison_df)

# The row label with the highest ROC-AUC names the best model
best_model_name = comparison_df['ROC_AUC'].idxmax()
print(f"🚀 The best model to use for test prediction is: **{best_model_name}**")


In [None]:
# Select the test columns in exactly the order of the training feature columns
X_test_final = df_test.loc[:, X_train.columns]
print(f"✅ Test feature shape after alignment: {X_test_final.shape}")


In [None]:
# Obtain the id column for the submission: take it from df_test when it is still
# there, otherwise read the original test CSV again and take it from that.
if 'id' in df_test.columns:
    ids = df_test['id']
else:
    original_test = pd.read_csv('/kaggle/input/playground-series-s5e3/test.csv')
    ids = original_test['id']


In [None]:
X_test_final = df_test[X_train.columns]
# Predict the positive-class probability (column 1 of predict_proba) instead of
# hard 0/1 labels: every model in this notebook is tuned and compared on ROC-AUC,
# a ranking metric, and thresholded labels reduce the ranking to two score levels.
# NOTE(review): assumes the submission is scored on ROC-AUC as well — confirm
# against the competition rules.
test_predictions = stacking_model.predict_proba(X_test_final)[:, 1]


In [None]:
# Two-column submission frame: the ids next to the model's test predictions
submission = pd.DataFrame(dict(id=ids, rainfall=test_predictions))

# Preview the first five rows
submission.head(5)


In [None]:
# Write the submission to the Kaggle working directory without the index column
submission.to_csv(
    path_or_buf='/kaggle/working/final_stacking_submission.csv',
    index=False,
)
print("✅ Final submission file saved and ready for upload!")
