In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load dataset
df = pd.read_csv('/kaggle/input/dataset-renamed/dataset_renamed.csv')  # Replace with your CSV path

# Drop geometry column
df = df.drop('geometry', axis=1)

# Fill missing values
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

# Encode categorical variables
le_land = LabelEncoder()
le_soil = LabelEncoder()
le_state = LabelEncoder()

df['Land Cover'] = le_land.fit_transform(df['Land Cover'])
df['Soil Type'] = le_soil.fit_transform(df['Soil Type'])
df['State'] = le_state.fit_transform(df['State'])

# Target encoding for District
mean_flood_by_district = df.groupby('District')['Flood Occurred'].mean()
df['District_encoded'] = df['District'].map(mean_flood_by_district)
df = df.drop('District', axis=1)

# Features and target
X = df.drop('Flood Occurred', axis=1)
y = df['Flood Occurred']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ==========================
# XGBoost Hyperparameter Grid
# ==========================
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'scale_pos_weight': [1, y_train.value_counts()[0]/y_train.value_counts()[1]]  # balance classes
}

xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

# GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

# Fit Grid Search
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score (CV):", grid_search.best_score_)

# Train best estimator on full training data
best_xgb = grid_search.best_estimator_
y_train_pred = best_xgb.predict(X_train)
y_test_pred = best_xgb.predict(X_test)

# ==========================
# Training Metrics
# ==========================
print("\n=== Training Metrics ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("Precision:", precision_score(y_train, y_train_pred))
print("Recall:", recall_score(y_train, y_train_pred))
print("F1 Score:", f1_score(y_train, y_train_pred))

# ==========================
# Testing Metrics
# ==========================
print("\n=== Test Metrics ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))

# Confusion matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


Fitting 5 folds for each of 486 candidates, totalling 2430 fits
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=0.7; total time=   0.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=1.0; total time=   0.6s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, scale_pos_weight=0.9742362173634075, subsample=0.7; total time=   0.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, scale_pos_weight=0.9742362173634075, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=100, scal

KeyboardInterrupt: 

In [2]:
# ==========================
# Import libraries
# ==========================
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# ==========================
# Load dataset
# ==========================
df = pd.read_csv('/kaggle/input/dataset-renamed/dataset_renamed.csv')  # Replace with your CSV path

# Drop geometry column
df = df.drop('geometry', axis=1)

# Fill missing values
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

# ==========================
# Encode categorical variables
# ==========================
le_land = LabelEncoder()
le_soil = LabelEncoder()
le_state = LabelEncoder()

df['Land Cover'] = le_land.fit_transform(df['Land Cover'])
df['Soil Type'] = le_soil.fit_transform(df['Soil Type'])
df['State'] = le_state.fit_transform(df['State'])

# Target encoding for District
mean_flood_by_district = df.groupby('District')['Flood Occurred'].mean()
df['District_encoded'] = df['District'].map(mean_flood_by_district)
df = df.drop('District', axis=1)

# ==========================
# Features and target
# ==========================
X = df.drop('Flood Occurred', axis=1)
y = df['Flood Occurred']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ==========================
# XGBoost with GPU + Hyperparameter Tuning
# ==========================
param_grid_small = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [1, y_train.value_counts()[0]/y_train.value_counts()[1]]  # balance classes
}

xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    tree_method='hist',
    device='cuda'
)

grid_search_small = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_small,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

# Fit Grid Search
grid_search_small.fit(X_train, y_train)

# Best parameters and CV score
print("Best Parameters:", grid_search_small.best_params_)
print("Best F1 Score (CV):", grid_search_small.best_score_)

# ==========================
# Train best estimator
# ==========================
best_xgb = grid_search_small.best_estimator_
y_train_pred = best_xgb.predict(X_train)
y_test_pred = best_xgb.predict(X_test)

# ==========================
# Training Metrics
# ==========================
print("\n=== Training Metrics ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("Precision:", precision_score(y_train, y_train_pred))
print("Recall:", recall_score(y_train, y_train_pred))
print("F1 Score:", f1_score(y_train, y_train_pred))

# ==========================
# Testing Metrics
# ==========================
print("\n=== Test Metrics ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))

# Confusion matrix and classification report
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


Fitting 5 folds for each of 64 candidates, totalling 320 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=0.8; total time=   1.9s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=1.0; total time=   1.4s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=1.0; total time=   1.4s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=0.9742362173634075, subsample=0.8; total time=   1.4s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=0.9742362173634075, subsample=1.0; total time=   1.3s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=200, scale_pos_weight=1, subsample=0.8; total time=   2.6s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=200, scale_pos_weight=1, subsample=1.0; total time=   2.3s
[CV] END colsa

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


