In [None]:
import kagglehub
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

# Fetch the most recent published version of the dataset through kagglehub.
DATASET_HANDLE = "ahmeduzaki/global-earthquake-tsunami-risk-assessment-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

# Report where the files ended up.
# NOTE(review): the original remark said the files may "be saved outside of the
# current session" - presumably kagglehub's own download location; confirm.
print("Path to dataset files:", path)

In [None]:
import pandas as pd
from pathlib import Path

# Location of the CSV when the dataset is attached to a Kaggle notebook.
kaggle_csv = Path(
    '/kaggle/input/global-earthquake-tsunami-risk-assessment-dataset/earthquake_data_tsunami.csv'
)

# Prefer the Kaggle mount (original behaviour). Anywhere else, fall back to the
# directory returned by kagglehub.dataset_download() in the download cell
# (`path`) instead of failing on a path that only exists on Kaggle.
csv_path = kaggle_csv if kaggle_csv.exists() else Path(path) / kaggle_csv.name
df = pd.read_csv(csv_path)
print("Complete")

# Feature matrix for quick inspection.
X = df[['magnitude', 'cdi', 'mmi', 'sig', 'nst', 'dmin', 'gap', 'depth',
        'latitude', 'longitude', 'Year', 'Month']]
# NOTE(review): `y` holds the *name* of the target column, not the column
# itself. Kept unchanged because later code may index with it - confirm intent.
y = 'tsunami'

# Initial 75/25 split with a fixed seed.
train_df, test_df = train_test_split(
    df, test_size=0.25, random_state=42
)


In [None]:
class CustomDatasetFromDataFrame(Dataset):
    """Dataset that serves (features, target) tensor pairs taken from a DataFrame.

    Args:
        dataframe: pandas DataFrame containing the feature and target columns.
        features_cols: list of column names used as inputs.
        target_col: name of the label column; stored as ``torch.long``.
        mean: optional per-feature means, ordered like ``features_cols``.
            May be a pandas Series, NumPy array, list/tuple or tensor.
        std: optional per-feature standard deviations, same accepted types
            and ordering as ``mean``.

    Standardisation ``(x - mean) / (std + 1e-8)`` is applied only when BOTH
    ``mean`` and ``std`` are supplied; otherwise the raw values are kept.
    """

    def __init__(self, dataframe, features_cols, target_col, mean=None, std=None):
        # Features are stored as float32, targets as int64 class indices.
        self.features = torch.tensor(dataframe[features_cols].to_numpy(), dtype=torch.float32)
        self.targets = torch.tensor(dataframe[target_col].to_numpy(), dtype=torch.long)

        if mean is not None and std is not None:
            mean = self._as_float_tensor(mean)
            std = self._as_float_tensor(std)
            # The small epsilon keeps the division finite when a std is zero.
            self.features = (self.features - mean) / (std + 1e-8)

    @staticmethod
    def _as_float_tensor(stats):
        """Convert Series / ndarray / sequence / tensor statistics to float32."""
        if isinstance(stats, torch.Tensor):
            return stats.detach().clone().to(dtype=torch.float32)
        if hasattr(stats, "to_numpy"):  # pandas Series / Index
            stats = stats.to_numpy()
        return torch.tensor(stats, dtype=torch.float32)

    def __len__(self):
        """Number of samples (rows) in the dataset."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.features[idx], self.targets[idx]

In [None]:
import pandas as pd
import numpy as np

# Precondition: `train_df` already exists (it comes from the train/test split).

# Columns that feed the model.
features_cols = [
    'magnitude', 'cdi', 'mmi', 'sig', 'nst', 'dmin',
    'gap', 'depth', 'latitude', 'longitude', 'Year', 'Month',
]

# Per-feature mean and standard deviation, computed on the training rows only.
train_mean = train_df[features_cols].mean()
train_std = train_df[features_cols].std()

# A constant feature has a standard deviation of zero; substitute 1 so that
# dividing by it later cannot blow up.
train_std.loc[train_std == 0] = 1


In [None]:
# --- Step 1: name the label column and the input columns -------------------
target_col = 'tsunami'
features_cols = [
    'magnitude', 'cdi', 'mmi', 'sig', 'nst', 'dmin',
    'gap', 'depth', 'latitude', 'longitude', 'Year', 'Month',
]

# --- Step 2: 80/20 split, stratified on the label, fixed seed --------------
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df[target_col],
)

# --- Step 3: normalisation statistics come from the training rows only -----
train_mean = train_df[features_cols].mean()
train_std = train_df[features_cols].std()

# --- Step 4: wrap both splits, sharing the training statistics -------------
train_dataset, test_dataset = (
    CustomDatasetFromDataFrame(
        split_df, features_cols, target_col, mean=train_mean, std=train_std
    )
    for split_df in (train_df, test_df)
)

print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")
       

In [None]:
# Sanity check: show the first five (features, target) pairs of the training set.
print("First 5 samples from PyTorch training dataset (after fix):")
for i in range(5):
    features, targets = train_dataset[i]
    # One print call per sample; the newline separator keeps the layout of
    # four separate lines.
    print(
        f"Sample {i}:",
        f"  Features: {features}",
        f"  Target: {targets}",
        "-" * 20,
        sep="\n",
    )


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np


class TsunamiNN(nn.Module):
    """Fully connected classifier: repeated Linear -> ReLU -> Dropout, then a Linear head.

    Args:
        input_size: number of input features.
        hidden_sizes: iterable with the width of each hidden layer, in order.
            An empty iterable yields a single Linear layer.
        num_classes: number of output logits.
        dropout: dropout probability applied after every hidden activation
            (defaults to 0.3, the value previously hard-coded).
    """

    def __init__(self, input_size, hidden_sizes, num_classes, dropout=0.3):
        super().__init__()

        layers = []
        prev_size = input_size

        # Each hidden stage contributes three modules; the order is kept so
        # the indices inside ``self.network`` (and therefore state-dict keys)
        # stay the same as before.
        for hidden_size in hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_size = hidden_size

        # Output head producing raw logits (no softmax here).
        layers.append(nn.Linear(prev_size, num_classes))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by passing ``x`` through the network."""
        return self.network(x)


def train_model(model, train_loader, criterion, optimizer, device):
    """Train ``model`` for one epoch and return ``(epoch_loss, epoch_acc)``.

    Args:
        model: network to optimise; switched to training mode.
        train_loader: iterable yielding ``(features, labels)`` batches.
        criterion: loss callable. It is assumed to return the MEAN loss over
            the batch (the default reduction of ``nn.CrossEntropyLoss``).
        optimizer: optimiser updating ``model``'s parameters.
        device: device the batches are moved to.

    Returns:
        epoch_loss: loss averaged over samples, so a short final batch is not
            over-weighted.
        epoch_acc: accuracy in percent (0-100).

    Raises:
        ValueError: if ``train_loader`` yields no samples.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Statistics: weight the batch-mean loss by the batch size.
        batch_size = labels.size(0)
        running_loss += loss.item() * batch_size
        _, predicted = torch.max(outputs.detach(), 1)
        total += batch_size
        correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("train_loader yielded no samples")

    epoch_loss = running_loss / total
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc

def evaluate_model(model, test_loader, criterion, device):
    """Evaluate ``model`` without gradient tracking and return ``(loss, acc)``.

    Args:
        model: network to evaluate; switched to evaluation mode.
        test_loader: iterable yielding ``(features, labels)`` batches.
        criterion: loss callable. It is assumed to return the MEAN loss over
            the batch (the default reduction of ``nn.CrossEntropyLoss``).
        device: device the batches are moved to.

    Returns:
        epoch_loss: loss averaged over samples, so a short final batch is not
            over-weighted.
        epoch_acc: accuracy in percent (0-100).

    Raises:
        ValueError: if ``test_loader`` yields no samples.
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for features, labels in test_loader:
            features, labels = features.to(device), labels.to(device)

            outputs = model(features)
            loss = criterion(outputs, labels)

            # Weight the batch-mean loss by the batch size.
            batch_size = labels.size(0)
            running_loss += loss.item() * batch_size
            _, predicted = torch.max(outputs, 1)
            total += batch_size
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("test_loader yielded no samples")

    epoch_loss = running_loss / total
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc

# Fix PyTorch's global RNG state so weight initialisation, dropout masks and
# DataLoader shuffling start from the same point on every run.
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Shuffle only the training batches; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

input_size = len(features_cols)  # Number of features (derived, not hard-coded)
hidden_sizes = [64, 32, 16]  # Hidden layer sizes
num_classes = 2  # Binary classification (tsunami or not)

model = TsunamiNN(input_size, hidden_sizes, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50
best_test_acc = 0.0

print("Starting training...")
for epoch in range(num_epochs):
    train_loss, train_acc = train_model(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = evaluate_model(model, test_loader, criterion, device)

    # Save best model
    # NOTE(review): the checkpoint is chosen by TEST accuracy, so the reported
    # "best" figure is optimistic; consider a separate validation split.
    if test_acc > best_test_acc:
        best_test_acc = test_acc
        torch.save(model.state_dict(), 'best_tsunami_model.pth')

    # Report progress every fifth epoch.
    if (epoch + 1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'  Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')

print(f'\nBest Test Accuracy: {best_test_acc:.2f}%')