In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Show every file available under the Kaggle input directory, one full path
# per line, so the dataset locations used below can be checked at a glance.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Directory holding the competition files. Defined once so the notebook can be
# pointed at another copy of the data by editing a single line.
DATA_DIR = "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi"

# Training data, test data and the sample submission file.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

# Set random seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Request deterministic cuDNN kernels and switch off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Drop columns that are not used as model inputs. errors="ignore" keeps this
# cell safe to re-run: `train` is reassigned here, so on a second execution the
# columns are already gone and a plain drop would raise KeyError.
train = train.drop(columns=["Unnamed: 12", "id"], errors="ignore")

# Rename "TA1.x" to "TA1" so the column matches the test data and is therefore
# kept by the column intersection below.
train = train.rename(columns={"TA1.x": "TA1"})

# Feature columns = columns present in both files, with the target "DIC"
# removed from the training side first.
common_columns = train.drop(columns=["DIC"]).columns.intersection(test.columns)

print(f"Common columns: {len(common_columns)}")
print(f"Common columns list: {sorted(common_columns.tolist())}")

# Select the common columns for both train and test; "DIC" is the target.
X = train[common_columns]
y = train["DIC"]
test = test[common_columns]

# Standardise the features only; the target stays on its original scale.
# NOTE(review): the scaler is fitted on every training row before the split
# below, so the validation rows contribute to the scaling statistics.
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
test_scaled = scaler_X.transform(test)

# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=SEED)

In [None]:
from torch.utils.data import Dataset, DataLoader

class OceanChemistryDataset(Dataset):
    """Tensor-backed dataset that pairs each feature row with its target.

    Args:
        X: Array-like of features, indexed by sample along the first axis.
        y: Targets, one per sample. May be a pandas Series, a NumPy array or
            a plain sequence.

    Raises:
        ValueError: If ``X`` and ``y`` contain a different number of samples.
    """

    def __init__(self, X, y):
        # np.asarray accepts Series, ndarrays and lists alike, whereas reading
        # `y.values` only works for pandas objects. For a Series it returns the
        # values in positional order, ignoring the index, just like `.values`.
        features = np.asarray(X)
        targets = np.asarray(y)
        if len(features) != len(targets):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(features)} and {len(targets)}"
            )
        # Both tensors are stored as float32 copies of the inputs.
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap the training and validation splits in the dataset class.
train_dataset, val_dataset = (
    OceanChemistryDataset(features, targets)
    for features, targets in ((X_train, y_train), (X_val, y_val))
)

# Dedicated, seeded generator so the shuffling order of the training loader is
# reproducible from run to run.
g = torch.Generator().manual_seed(SEED)

# Both loaders use the same batch size; only the training loader shuffles.
loader_kwargs = dict(batch_size=256)
train_loader = DataLoader(train_dataset, shuffle=True, generator=g, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [None]:
class MLPModel(nn.Module):
    """Fully connected regression network.

    The network is a stack of ``Linear -> BatchNorm1d -> ReLU`` stages, one per
    entry of ``hidden_layers``, followed by a linear layer with one output.

    Args:
        input_size: Number of input features per sample.
        hidden_layers: Width of each hidden layer, in order from the input
            side to the output side. Any iterable of ints is accepted.
    """

    def __init__(self, input_size, hidden_layers=(128, 64)):
        # The default is a tuple rather than a list so that the default object
        # is immutable and cannot be altered through one call and then leak
        # into later ones.
        super().__init__()

        # Parallel lists: entry i of each list belongs to hidden stage i.
        self.layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList()
        self.activations = nn.ModuleList()

        # Width of the tensor entering the next layer; starts at the input.
        prev_size = input_size

        for hidden_size in hidden_layers:
            fc = nn.Linear(prev_size, hidden_size)
            # He (Kaiming) initialisation for the ReLU stages, zero bias.
            nn.init.kaiming_normal_(fc.weight, mode='fan_in', nonlinearity='relu')
            nn.init.constant_(fc.bias, 0)

            self.layers.append(fc)
            self.batch_norms.append(nn.BatchNorm1d(hidden_size))
            self.activations.append(nn.ReLU())

            prev_size = hidden_size

        # Output layer: one value per sample, Xavier-initialised, zero bias.
        self.output = nn.Linear(prev_size, 1)
        nn.init.xavier_normal_(self.output.weight)
        nn.init.constant_(self.output.bias, 0)

    def forward(self, x):
        """Run ``x`` through every hidden stage and the output layer.

        Returns the output of the final linear layer, which has a trailing
        dimension of size 1.
        """
        for fc, bn, activation in zip(self.layers, self.batch_norms, self.activations):
            x = fc(x)
            x = bn(x)
            x = activation(x)

        x = self.output(x)
        return x

# Hidden-layer widths, listed from the input side to the output side. Any
# other configuration works as well, e.g. [256, 128, 64].
hidden_layers = [128, 64]
model = MLPModel(X_train.shape[1], hidden_layers=hidden_layers)

# Print the resulting topology so the output records what was built.
for summary_line in (
    f"Input size: {X_train.shape[1]}",
    f"Hidden layers: {hidden_layers}",
    f"Model architecture: {X_train.shape[1]} -> {' -> '.join(map(str, hidden_layers))} -> 1",
):
    print(summary_line)

In [None]:
import torch.optim as optim

# Training objective (L1 loss, i.e. mean absolute error) and the Adam
# optimiser that updates the model's parameters.
criterion = nn.L1Loss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Training function
def _rmse(pred_chunks, target_chunks):
    """Root-mean-squared error over lists of per-batch arrays (NaN if empty)."""
    if not pred_chunks:
        return float("nan")
    errors = np.concatenate(pred_chunks) - np.concatenate(target_chunks)
    return float(np.sqrt(np.mean(errors ** 2)))


def train_model(model, train_loader, val_loader, epochs=5000,
                loss_fn=None, opt=None, log_every=100):
    """Train ``model`` and print train/validation loss and RMSE as it runs.

    Args:
        model: Network to optimise; updated in place.
        train_loader: Iterable of ``(X_batch, y_batch)`` used for optimisation.
        val_loader: Iterable of ``(X_batch, y_batch)`` used for evaluation only.
        epochs: Number of passes over ``train_loader``.
        loss_fn: Loss callable. Defaults to the notebook-level ``criterion``.
        opt: Optimizer. Defaults to the notebook-level ``optimizer``; pass one
            explicitly when training a model other than the one that global
            optimizer was built for.
        log_every: Print metrics every ``log_every`` epochs (and always for
            the final epoch).

    Returns:
        None. Metrics are printed; the model is left in evaluation mode.

    Raises:
        ValueError: If ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError("log_every must be a positive integer")

    # Fall back to the notebook-level objects so existing calls work unchanged.
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt

    for epoch in range(epochs):
        # ---- optimisation pass ----
        model.train()
        running_loss = 0.0
        train_batches = 0
        train_predictions = []
        train_targets = []

        for X_batch, y_batch in train_loader:
            y_flat = y_batch.reshape(-1)
            opt.zero_grad()
            # reshape(-1) always gives a 1-D tensor. A bare squeeze() collapses
            # a single-sample batch to 0-d, which cannot be iterated or
            # concatenated when the predictions are collected below.
            preds = model(X_batch).reshape(-1)
            loss = loss_fn(preds, y_flat)
            loss.backward()
            opt.step()
            running_loss += loss.item()
            train_batches += 1

            # Keep predictions and targets for the epoch-level RMSE.
            train_predictions.append(preds.detach().cpu().numpy())
            train_targets.append(y_flat.detach().cpu().numpy())

        train_rmse = _rmse(train_predictions, train_targets)

        # ---- evaluation pass (no gradient tracking, no parameter updates) ----
        val_loss = 0.0
        val_batches = 0
        val_predictions = []
        val_targets = []
        model.eval()
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                y_flat = y_batch.reshape(-1)
                preds = model(X_batch).reshape(-1)
                val_loss += loss_fn(preds, y_flat).item()
                val_batches += 1

                val_predictions.append(preds.cpu().numpy())
                val_targets.append(y_flat.cpu().numpy())

        val_rmse = _rmse(val_predictions, val_targets)

        # Also report the last epoch so the final metrics are always visible.
        if epoch % log_every == 0 or epoch == epochs - 1:
            # Reported losses are the mean of the per-batch loss values.
            print(f"Epoch {epoch+1}/{epochs} | "
                  f"Train Loss: {running_loss/max(train_batches, 1):.2f}, Train RMSE: {train_rmse:.2f} | "
                  f"Val Loss: {val_loss/max(val_batches, 1):.2f}, Val RMSE: {val_rmse:.2f}")

# Run the training loop; train_model prints its metrics while it runs.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=5000,
)

In [None]:
# Convert the scaled test features into a float32 tensor
test_tensor = torch.tensor(test_scaled, dtype=torch.float32)

# Set the model to evaluation mode
model.eval()

# Predict without tracking gradients. reshape(-1) always yields a 1-D array,
# even for a single test row, where a bare squeeze() would return a 0-d value
# that has no len().
with torch.no_grad():
    predictions = model(test_tensor).reshape(-1).cpu().numpy()

# Take the ids from the sample submission instead of hard-coding the first
# test id, and fail loudly if the row counts disagree rather than writing a
# misaligned file.
# NOTE(review): assumes sample_submission.csv has an "id" column listing the
# test rows in the same order as test.csv; confirm against the data.
if len(sub) != len(predictions):
    raise ValueError(
        f"sample submission has {len(sub)} rows but {len(predictions)} predictions were made"
    )

# Prepare submission
submission = pd.DataFrame({"id": sub["id"].to_numpy(), "DIC": predictions})
submission.to_csv("submission.csv", index=False)