In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input mount and print the full path of every file found,
# one path per line, in the order os.walk yields them.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Load the three competition tables from the Kaggle input mount:
#   train - rows read from train.csv (a later cell takes the "DIC" target from it)
#   test  - rows read from test.csv, to be scored by the trained model
#   sub   - the sample submission file
# NOTE(review): `sub` is not referenced by any later cell visible in this notebook.
train=pd.read_csv("/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/train.csv")
test=pd.read_csv("/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/test.csv")
sub=pd.read_csv("/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/sample_submission.csv")

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

# Seed every random number generator in play (Python, NumPy, PyTorch CPU/CUDA)
# and pin cuDNN to deterministic kernels so repeated runs give the same
# train/validation split and the same initial weights.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Drop the columns that are not predictors. `train` is reassigned here, so on a
# second execution of this cell the columns are already gone; errors="ignore"
# keeps the cell re-runnable instead of raising KeyError.
# Note: no missing-value handling is performed anywhere in this cell.
train = train.drop(columns=["Unnamed: 12", "id"], errors="ignore")

# Feature set = columns present in both train and test, excluding the target.
common_columns = train.drop(columns=["DIC"]).columns.intersection(test.columns)

# Same feature columns, in the same order, for both tables.
X = train[common_columns]
y = train["DIC"]
test = test[common_columns]

# Split BEFORE fitting the scaler. Fitting on all rows would let the validation
# rows influence the mean/std used to scale the training data (data leakage)
# and make the validation loss optimistic.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Standardise the features only; the target is left in its original units.
# The scaler learns its statistics from the training rows and is then applied
# unchanged to every other table.
scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train_raw)
X_val = scaler_X.transform(X_val_raw)
X_scaled = scaler_X.transform(X)        # full training table, kept for downstream use
test_scaled = scaler_X.transform(test)

In [None]:
from torch.utils.data import Dataset, DataLoader

class OceanChemistryDataset(Dataset):
    """In-memory regression dataset pairing feature rows with target values.

    Parameters
    ----------
    X : array-like or torch.Tensor
        Feature rows; stored as a float32 tensor in ``self.X``.
    y : array-like or torch.Tensor
        One target per row of ``X``; stored as a float32 tensor in ``self.y``.
        pandas Series are accepted regardless of their index.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of rows.
    """

    @staticmethod
    def _to_float_tensor(values):
        """Return a float32 tensor copy of ``values``."""
        if isinstance(values, torch.Tensor):
            return values.detach().clone().to(torch.float32)
        # Convert through NumPy first. Handing a pandas Series straight to
        # torch.tensor() can make torch treat it as a generic sequence and probe
        # it with obj[0]; pandas resolves that as a *label* lookup, which fails
        # once the index no longer contains 0 (e.g. after train_test_split).
        return torch.tensor(np.asarray(values), dtype=torch.float32)

    def __init__(self, X, y):
        self.X = self._to_float_tensor(X)
        self.y = self._to_float_tensor(y)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap each split in the tensor-backed dataset so DataLoader can batch it.
train_dataset = OceanChemistryDataset(X_train, y_train)
val_dataset = OceanChemistryDataset(X_val, y_val)

# A dedicated, seeded generator makes the training shuffle order repeatable.
# manual_seed() returns the generator itself, so it can be chained.
g = torch.Generator().manual_seed(SEED)

# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    generator=g,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=64,
    shuffle=False,
)

In [None]:
class MLPModel(nn.Module):
    """Fully connected regressor: input -> 128 -> 64 -> 1.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU; the output layer is a
    plain Linear producing one value per input row.

    Parameters
    ----------
    input_size : int
        Number of features in each input row.
    """

    def __init__(self, input_size):
        super().__init__()
        # First hidden block (input_size -> 128).
        self.fc1 = nn.Linear(in_features=input_size, out_features=128)
        self.bn1 = nn.BatchNorm1d(num_features=128)
        self.relu1 = nn.ReLU()
        # Second hidden block (128 -> 64).
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.bn2 = nn.BatchNorm1d(num_features=64)
        self.relu2 = nn.ReLU()
        # Regression head (64 -> 1), no activation.
        self.fc3 = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to a ``(batch, 1)`` tensor."""
        hidden = self.relu1(self.bn1(self.fc1(x)))
        hidden = self.relu2(self.bn2(self.fc2(hidden)))
        return self.fc3(hidden)

# Show the shape of the training feature matrix as a sanity check, then build
# the network with one input unit per feature column.
print(X_train.shape)
model = MLPModel(input_size=X_train.shape[1])

In [None]:
import torch.optim as optim

# Objective: L1 loss (mean absolute error between prediction and target).
criterion = nn.L1Loss()
# Optimiser: Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Training function
def train_model(model, train_loader, val_loader, epochs=5000,
                loss_fn=None, opt=None, log_every=100):
    """Train ``model`` and periodically print train/validation loss.

    Parameters
    ----------
    model : nn.Module
        Network to optimise in place.
    train_loader, val_loader : iterable of ``(X_batch, y_batch)``
        Batches used for the optimisation step and for validation.
    epochs : int
        Number of passes over ``train_loader``.
    loss_fn : callable, optional
        Loss taking ``(prediction, target)``. Defaults to the notebook-level
        ``criterion``. The reported epoch averages assume the loss returns a
        per-batch mean.
    opt : torch.optim.Optimizer, optional
        Optimiser to step. Defaults to the notebook-level ``optimizer``.
    log_every : int
        Print every ``log_every`` epochs (starting with the first); the final
        epoch is always printed. A falsy value prints only the final epoch.

    Returns
    -------
    None
    """
    # Fall back to the notebook globals so existing calls keep working.
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt

    for epoch in range(epochs):
        # ---- optimisation pass ----
        model.train()
        train_total, train_count = 0.0, 0
        for X_batch, y_batch in train_loader:
            opt.zero_grad()
            outputs = model(X_batch)
            # squeeze(-1) drops only the trailing feature axis. A bare squeeze()
            # would also drop the batch axis when a batch holds one sample,
            # leaving a 0-d prediction that is broadcast against the target.
            loss = loss_fn(outputs.squeeze(-1), y_batch)
            loss.backward()
            opt.step()
            # Weight by batch size so a short final batch does not skew the mean.
            n_batch = y_batch.shape[0]
            train_total += loss.item() * n_batch
            train_count += n_batch

        # ---- validation pass (no gradients, BatchNorm in eval mode) ----
        model.eval()
        val_total, val_count = 0.0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                outputs = model(X_batch)
                loss = loss_fn(outputs.squeeze(-1), y_batch)
                n_batch = y_batch.shape[0]
                val_total += loss.item() * n_batch
                val_count += n_batch

        is_last_epoch = epoch == epochs - 1
        if is_last_epoch or (log_every and epoch % log_every == 0):
            # Empty loaders report NaN instead of dividing by zero.
            train_mean = train_total / train_count if train_count else float("nan")
            val_mean = val_total / val_count if val_count else float("nan")
            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_mean}, Validation Loss: {val_mean}")

# Run the full training schedule on the prepared loaders.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=5000,
)


Epoch 1/5000, Train Loss: 4633887.947368421, Validation Loss: 4612593.2
Epoch 101/5000, Train Loss: 15084.370579769737, Validation Loss: 18933.8005859375
Epoch 201/5000, Train Loss: 2547.9099185341283, Validation Loss: 3594.0954345703126
Epoch 301/5000, Train Loss: 716.5324530350534, Validation Loss: 948.5962158203125
Epoch 401/5000, Train Loss: 400.37651142321135, Validation Loss: 575.134521484375
Epoch 501/5000, Train Loss: 284.9509711014597, Validation Loss: 413.65491943359376
Epoch 601/5000, Train Loss: 157.120555676912, Validation Loss: 215.35331115722656
Epoch 701/5000, Train Loss: 115.85318153782895, Validation Loss: 136.43557891845703
Epoch 801/5000, Train Loss: 78.71800462823165, Validation Loss: 102.47501983642579
Epoch 901/5000, Train Loss: 79.39923437018143, Validation Loss: 84.50692291259766
Epoch 1001/5000, Train Loss: 61.57915757831774, Validation Loss: 90.80420532226563
Epoch 1101/5000, Train Loss: 59.011202460841126, Validation Loss: 79.94369735717774
Epoch 1201/5000, 

In [None]:
# Convert the scaled test features into a float32 tensor for the model.
test_tensor = torch.tensor(test_scaled, dtype=torch.float32)

# Evaluation mode: BatchNorm layers use their running statistics rather than
# the statistics of the batch being scored.
model.eval()

# Inference only, so autograd tracking is switched off. The model is trained on
# raw DIC values, so its output is already in DIC units and needs no inverse
# transform. squeeze(-1) turns the (n, 1) output into a length-n vector.
with torch.no_grad():
    predictions = model(test_tensor).squeeze(-1).numpy()

# Prepare submission: one row per test sample.
# NOTE(review): ids are generated sequentially from 1455 on the assumption that
# the test rows are in id order starting at 1455 - confirm against the ids in
# sample_submission.csv.
submission = pd.DataFrame({"id": range(1455, 1455 + len(predictions)), "DIC": predictions})
submission.to_csv("submission.csv", index=False)

In [None]:
# score: 5.98852
# name: 杉浦孔明
# id: 123456

Python 3.10.14
