In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import glob

In [2]:


# load and merge data
def load_and_merge_data(train_path):
    all_files = glob.glob(str(Path(train_path) / "*.csv"))
    df_list = []

    for file in all_files:
        df = pd.read_csv(file)
        df_list.append(df)

    merged_df = pd.concat(df_list, ignore_index=True)
    return merged_df

# handle missing values
def handle_missing_values(df):
    # numeric features
    numeric_features = [
        'temperature',
        'wind_speed',
        'visibility',
        'cloud_ceiling',
        'wind_direction',
        'wind_gust',
        'precip',
        'arrival_count'
    ]

    # boolean features
    boolean_features = [
        'cloud_BK',
        'cloud_CL',
        'cloud_FW',
        'cloud_OV',
        'cloud_SC',
        'lightning_prob_H',
        'lightning_prob_L',
        'lightning_prob_M',
        'lightning_prob_N'
    ]

    # fill numeric features with median
    for feature in numeric_features:
        if feature in df.columns:
            df[feature] = df[feature].fillna(df[feature].median())

    # fill boolean features
    for feature in boolean_features:
        if feature in df.columns:
            df[feature] = df[feature].fillna(False)

    return df

# prepare dataset
def prepare_dataset(data_path, output_dir='processed_data', output_file='processed_train.csv'):
    # load and merge data
    print("loading data...")
    df = load_and_merge_data(data_path)

    # select specified features
    weather_features = [
        'temperature',
        'wind_speed',
        'visibility',
        'cloud_ceiling',
        'wind_direction',
        'wind_gust',
        'precip',
        'cloud_BK',
        'cloud_CL',
        'cloud_FW',
        'cloud_OV',
        'cloud_SC',
        'lightning_prob_H',
        'lightning_prob_L',
        'lightning_prob_M',
        'lightning_prob_N'
    ]

    # prepare final dataset
    X = df[['timestamp_15mins', 'airport_id'] + weather_features]
    y = df['arrival_count']

    # create output dir
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # combine features and labels into dataframe
    data = pd.concat([X, pd.Series(y, name='arrival_count')], axis=1)

    # handle missing values
    data = handle_missing_values(data)

    # save processed data
    data.to_csv(output_path / output_file, index=False)
    print(f"\nprocessed data:")
    print(f"shape: {data.shape}")
    print(f"columns: {list(data.columns)}")
    print(f"\ndata saved to: {output_path / output_file}")

train_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/train"
test_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/test"
prepare_dataset(data_path=train_path, output_dir='processed_data', output_file='processed_train.csv')
prepare_dataset(data_path=test_path, output_dir='processed_data', output_file='processed_test.csv')


loading data...


ValueError: No objects to concatenate

In [64]:
import torch
import torch.nn as nn

class AirportNet(nn.Module):
    def __init__(self, input_dim):
        super(AirportNet, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.2),

            nn.Linear(256, 128),
            nn.ReLU(),
            #nn.BatchNorm1d(128),
            #nn.Dropout(0.1),

            nn.Linear(128, 64),
            nn.ReLU(),
            #nn.BatchNorm1d(64),
            #nn.Dropout(0.1),

            nn.Linear(64, 32),
            nn.ReLU(),
            #nn.BatchNorm1d(32),
            #nn.Dropout(0.1),

            nn.Linear(32, 1)
        )

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight)
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        return self.model(x).squeeze()


In [33]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
from sklearn.preprocessing import StandardScaler

class AirportDataset(Dataset):
    def __init__(self, data_path, feature_scaler=None, target_scaler=None, is_train=True):
        # read data with dtype specification
        dtype_dict = {
            # 'precip': 'float64',
            'timestamp_15mins': str,
            'airport_id': str,
            'arrival_count': 'float64'
        }
        self.df = pd.read_csv(data_path, dtype=dtype_dict)

        self.df['precip'] = pd.to_numeric(self.df['precip'], errors='coerce')
        self.df['precip'] = self.df['precip'].fillna(0)

        # separate features and target
        feature_cols = [col for col in self.df.columns if col not in ['timestamp_15mins', 'airport_id', 'arrival_count']]

        # convert bool to int
        for col in feature_cols:
            if self.df[col].dtype == bool:
                self.df[col] = self.df[col].astype(int)
            elif self.df[col].dtype == object:
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce')

        # ensure precip column is properly handled
        self.df['precip'] = pd.to_numeric(self.df['precip'], errors='coerce')
        self.df['precip'] = self.df['precip'].fillna(0)

        # feature standardization
        if is_train:
            self.feature_scaler = StandardScaler()
            features_normalized = self.feature_scaler.fit_transform(self.df[feature_cols])
            self.target_scaler = StandardScaler()
            targets_normalized = self.target_scaler.fit_transform(self.df[['arrival_count']])
        else:
            self.feature_scaler = feature_scaler
            self.target_scaler = target_scaler
            features_normalized = self.feature_scaler.transform(self.df[feature_cols])
            targets_normalized = self.target_scaler.transform(self.df[['arrival_count']])

        # convert to tensor
        self.X = torch.FloatTensor(features_normalized)
        self.y = torch.FloatTensor(targets_normalized.squeeze())

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# create data loaders
def get_data_loaders(train_path, test_path, batch_size=64, num_workers=0, val_ratio=0.2):
    # create train set and get scalers
    full_train_dataset = AirportDataset(train_path, is_train=True)
    feature_scaler = full_train_dataset.feature_scaler
    target_scaler = full_train_dataset.target_scaler

    # create test set using train scalers
    test_dataset = AirportDataset(
        test_path,
        feature_scaler=feature_scaler,
        target_scaler=target_scaler,
        is_train=False
    )

    # calculate sizes for train and val sets
    val_size = int(len(full_train_dataset) * val_ratio)
    train_size = len(full_train_dataset) - val_size

    # split train set into train and val
    train_dataset, val_dataset = random_split(
        full_train_dataset,
        [train_size, val_size],
        generator=torch.Generator().manual_seed(42)
    )

    # create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers
    )

    print(f"\nDataset Info:")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    print(f"Testing samples: {len(test_dataset)}")
    print(f"Feature dimension: {full_train_dataset.X.shape[1]}")
    return train_loader, val_loader, test_loader, target_scaler


In [65]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import time
from pathlib import Path

class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        return torch.sqrt(self.mse(yhat, y))

# train the model
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler,
                num_epochs=100, device='cuda', save_dir='checkpoints'):
    save_path = Path(save_dir)
    save_path.mkdir(parents=True, exist_ok=True)

    best_val_loss = float('inf')
    patience = 10  # early stopping patience
    no_improve = 0  # epochs without improvement

    for epoch in range(num_epochs):
        # training phase
        model.train()
        train_loss = 0
        start_time = time.time()

        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()

            # gradient clipping to prevent explosion
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)

        # validation phase
        model.eval()
        val_loss = 0

        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        # adjust learning rate
        scheduler.step(val_loss)

        # print training info
        epoch_time = time.time() - start_time
        print(f'Epoch [{epoch+1}/{num_epochs}] - {epoch_time:.2f}s')
        print(f'Train RMSE: {train_loss:.4f}, Val RMSE: {val_loss:.4f}')
        print(f'Learning Rate: {optimizer.param_groups[0]["lr"]:.6f}')

        # save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improve = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,
            }, save_path / 'best_model.pth')
            print(f'Saved best model with val_loss: {val_loss:.4f}')
        else:
            no_improve += 1

        # early stopping check
        if no_improve >= patience:
            print(f'\nEarly stopping after {patience} epochs without improvement')
            break

        print()

# main training script
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')

    # load data
    train_path = "processed_data/processed_train.csv"
    test_path = "processed_data/processed_test.csv"
    train_loader, val_loader, _, _ = get_data_loaders(
        train_path,
        test_path,
        batch_size=512,
        val_ratio=0.2
    )

    # create model
    input_dim = train_loader.dataset.dataset.X.shape[1]
    model = AirportNet(input_dim=input_dim).to(device)

    # setup loss and optimizer
    criterion = RMSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

    # add learning rate scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.1,
        patience=5,
        min_lr=0.00001,
        verbose=True
    )

    # train the model
    train_model(
        model,
        train_loader,
        val_loader,
        criterion,
        optimizer,
        scheduler,
        num_epochs=500,
        device=device
    )


main()

Using device: cuda


  self.df = pd.read_csv(data_path, dtype=dtype_dict)



Dataset Info:
Training samples: 216504
Validation samples: 54126
Testing samples: 85360
Feature dimension: 16




Epoch [1/500] - 3.15s
Train RMSE: 1.0020, Val RMSE: 0.9833
Learning Rate: 0.001000
Saved best model with val_loss: 0.9833

Epoch [2/500] - 2.94s
Train RMSE: 0.9941, Val RMSE: 0.9826
Learning Rate: 0.001000
Saved best model with val_loss: 0.9826

Epoch [3/500] - 2.92s
Train RMSE: 0.9916, Val RMSE: 0.9805
Learning Rate: 0.001000
Saved best model with val_loss: 0.9805

Epoch [4/500] - 3.12s
Train RMSE: 0.9892, Val RMSE: 0.9771
Learning Rate: 0.001000
Saved best model with val_loss: 0.9771

Epoch [5/500] - 2.97s
Train RMSE: 0.9873, Val RMSE: 0.9758
Learning Rate: 0.001000
Saved best model with val_loss: 0.9758

Epoch [6/500] - 2.93s
Train RMSE: 0.9856, Val RMSE: 0.9732
Learning Rate: 0.001000
Saved best model with val_loss: 0.9732

Epoch [7/500] - 2.92s
Train RMSE: 0.9829, Val RMSE: 0.9721
Learning Rate: 0.001000
Saved best model with val_loss: 0.9721

Epoch [8/500] - 3.12s
Train RMSE: 0.9815, Val RMSE: 0.9710
Learning Rate: 0.001000
Saved best model with val_loss: 0.9710

Epoch [9/500] - 

In [67]:
from pathlib import Path

# test model and generate submission format results
def test_model(model, test_loader, target_scaler, device='cuda'):
    model.eval()
    all_outputs = []
    all_targets = []
    timestamps = []
    airport_ids = []

    with torch.no_grad():
        for i, (batch_X, batch_y) in enumerate(test_loader):
            # get original data
            start_idx = i * test_loader.batch_size
            end_idx = min((i + 1) * test_loader.batch_size, len(test_loader.dataset))

            batch_timestamps = test_loader.dataset.df['timestamp_15mins'].values[start_idx:end_idx]
            batch_airports = test_loader.dataset.df['airport_id'].values[start_idx:end_idx]

            # get predictions (normalized space)
            batch_X = batch_X.to(device)
            outputs = model(batch_X)

            # collect predictions and targets in normalized space
            all_outputs.append(outputs.cpu())
            all_targets.append(batch_y)

            # save timestamps and airport IDs
            timestamps.extend(batch_timestamps)
            airport_ids.extend(batch_airports)

    # calculate RMSE
    all_outputs = torch.cat(all_outputs, dim=0).numpy()
    all_targets = torch.cat(all_targets, dim=0).numpy()

    normalized_mse = np.mean((all_outputs - all_targets) ** 2)
    normalized_rmse = np.sqrt(normalized_mse)

    print(f"\nResults:")
    print(f"RMSE: {normalized_rmse:.4f}")

    # convert predictions to original space for submission
    predictions = target_scaler.inverse_transform(all_outputs.reshape(-1, 1))
    predictions = predictions.squeeze()

    # ensure all arrays have same length
    assert len(predictions) == len(timestamps) == len(airport_ids), \
        f"Length mismatch: predictions({len(predictions)}), timestamps({len(timestamps)}), airport_ids({len(airport_ids)})"

    # create submission format dataframe
    results_df = pd.DataFrame({
        'timestamp_15mins': timestamps,
        'airport_id': airport_ids,
        'prediction': predictions
    })

    # create id column
    results_df['ID'] = results_df.apply(
        lambda row: f"{row['airport_id']}_{pd.to_datetime(row['timestamp_15mins']).strftime('%y%m%d_%H%M')}_15",
        axis=1
    )

    # prep final submission format
    submission_df = pd.DataFrame({
        'ID': results_df['ID'],
        'Value': results_df['prediction'].round().clip(0)
    })

    # save results
    submission_df.to_csv('result.csv', index=False)
    print("\nResults saved to result.csv")


def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')

    train_path = "processed_data/processed_train.csv"
    test_path = "processed_data/processed_test.csv"
    _, _, test_loader, target_scaler = get_data_loaders(train_path, test_path, batch_size=64)

    checkpoint_path = Path('checkpoints/best_model.pth')

    input_dim = test_loader.dataset.X.shape[1]
    model = AirportNet(input_dim=input_dim).to(device)

    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded model from epoch {checkpoint['epoch']} with validation loss: {checkpoint['val_loss']:.4f}")

    test_model(model, test_loader, target_scaler, device)


main()

Using device: cuda


  self.df = pd.read_csv(data_path, dtype=dtype_dict)



Dataset Info:
Training samples: 216504
Validation samples: 54126
Testing samples: 85360
Feature dimension: 16
Loaded model from epoch 92 with validation loss: 0.9383


  checkpoint = torch.load(checkpoint_path)



Results:
RMSE: 0.9679

Results saved to result.csv
