# LSTM-Model Basic

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from torch.utils.data import Dataset
from torchmetrics import Accuracy
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from lstm_utils.sequence_model import SequenceModel
from lstm_utils.species_dataset import SpeciesDataset
from lstm_utils.data_loader import DataLoader

In [2]:
# Seed the random number generators through Lightning's helper so runs are repeatable.
pl.seed_everything(42)

# Use CUDA when PyTorch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


Seed set to 42


Using device: cpu


In [3]:
def get_ids_with_most_common_sample_count(df, id_column="id"):
    """Return the ids whose row count equals the most frequent row count.

    Rows are counted per distinct value of ``id_column``. The row count that
    is shared by the largest number of ids is printed, and every id having
    exactly that many rows is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``id_column``.
    id_column : str, default "id"
        Column whose values identify a group of rows.

    Returns
    -------
    list
        The matching ids, in the order they appear in the per-id count table.
    """
    rows_per_id = df[id_column].value_counts()
    # Count the counts: index = number of rows, value = how many ids have that many.
    ids_per_row_count = rows_per_id.value_counts()
    most_common_count = ids_per_row_count.idxmax()
    has_modal_count = rows_per_id == most_common_count
    selected_ids = rows_per_id.loc[has_modal_count].index.tolist()
    print(f"Most common sample count: {most_common_count}")
    return selected_ids


def df_to_sequences(df, features=None, label=None, id_column="id"):
    """Split a long-format frame into one ``(X, y)`` pair per id.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``id_column``, the feature columns and the label column.
    features : list of str, optional
        Columns that make up ``X``. When omitted, the notebook-level
        ``feature_columns`` is used, so it must be defined before the call.
    label : str, optional
        Column holding the label. When omitted, the notebook-level
        ``label_column`` is used, so it must be defined before the call.
    id_column : str, default "id"
        Column whose values identify one sequence.

    Returns
    -------
    list of tuple
        One ``(X, y)`` per id in ``groupby`` order, where ``X`` is the group's
        feature sub-frame and ``y`` is the label value of the group's first row.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    features = feature_columns if features is None else features
    label = label_column if label is None else label
    return [
        # Only the first row's label is read; presumably it is constant per id — verify.
        (group[features], group[label].iloc[0])
        for _, group in df.groupby(id_column)
    ]


In [4]:
# Relative locations of the three pre-split CSV files.
train_path = "../../../data/baseline_training/trainset.csv"
val_path = "../../../data/baseline_training/valset.csv"
test_path = "../../../data/baseline_training/testset.csv"

# Load and transform the splits with one shared loader, in train / val / test order.
loader = DataLoader()
train_df, val_df, test_df = (
    loader.load_transform(csv_path)
    for csv_path in (train_path, val_path, test_path)
)

# Per split, collect the ids whose row count is that split's most common one.
train_ids, val_ids, test_ids = (
    set(get_ids_with_most_common_sample_count(split_frame))
    for split_frame in (train_df, val_df, test_df)
)

# Keep only the rows belonging to those ids; .copy() yields independent frames.
train_df = train_df.loc[train_df["id"].isin(train_ids)].copy()
val_df = val_df.loc[val_df["id"].isin(val_ids)].copy()
test_df = test_df.loc[test_df["id"].isin(test_ids)].copy()

# Shapes after filtering.
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

Most common sample count: 166
Most common sample count: 164
Most common sample count: 154
(46812, 15)
(8528, 15)
(14168, 15)


In [5]:
# Model inputs: every column except the identifier, time, target and disturbance year.
label_column = "species"
feature_columns = [
    name
    for name in train_df.columns
    if name not in ("id", "time", "species", "disturbance_year")
]

# Fit the encoder on the training labels only, then apply it to each split in place.
le = LabelEncoder().fit(train_df[label_column])
for split_df in (train_df, val_df, test_df):
    split_df[label_column] = le.transform(split_df[label_column])

n_classes = len(le.classes_)

# One (features, label) pair per id for each split.
train_sequences, val_sequences, test_sequences = map(
    df_to_sequences, (train_df, val_df, test_df)
)
train_sequences[0]

(        doy     b2     b3     b4     b5      b6      b7      b8     b8a  \
 6687   29.0   92.0  213.0  143.0  344.0   977.0  1210.0  1330.0  1420.0   
 6688   49.0  206.0  233.0  188.0  378.0   955.0  1168.0  1392.0  1478.0   
 6689   74.0  195.0  249.0  229.0  474.0  1097.0  1391.0  1529.0  1579.0   
 6690   89.0  135.0  196.0  151.0  347.0   980.0  1200.0  1434.0  1505.0   
 6691  150.0  120.0  309.0  145.0  509.0  1730.0  2080.0  2084.0  2288.0   
 ...     ...    ...    ...    ...    ...     ...     ...     ...     ...   
 6848  227.0  142.0  269.0  178.0  483.0  1630.0  2269.0  2340.0  2620.0   
 6849  245.0  253.0  361.0  269.0  473.0  1515.0  1936.0  2015.0  2177.0   
 6850  272.0  158.0  233.0  168.0  426.0  1555.0  1942.0  2053.0  2311.0   
 6851  290.0  236.0  366.0  257.0  547.0  1430.0  1768.0  2155.0  2223.0   
 6852  302.0  371.0  549.0  431.0  730.0  1451.0  1712.0  2010.0  2122.0   
 
          b11    b12  
 6687   531.0  306.0  
 6688   637.0  284.0  
 6689   864.0  46

In [6]:
class SpeciesDataModule(pl.LightningDataModule):
    """LightningDataModule serving the train / validation / test sequence lists.

    Parameters
    ----------
    train_sequences, val_sequences, test_sequences
        Per-split sequence collections; each is passed unchanged to
        ``SpeciesDataset`` in :meth:`setup`.
    batch_size : int, default 64
        Batch size used by all three DataLoaders.
    num_workers : int, default 4
        Worker processes per DataLoader. ``0`` loads data in the main process.
    """

    def __init__(
        self,
        train_sequences,
        val_sequences,
        test_sequences,
        batch_size=64,
        num_workers=4,
    ):
        super().__init__()
        self.train_sequences = train_sequences
        self.val_sequences = val_sequences
        self.test_sequences = test_sequences
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Wrap every split in a ``SpeciesDataset`` (done for all splits, whatever ``stage`` is)."""
        self.train_dataset = SpeciesDataset(self.train_sequences)
        self.val_dataset = SpeciesDataset(self.val_sequences)
        self.test_dataset = SpeciesDataset(self.test_sequences)

    def _make_loader(self, dataset, shuffle):
        """Build a torch DataLoader for ``dataset`` with the module's shared settings."""
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            # DataLoader rejects persistent_workers=True when there are no workers.
            persistent_workers=self.num_workers > 0,
        )

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the validation dataset."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Unshuffled loader over the test dataset."""
        return self._make_loader(self.test_dataset, shuffle=False)

In [7]:
class SpeciesPredictor(pl.LightningModule):
    """LightningModule that trains a ``SequenceModel`` with cross-entropy loss.

    Parameters
    ----------
    n_features : int
        Forwarded to ``SequenceModel`` as its first argument.
    n_classes : int
        Forwarded to ``SequenceModel`` and used as ``num_classes`` of the
        accuracy metrics.
    lr : float, default 1e-4
        Learning rate of the Adam optimizer.
    """

    def __init__(self, n_features, n_classes, lr=1e-4):
        super().__init__()
        self.save_hyperparameters()
        self.model = SequenceModel(n_features, n_classes)
        self.criterion = nn.CrossEntropyLoss()
        self.lr = lr
        # A separate accuracy metric for each stage.
        self.train_acc = Accuracy(task="multiclass", num_classes=n_classes)
        self.val_acc = Accuracy(task="multiclass", num_classes=n_classes)
        self.test_acc = Accuracy(task="multiclass", num_classes=n_classes)

    def forward(self, x):
        """Delegate to the wrapped sequence model."""
        return self.model(x)

    def _run_batch(self, batch, metric, loss_name, acc_name):
        """Score one batch, log loss and accuracy under the given names, return the loss."""
        inputs = batch["sequence"]
        targets = batch["label"]
        logits = self(inputs)
        loss = self.criterion(logits, targets)
        # Predicted class = index of the largest output along dim 1.
        predicted = logits.argmax(dim=1)
        batch_accuracy = metric(predicted, targets)
        self.log(loss_name, loss, prog_bar=True)
        self.log(acc_name, batch_accuracy, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` / ``train_acc`` and return the loss for backpropagation."""
        return self._run_batch(batch, self.train_acc, "train_loss", "train_acc")

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` / ``val_acc``; nothing is returned."""
        self._run_batch(batch, self.val_acc, "val_loss", "val_acc")

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` / ``test_acc``; nothing is returned."""
        self._run_batch(batch, self.test_acc, "test_loss", "test_acc")

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        return optim.Adam(self.parameters(), lr=self.lr)


In [8]:
# Load the TensorBoard notebook extension, then open TensorBoard on ./lightning_logs
# (presumably the directory the Trainer in the next cell logs to — verify).
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [9]:

# Run configuration: dimensions are derived from the prepared data, the rest is set by hand.
n_features = len(feature_columns)
n_classes = len(set(train_df[label_column]))
batch_size, lr, max_epochs = 2, 1e-3, 2

# Data module and Lightning model
data_module = SpeciesDataModule(
    train_sequences=train_sequences,
    val_sequences=val_sequences,
    test_sequences=test_sequences,
    batch_size=batch_size,
)
model = SpeciesPredictor(n_features, n_classes, lr=lr)

# Single-device trainer; the GPU accelerator is chosen only when `device` is "cuda".
trainer = pl.Trainer(
    accelerator="cpu" if device != "cuda" else "gpu",
    devices=1,
    max_epochs=max_epochs,
    log_every_n_steps=10,
    deterministic=True,
)

trainer.fit(model, datamodule=data_module)


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
2025-10-09 09:53:50.379211: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | model     | SequenceModel      | 277 K  | train
1 | criterion | CrossEntropyLoss   | 0      | train
2 | train_acc | MulticlassAccuracy | 0      | train
3 | val_acc   | MulticlassAccuracy | 0      | train
4 | test_acc  | MulticlassAccuracy | 0      | train
-

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


In [10]:
from sklearn.metrics import accuracy_score
from collections import Counter
import torch


def weighted_accuracy(y_true, y_pred):
    """Accuracy in which each sample is weighted by 1 / (size of its true class).

    Parameters
    ----------
    y_true : sequence
        Ground-truth labels.
    y_pred : sequence
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    float
        ``accuracy_score`` computed with the inverse-class-frequency sample weights.
    """
    occurrences = Counter(y_true)
    sample_weights = []
    for true_label in y_true:
        # Every class contributes a total weight of 1, regardless of its size.
        sample_weights.append(1.0 / occurrences[true_label])
    return accuracy_score(y_true, y_pred, sample_weight=sample_weights)



def get_predictions(model, dataloader, device="cpu"):
    """Run ``model`` over ``dataloader`` and collect labels and argmax predictions.

    Parameters
    ----------
    model : torch.nn.Module
        Model to evaluate. It is moved to ``device`` and switched to eval mode;
        both changes remain in effect after the call.
    dataloader : iterable
        Yields dict batches with a ``"sequence"`` tensor and a ``"label"`` tensor.
    device : str or torch.device, default "cpu"
        Device on which the forward pass runs.

    Returns
    -------
    tuple of (list, list)
        ``(labels, predictions)`` as plain Python lists in iteration order.
    """
    # Batches are moved to `device`, so the model has to be there too; otherwise
    # the forward pass fails with a device mismatch when the two differ.
    model.to(device)
    model.eval()
    labels = []
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            outputs = model(batch["sequence"].to(device))
            # Predicted class = index of the largest output along dim 1.
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            # Labels are only collected, never fed to the model: no device transfer needed.
            labels.extend(batch["label"].tolist())
    return labels, predictions


# Score every split with the class-weighted accuracy, in train / validation / test order.
# NOTE(review): under this metric a model that always predicts the same class scores
# 1 / (number of classes present), so identical values on all splits deserve a closer look.

# Training split
train_labels, train_preds = get_predictions(
    model=model, dataloader=data_module.train_dataloader(), device=device
)
train_weighted_acc = weighted_accuracy(train_labels, train_preds)

# Validation split
val_labels, val_preds = get_predictions(
    model=model, dataloader=data_module.val_dataloader(), device=device
)
val_weighted_acc = weighted_accuracy(val_labels, val_preds)

# Test split
test_labels, test_preds = get_predictions(
    model=model, dataloader=data_module.test_dataloader(), device=device
)
test_weighted_acc = weighted_accuracy(test_labels, test_preds)

# Report with four decimals.
print(f"Weighted Accuracy (Train): {train_weighted_acc:.4f}")
print(f"Weighted Accuracy (Validation): {val_weighted_acc:.4f}")
print(f"Weighted Accuracy (Test): {test_weighted_acc:.4f}")


Weighted Accuracy (Train): 0.1429
Weighted Accuracy (Validation): 0.1429
Weighted Accuracy (Test): 0.1429
