In [4]:
from configs.model import InputsConfig
from modeling.category_single import POIDataset
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.model_selection import StratifiedKFold
from configs.globals import DEVICE


# --- load the pre-processed windows and derive integer class labels ---
df = pd.read_csv('/Users/vitor/Desktop/mestrado/ingred/data/output/florida_test/pre-processing/next-input.csv')
df['y'] = df['next_category'].astype('category').cat.codes
df = df.drop(columns=['userid', 'next_category'])
# The first 9 * EMBEDDING_DIM columns hold the flattened window features.
feature_cols = df.columns[:InputsConfig.EMBEDDING_DIM * 9]
df['x'] = df[feature_cols].values.tolist()

# Plain Python lists, as handed to the splitter.
X = df['x'].tolist()
y = df['y'].tolist()

# Array views of the same data so each fold can be sliced with fancy indexing.
X_all = np.array(X)
y_all = np.array(y)

# --- stratified 5-fold split with a fixed seed ---
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

folds = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), start=1):
    # Select this fold's rows.
    X_train, y_train = X_all[train_idx], y_all[train_idx]
    X_val, y_val = X_all[val_idx], y_all[val_idx]

    # Un-flatten every row into (9 windows, EMBEDDING_DIM features).
    X_train = X_train.reshape(len(X_train), 9, InputsConfig.EMBEDDING_DIM)
    X_val = X_val.reshape(len(X_val), 9, InputsConfig.EMBEDDING_DIM)

    # Wrap in datasets, then in loaders (only the training loader shuffles).
    train_ds = POIDataset(X_train, y_train)
    val_ds = POIDataset(X_val, y_val)
    train_loader = DataLoader(train_ds, batch_size=512, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=512, shuffle=False, num_workers=0)

    folds.append((train_loader, val_loader))

    print(f"Fold {fold}: train={len(train_ds)}  val={len(val_ds)}")

Fold 1: train=13093  val=3274
Fold 2: train=13093  val=3274
Fold 3: train=13094  val=3273
Fold 4: train=13094  val=3273
Fold 5: train=13094  val=3273


In [8]:
# Sentinel feature value for padding: the NextHead models in this notebook treat a
# sequence position as padding when every feature at that position equals this value.
PAD_VALUE = -999

In [17]:
import json



import torch.nn as nn
from sklearn.metrics import classification_report
import torch.nn.functional as F



class WindowClassifierWithTransformer(nn.Module):
    """Transformer encoder over a fixed number of windows, reduced to one prediction.

    Each window vector is linearly embedded, a learned positional parameter is
    added, and the sequence is passed through an ``nn.TransformerEncoder``.
    Every window is then classified independently; the per-window scores are
    flattened and a final linear layer reduces them to ``num_classes`` scores.

    Args:
        input_dim: size of each window's feature vector.
        hidden_dim: transformer model dimension (must be divisible by ``nhead``).
        num_classes: number of output classes.
        num_layers: number of stacked encoder layers.
        nhead: number of attention heads per layer.
        dropout: dropout probability inside the encoder layers.
        num_windows: number of windows per sample; inputs must have exactly
            this many positions because the reduction layer is sized for it.
    """

    def __init__(self, input_dim=100, hidden_dim=64, num_classes=7,
                 num_layers=2, nhead=8, dropout=0.1, num_windows=9):
        super().__init__()

        self.embedding = nn.Linear(input_dim, hidden_dim)

        # Learned (randomly initialised) positional term, broadcast over the batch.
        self.positional_encoding = nn.Parameter(torch.randn(1, num_windows, hidden_dim))

        # Encoder layers are built without batch_first, so they expect (seq, batch, dim).
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.classifier = nn.Linear(hidden_dim, num_classes)

        self.reduction_layer = nn.Linear(num_windows * num_classes, num_classes)

    def forward(self, x):
        """Score a batch of window sequences.

        Args:
            x: tensor of shape (batch, num_windows, input_dim).

        Returns:
            Raw, unnormalised logits of shape (batch, num_classes). No softmax is
            applied: ``nn.CrossEntropyLoss`` performs log-softmax internally, so
            feeding it probabilities would squash the gradients. Call
            ``torch.softmax(logits, dim=-1)`` on the result when probabilities
            are needed; the argmax is the same either way.
        """
        x = self.embedding(x)
        x = x + self.positional_encoding

        # (batch, seq, dim) -> (seq, batch, dim) for the encoder, then back again.
        x = x.transpose(0, 1)
        x = self.transformer_encoder(x)
        x = x.transpose(0, 1)

        logits = self.classifier(x)  # [batch_size, num_windows, num_classes]

        flattened = logits.reshape(logits.size(0), logits.size(1) * logits.size(2))

        return self.reduction_layer(flattened)  # [batch_size, num_classes]

In [4]:

import torch.nn as nn


class RepeatedTransformerClassifier(nn.Module):
    """Stack of pre-norm transformer encoder layers followed by mean pooling.

    Args:
        input_dim: feature size of each input step.
        d_model: model dimension the input is projected to.
        nhead: attention heads per encoder layer.
        num_layers: how many encoder layers are applied one after another.
        num_classes: number of output logits.
        dropout: dropout probability inside each encoder layer.
    """

    def __init__(self,
                 input_dim=107,
                 d_model=128,
                 nhead=8,
                 num_layers=6,
                 num_classes=7,
                 dropout=0.1):
        super().__init__()

        def build_layer():
            # Pre-norm encoder layer with a 4x wide feed-forward block.
            return nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=d_model * 4,
                dropout=dropout,
                norm_first=True,
            )

        # Input projection up to the model dimension.
        self.input_proj = nn.Linear(input_dim, d_model)
        # Positional encoding, sized for sequences of at most 9 steps.
        self.pos_enc = PositionalEncoding(d_model, max_len=9)
        # Explicit list of independent encoder layers.
        self.layers = nn.ModuleList([build_layer() for _ in range(num_layers)])
        # LayerNorm + linear classification head.
        self.classifier = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, num_classes),
        )

    def forward(self, x):
        """
        x: (batch, seq_len, input_dim)
        returns: logits of shape (batch, num_classes)
        """
        # Project, then switch to the (seq, batch, d_model) layout the layers expect.
        hidden = self.input_proj(x).transpose(0, 1)
        hidden = self.pos_enc(hidden)

        # Apply each encoder layer in turn; the layout is unchanged.
        for encoder in self.layers:
            hidden = encoder(hidden)

        # Mean-pool over the sequence axis -> (batch, d_model), then classify.
        pooled = hidden.mean(dim=0)
        return self.classifier(pooled)


class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding for sequence-first tensors.

    Args:
        d_model: feature dimension of the tensors to encode (even or odd).
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine column, so
        # trim div_term to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(1)  # (max_len, 1, d_model)
        # Buffer: moves with the module across devices but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding for the first ``x.size(0)`` positions.

        x: (seq_len, batch, d_model); returns a tensor of the same shape.
        """
        return x + self.pe[:x.size(0)]

In [14]:
import math
import torch
import torch.nn as nn


class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding (batch-first) followed by dropout.

    Args:
        embed_dim: feature dimension of the tensors to encode (even or odd).
        max_seq_length: number of positions to precompute.
        dropout: dropout probability applied after adding the encoding.
    """

    def __init__(self, embed_dim, max_seq_length=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_seq_length, embed_dim)
        pos = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, embed_dim, 2).float() *
                        (-math.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(pos * div)
        # An odd embed_dim has one fewer cosine column than sine column; trimming
        # div keeps the shapes aligned and is a no-op when embed_dim is even.
        pe[:, 1::2] = torch.cos(pos * div[: embed_dim // 2])
        pe = pe.unsqueeze(0)           # (1, max_seq_length, embed_dim)
        # Buffer: follows the module across devices but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding for the first ``x.size(1)`` positions, then apply dropout.

        x: (batch_size, seq_len, embed_dim); returns a tensor of the same shape.
        """
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)


class NextHead(nn.Module):
    """Causal transformer encoder with learned attention pooling over positions.

    Positions whose features all equal ``PAD_VALUE`` are treated as padding:
    they are excluded as attention keys and receive zero pooling weight.

    Args:
        embed_dim: feature size of each position; used directly as the
            transformer model dimension, so it must be divisible by ``num_heads``.
        num_classes: number of output logits.
        num_heads: attention heads per encoder layer.
        seq_length: maximum sequence length (sizes the positional encoding).
        num_layers: number of stacked encoder layers.
        dropout: dropout probability inside the encoder layers.
    """

    def __init__(self,
                 embed_dim: int,
                 num_classes: int,
                 num_heads: int,
                 seq_length: int,
                 num_layers: int,
                 dropout: float = 0.35):
        super().__init__()
        self.embed_dim = embed_dim
        self.seq_length = seq_length
        self.pad_value = PAD_VALUE

        # positional encoding for up to seq_length
        self.pe = PositionalEncoding(embed_dim, max_seq_length=seq_length)

        # pre-norm, batch-first encoder stack
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * 4,
            dropout=dropout,
            batch_first=True,
            norm_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)

        # project each position to a single score (softmax is applied in forward)
        self.score_proj = nn.Linear(embed_dim, 1)

        # final classification
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """
        x: (batch_size, seq_len, embed_dim) with seq_len <= seq_length;
           padded positions are exactly all elements == self.pad_value
        returns: logits of shape (batch_size, num_classes)
        """
        # 1) build padding mask
        #    mask[b, i] = True  =>  position i in batch b is padding
        pad_mask = (x == self.pad_value).all(dim=-1)  # (batch, seq_len)

        # 2) add positional encodings
        x = self.pe(x)  # (batch, seq_len, embed_dim)

        # 3) build a boolean causal mask sized from the actual input length, so
        #    sequences shorter than self.seq_length do not hit a mask/shape mismatch.
        #    True indicates positions that should NOT attend (i.e. upper triangular)
        seq_len = x.size(1)
        causal_mask = torch.triu(
            torch.ones((seq_len, seq_len),
                       dtype=torch.bool,
                       device=x.device),
            diagonal=1
        )
        # NOTE(review): a padded position that precedes every real position has a
        # fully masked attention row; confirm the installed PyTorch does not emit
        # NaNs for that case if left-padding is ever used.
        x = self.transformer(
            x,
            mask=causal_mask,                # dtype=bool
            src_key_padding_mask=pad_mask    # also dtype=bool
        )

        # 4) compute a weight for each position,
        #    masking out padding before the softmax
        scores = self.score_proj(x).squeeze(-1)       # (batch, seq_len)
        scores = scores.masked_fill(pad_mask, float('-inf'))
        attn_weights = torch.softmax(scores, dim=1)   # (batch, seq_len)

        # 5) weighted sum over sequence -> (batch, embed_dim)
        pooled = torch.einsum('bs,bse->be', attn_weights, x)

        # 6) final classification
        out = self.classifier(pooled)                 # (batch, num_classes)
        return out

In [5]:
import math
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Sinusoidal position table added to batch-first inputs, followed by dropout.

    Works for both even and odd ``embed_dim``: sine values fill the even
    feature columns and cosine values fill the odd ones.
    """

    def __init__(self, embed_dim, max_seq_length=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Geometric frequency ladder, one entry per even feature column.
        frequencies = torch.exp(
            torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim)
        )
        steps = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        angles = steps * frequencies  # (max_seq_length, ceil(embed_dim / 2))

        table = torch.zeros(max_seq_length, embed_dim)
        table[:, 0::2] = torch.sin(angles)
        # There are embed_dim // 2 odd columns: one fewer than the even columns
        # when embed_dim is odd, the same number when it is even.
        table[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])

        # Stored with a leading batch axis as a non-trainable buffer.
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):  # x: (batch_size, seq_length, embed_dim)
        encoded = x + self.pe[:, : x.size(1)]
        return self.dropout(encoded)


class NextHead(nn.Module):
    """Causal transformer encoder with an input projection and attention pooling.

    The input is projected to a width divisible by ``num_heads``, encoded with a
    causal, padding-aware transformer, pooled with learned per-position weights
    and mapped to class logits. Positions whose features all equal ``PAD_VALUE``
    are treated as padding.

    Args:
        embed_dim: feature size of each input position.
        num_classes: number of output logits.
        num_heads: attention heads per encoder layer.
        max_seq_length: maximum sequence length (sizes the positional encoding).
        num_layers: number of stacked encoder layers.
        dropout: dropout probability for the positional encoding and encoder layers.
    """

    def __init__(
        self,
        embed_dim=64,
        num_classes=7,
        num_heads=4,
        max_seq_length=9,
        num_layers=2,
        dropout=0.1,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.max_seq_length = max_seq_length

        # Round the hidden size up to the next multiple of num_heads
        self.transformer_dim = num_heads * math.ceil(embed_dim / num_heads)

        # Project input embeddings to transformer dimension
        self.input_proj = nn.Linear(embed_dim, self.transformer_dim)

        # Positional encoding
        self.pos_enc = PositionalEncoding(self.transformer_dim, max_seq_length, dropout)

        # Transformer encoder (pre-norm, batch-first)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.transformer_dim,
            nhead=num_heads,
            dim_feedforward=self.transformer_dim * 4,
            dropout=dropout,
            batch_first=True,
            norm_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)

        # Attention pooling
        self.attn_fc = nn.Linear(self.transformer_dim, 1)

        # Final classifier
        self.output_fc = nn.Linear(self.transformer_dim, num_classes)

    def forward(self, x):
        """
        x: (batch_size, seq_length, embed_dim)
        returns: logits of shape (batch_size, num_classes)
        """
        seq_length = x.size(1)

        # Padding mask: True for positions whose features are all PAD_VALUE
        pad_mask = (x == PAD_VALUE).all(dim=-1)  # (batch_size, seq_length)

        # Project and add positional encoding
        x_proj = self.input_proj(x)
        x_proj = self.pos_enc(x_proj)

        # Boolean causal mask (True = may NOT attend). Using the same dtype as
        # pad_mask avoids PyTorch's deprecated mixing of float and bool masks;
        # the masked positions are the same as with the float -inf form.
        causal_mask = torch.triu(
            torch.ones((seq_length, seq_length), dtype=torch.bool, device=x.device),
            diagonal=1,
        )

        # Apply transformer encoder
        enc_out = self.transformer(
            x_proj,
            mask=causal_mask,
            src_key_padding_mask=pad_mask,
        )  # (batch_size, seq_length, transformer_dim)

        # Compute attention logits and mask padding
        attn_logits = self.attn_fc(enc_out).squeeze(-1)  # (batch_size, seq_length)
        attn_logits = attn_logits.masked_fill(pad_mask, float('-inf'))

        # Normalize to get weights
        attn_weights = torch.softmax(attn_logits, dim=-1)  # (batch_size, seq_length)

        # Weighted sum of sequence outputs
        summary = torch.bmm(attn_weights.unsqueeze(1), enc_out).squeeze(1)  # (batch_size, transformer_dim)

        # Final classification
        out = self.output_fc(summary)
        return out

In [28]:
import torch
import torch.nn as nn

class POICategoryPredictor(nn.Module):
    """LSTM sequence classifier that predicts from the final hidden state.

    Args:
        input_size: feature size of each sequence step.
        hidden_size: number of features in the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
        dropout: dropout between LSTM layers (ignored for a single layer).
    """

    def __init__(
        self,
        input_size: int = 107,
        hidden_size: int = 128,
        num_layers: int = 2,
        num_classes: int = 7,
        dropout: float = 0.3
    ):
        super().__init__()
        # Inter-layer dropout is only meaningful with more than one layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        # Batch-first: tensors are (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        # Single linear head on top of the last layer's final hidden state.
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape (batch_size, num_classes).

        x: tensor of shape (batch_size, seq_len, input_size).
        """
        # Only the final hidden states are needed; per-step outputs and the
        # cell state are discarded.
        _, (final_hidden, _) = self.lstm(x)
        # final_hidden is (num_layers, batch, hidden_size); index -1 is the top layer.
        return self.classifier(final_hidden[-1])

In [12]:
import torch
import torch.nn as nn


class POISequencePredictor(nn.Module):
    """Transformer encoder that predicts a class from the last position's output.

    Args:
        input_dim: feature size of each input position.
        hidden_dim: transformer model dimension (must be divisible by ``num_heads``).
        num_heads: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        num_classes: number of output logits.
        dropout: dropout probability inside the encoder layers.
        max_seq_len: number of learned position embeddings; inputs may be this
            long or shorter. Defaults to 9, the previously hard-coded length.
    """

    def __init__(self, input_dim=107, hidden_dim=128, num_heads=4, num_layers=2, num_classes=7, dropout=0.1,
                 max_seq_len=9):
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes

        # Learned positional embedding, zero-initialised, one row per position
        self.position_embedding = nn.Parameter(torch.zeros(1, max_seq_len, hidden_dim))

        # Project input embeddings to the hidden dimension
        self.input_projection = nn.Linear(input_dim, hidden_dim)

        # Transformer encoder (batch-first)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim * 4,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Final prediction layer
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """
        x: (batch_size, seq_length, input_dim) with seq_length <= max_seq_len
        returns: logits of shape (batch_size, num_classes)
        """
        seq_length = x.size(1)

        # Project input to hidden dimension
        x = self.input_projection(x)  # shape: (batch_size, seq_length, hidden_dim)

        # Add the embeddings of the first seq_length positions, so sequences
        # shorter than max_seq_len are accepted instead of failing to broadcast
        x = x + self.position_embedding[:, :seq_length]

        # Pass through transformer
        x = self.transformer_encoder(x)  # shape: (batch_size, seq_length, hidden_dim)

        # Get the last position's output
        last_hidden = x[:, -1, :]  # shape: (batch_size, hidden_dim)

        # Predict next category
        logits = self.classifier(last_hidden)  # shape: (batch_size, num_classes)

        return logits

In [18]:
from collections import defaultdict
from tqdm.notebook import tqdm

# One constant for the epoch count: the OneCycleLR schedule length and the
# epoch loop must agree, otherwise the scheduler runs past its total_steps.
NUM_EPOCHS = 200

for fold_idx, (train_loader, val_loader) in enumerate(folds):
    # Per-epoch history for this fold (train/val loss and accuracy).
    fold_metrics = defaultdict(list)

    # The other architectures defined in this notebook (POISequencePredictor,
    # NextHead) can be swapped in here.
    model = WindowClassifierWithTransformer(
        input_dim=InputsConfig.EMBEDDING_DIM,
        hidden_dim=128,
        num_classes=7,
        num_layers=2,
        nhead=4,
        dropout=0.1,
        num_windows=9
    )
    model.to(DEVICE)

    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=5e-5,
        eps=1e-3,  # Helps prevent division by zero
        weight_decay=0.01  # Slightly stronger regularization
    )

    # One-cycle schedule stepped once per batch: warmup, then cosine decay.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=2e-4,  # Peak learning rate
        total_steps=NUM_EPOCHS * len(train_loader),  # Total number of training steps
        pct_start=0.1,  # Percentage of steps for warmup
        div_factor=25,  # Initial lr = max_lr/div_factor
        final_div_factor=10000,  # Final lr = max_lr/(div_factor*final_div_factor)
        anneal_strategy='cos'  # Cosine annealing
    )

    best_val_acc = 0.0
    epoch_progress = tqdm(range(NUM_EPOCHS), desc=f"Fold {fold_idx}")

    for epoch in epoch_progress:
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        train_acc = 0.0
        train_total = 0

        for X_batch, y_batch in train_loader:
            x_next = X_batch.to(DEVICE, non_blocking=True)
            y_next = y_batch.to(DEVICE, non_blocking=True)

            optimizer.zero_grad()
            out_a = model(x_next)
            loss = criterion(out_a, y_next)
            loss.backward()

            # Clip the global gradient norm before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            _, predicted = torch.max(out_a, 1)
            correct = (predicted == y_next).sum().item()
            total = y_next.size(0)

            # Weight the batch loss by batch size so the epoch value is a per-sample mean.
            train_loss += loss.item() * total
            train_acc += correct
            train_total += total

        epoch_train_loss = train_loss / train_total
        epoch_train_acc = train_acc / train_total

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        val_acc = 0.0
        val_total = 0

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                x_next = X_batch.to(DEVICE, non_blocking=True)
                y_next = y_batch.to(DEVICE, non_blocking=True)

                out_a = model(x_next)
                loss = criterion(out_a, y_next)

                _, predicted = torch.max(out_a, 1)
                correct = (predicted == y_next).sum().item()
                total = y_next.size(0)

                val_loss += loss.item() * total
                val_acc += correct
                val_total += total

        epoch_val_loss = val_loss / val_total
        epoch_val_acc = val_acc / val_total

        # Track the best validation accuracy seen so far; without this update
        # the summary printed after the fold would always report 0.0000.
        best_val_acc = max(best_val_acc, epoch_val_acc)

        fold_metrics['train_loss'].append(epoch_train_loss)
        fold_metrics['train_acc'].append(epoch_train_acc)
        fold_metrics['val_loss'].append(epoch_val_loss)
        fold_metrics['val_acc'].append(epoch_val_acc)

        epoch_progress.set_postfix({
            'tr_loss': f"{epoch_train_loss:.4f}",
            'tr_acc': f"{epoch_train_acc:.4f}",
            'vl_loss': f"{epoch_val_loss:.4f}",
            'vl_acc': f"{epoch_val_acc:.4f}"
        })

    # ---- per-class report for the final-epoch model on this fold's validation set ----
    model.eval()

    with torch.no_grad():
        predicted = []
        ground_truth = []
        for X_batch, y_batch in val_loader:
            x_next = X_batch.to(DEVICE, non_blocking=True)
            y_next = y_batch.to(DEVICE, non_blocking=True)

            out_a = model(x_next)

            _, pred = torch.max(out_a, 1)
            predicted.append(pred.cpu().numpy())
            ground_truth.append(y_next.cpu().numpy())

        report = classification_report(
            np.concatenate(ground_truth),
            np.concatenate(predicted),
            output_dict=True,
            zero_division=0
        )
        print(json.dumps(report, indent=4))

    print(f"Fold {fold_idx} - Best Val Acc: {best_val_acc:.4f}")



Fold 0:   0%|          | 0/200 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# NOTE(review): no cell above this one assigns `output`; the only assignment visible
# in this notebook is `output = model(x)` in a later cell, so this cell appears to
# depend on out-of-order execution — confirm or remove.
output.shape

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# Cross-Stitch Unit
class CrossStitchUnit(nn.Module):
    """Learned 2x2 linear mixing of the feature maps of two task branches."""

    def __init__(self):
        super().__init__()
        # Learnable mixing weights, initialised close to identity: each branch
        # keeps 0.9 of its own features and takes 0.1 of the other branch's.
        self.alpha = nn.Parameter(torch.tensor([[0.9, 0.1], [0.1, 0.9]]))

    def forward(self, a, b):
        """Return the mixed pair (a_out, b_out); a and b must have the same shape."""
        a_out = self.alpha[0, 0] * a + self.alpha[0, 1] * b
        b_out = self.alpha[1, 0] * a + self.alpha[1, 1] * b
        return a_out, b_out


# A simple convolutional block
class ConvBlock(nn.Module):
    """3x3 same-padded convolution, ReLU, then 2x2 max pooling.

    Args:
        in_channels: channels of the incoming feature map (default 1).
        out_channels: channels produced by the convolution (default 16).
    """

    def __init__(self, in_channels=1, out_channels=16):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2)

    def forward(self, x):
        """Return the pooled activation; height and width are halved."""
        return self.pool(F.relu(self.conv(x)))


# Main Multi-Task Network with Cross-Stitch
class CrossStitchNet(nn.Module):
    """Two-branch multi-task CNN whose branches exchange features once.

    The fully connected heads are sized for 16 x 7 x 7 feature maps, i.e. for
    single-channel 28 x 28 inputs halved by two pooling stages.
    """

    def __init__(self):
        super().__init__()
        # Task-specific first blocks: 1 input channel -> 16 feature maps
        self.taskA_conv1 = ConvBlock(1, 16)
        self.taskB_conv1 = ConvBlock(1, 16)

        # Cross-stitch unit after the first conv stage
        self.cross_stitch = CrossStitchUnit()

        # Task-specific second blocks. They consume the 16-channel maps produced
        # by the first stage, so they must be built with in_channels=16; a
        # default ConvBlock() expects 1 channel and fails on this input.
        self.taskA_conv2 = ConvBlock(16, 16)
        self.taskB_conv2 = ConvBlock(16, 16)

        # Task-specific heads
        self.taskA_fc = nn.Linear(16 * 7 * 7, 10)  # For classification
        self.taskB_fc = nn.Linear(16 * 7 * 7, 1)  # For regression

    def forward(self, x):
        """Return a tuple (outA, outB).

        x: (batch, 1, 28, 28). outA is (batch, 10); outB is (batch, 1).
        """
        a = self.taskA_conv1(x)
        b = self.taskB_conv1(x)

        # Cross-stitch blending
        a, b = self.cross_stitch(a, b)

        # Continue task-specific paths
        a = self.taskA_conv2(a)
        b = self.taskB_conv2(b)

        # Flatten everything except the batch axis
        a = a.view(a.size(0), -1)
        b = b.view(b.size(0), -1)

        # Final heads
        outA = self.taskA_fc(a)
        outB = self.taskB_fc(b)
        return outA, outB

In [None]:
# Smoke test: push one random batch through CrossStitchNet.
x = torch.randn(32, 1, 28, 28)  # Example input: 32 single-channel 28x28 samples

# NOTE: this rebinds the notebook-level name `model`, which the training cell also assigns.
model = CrossStitchNet()
# CrossStitchNet.forward returns a pair (outA, outB), so `output` is a tuple of two tensors.
output = model(x)

In [None]:
# `output` is the (outA, outB) tuple returned by CrossStitchNet.forward; a tuple
# has no `.shape`, so show the shape of each head's output instead.
tuple(head_out.shape for head_out in output)