In [None]:
import os
import torch
import math
import torch
import pandas as pd
import numpy as np
import torch.nn as nn 
import torch.nn.functional as F
import matplotlib.pyplot as plt 
from torch import optim
from tqdm import tqdm
import logging
from torch.utils.tensorboard import SummaryWriter

In [None]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s: %(message)s",
    level=logging.INFO,
    datefmt="%I:%M:%S",
)


def setup_logging(run_name):
    """Create the output folders used by a training run.

    Ensures that ``models/`` and ``results/`` exist and that each contains a
    sub-folder named ``run_name``. Folders that already exist are left alone.

    Args:
        run_name: Name of the run; used as the sub-folder name.
    """
    output_roots = ("models", "results")
    # Top-level folders first, then the per-run sub-folders.
    for root in output_roots:
        os.makedirs(root, exist_ok=True)
    for root in output_roots:
        os.makedirs(os.path.join(root, run_name), exist_ok=True)

# Data Preprocessing

In [None]:
# Fred's encoder code
class DescreteEncoder:
    """Encode activity labels as integer indices and decode index grids.

    A day of ``duration`` time units is split into ``duration // step_size``
    equally sized steps. ``encode`` builds the label <-> index mappings;
    ``decode`` relies on those mappings, so ``encode`` must be called first.

    Args:
        duration: Total length of the encoded day (default 1440).
        step_size: Length of one time step, in the same unit as ``duration``.
    """

    def __init__(self, duration: int = 1440, step_size: int = 10):
        self.duration = duration
        self.step_size = step_size
        self.steps = duration // step_size
        self.index_to_acts = {}
        self.acts_to_index = {}

    def encode(self, data: pd.DataFrame):
        """Replace the labels in ``data['act']`` by integer indices.

        Indices follow the order of first appearance of each label in the
        ``act`` column. The mappings are stored on the encoder, replacing
        those of any previous call.

        Args:
            data: Frame with an ``act`` column of activity labels.

        Returns:
            A copy of ``data`` whose ``act`` column holds the indices; the
            input frame is not modified.
        """
        # Create mappings from activity to index and vice versa
        self.index_to_acts = dict(enumerate(data["act"].unique()))
        self.acts_to_index = {a: i for i, a in self.index_to_acts.items()}

        # Create a new DataFrame for encoded data
        encoded_data = data.copy()
        encoded_data['act'] = encoded_data['act'].map(self.acts_to_index)
        return encoded_data

    def decode(self, encoded_image_grid) -> pd.DataFrame:
        """Turn a grid of activity indices back into activity episodes.

        Consecutive time steps with the same index are merged into a single
        episode. The last episode of every row ends at ``self.duration``.

        Args:
            encoded_image_grid: 2-D ``torch.Tensor`` or array-like with a
                ``shape`` attribute; one row per person, one column per time
                step. Tensors may live on any device and may require grad.

        Returns:
            DataFrame with columns ``pid`` (row position in the grid),
            ``act`` (label), ``start`` and ``end``.

        Raises:
            KeyError: If the grid contains an index unknown to the encoder.
        """
        if isinstance(encoded_image_grid, torch.Tensor):
            # Tensor.numpy() only works for CPU tensors that do not require
            # grad, so detach and move to host memory first.
            encoded_image_grid = encoded_image_grid.detach().cpu().numpy()

        decoded = []
        for pid in range(encoded_image_grid.shape[0]):
            current_act = None
            act_start = None

            for time_step, act_index in enumerate(encoded_image_grid[pid]):
                if current_act is not None and act_index == current_act:
                    continue  # still inside the same episode

                boundary = time_step * self.step_size
                # Activity changed: close the episode that just ended.
                if current_act is not None:
                    decoded.append({
                        "pid": pid,
                        # int() so float-valued grids hit the integer keys.
                        "act": self.index_to_acts[int(current_act)],
                        "start": act_start,
                        "end": boundary
                    })
                current_act = act_index
                act_start = boundary

            # Close the episode that runs until the end of the day.
            if current_act is not None:
                decoded.append({
                    "pid": pid,
                    "act": self.index_to_acts[int(current_act)],
                    "start": act_start,
                    "end": self.duration
                })

        return pd.DataFrame(decoded, columns=["pid", "act", "start", "end"])

In [None]:
# Load data and encode it
# The CSV path is relative to the notebook's working directory.
data = pd.read_csv('./data/nts_population.csv')
# Default encoder settings: duration=1440 and step_size=10, i.e. 144 time steps.
encoder = DescreteEncoder()
# encode() stores the label <-> index mappings on `encoder` and returns a copy
# of `data` whose 'act' column holds the integer indices.
encoded_data = encoder.encode(data)

# Function to convert encoded data into an image grid
def create_image_grid(encoded_data, encoder):
    """Rasterise activity episodes into a (people x time steps) grid.

    Each row of ``encoded_data`` describes one episode. The cells
    ``[start // step_size, end // step_size)`` of that person's grid row are
    set to the episode's activity index. Cells never covered by an episode
    stay 0.

    Args:
        encoded_data: Frame with columns ``pid``, ``act`` (activity index),
            ``start`` and ``end``.
        encoder: Object exposing ``steps`` (number of columns) and
            ``step_size``.

    Returns:
        ``numpy.ndarray`` of shape ``(number of distinct pids, encoder.steps)``
        with the default float dtype of ``np.zeros``.
    """
    # Map pid to sequential indices starting from 0
    pid_to_index = {pid: index for index, pid in enumerate(encoded_data['pid'].unique())}

    grid = np.zeros((len(pid_to_index), encoder.steps))
    step_size = encoder.step_size

    # Iterate the needed columns directly instead of iterrows(): iterrows()
    # upcasts every row to one common dtype, so a single float column would
    # turn start/end into floats and break the slice below. It is also slow.
    needed = (encoded_data[name] for name in ("pid", "act", "start", "end"))
    for pid, act_index, start, end in zip(*needed):
        # Slice bounds must be true integers.
        start_step = int(start // step_size)
        end_step = int(end // step_size)
        grid[pid_to_index[pid], start_step:end_step] = act_index

    return grid

# Build the (people x time steps) grid of activity indices.
image_grid = create_image_grid(encoded_data, encoder)
# create_image_grid fills an np.zeros array, so the indices are stored as
# floats here; later cells call .long() before using them as embedding indices.
tensor_image_grid = torch.tensor(image_grid)
# Persist the grid tensor to disk (path relative to the working directory).
torch.save(tensor_image_grid, './data/image_grid.pt')

In [None]:
# Inspect mappings
for heading, mapping in (
    ("Index to Activities Mapping:", encoder.index_to_acts),
    ("\nActivities to Index Mapping:", encoder.acts_to_index),
):
    print(heading)
    print(mapping)

# Calculate vocabulary size
vocab_size = len(encoder.index_to_acts)
print("\nVocabulary Size:", vocab_size)

# Compare vocabulary size to expected size
expected_vocab_size = 8  # number of distinct activities this notebook expects
if vocab_size == expected_vocab_size:
    print("Vocabulary size matches expected size")
else:
    print(f"Warning: Vocabulary size ({vocab_size}) does not match expected size ({expected_vocab_size})")

In [None]:
def plot_image_grid(image_grid, title = 'Activity Sequence (Training)' ):
    """Show an activity-index grid as a colour-coded image.

    Args:
        image_grid: 2-D array-like; rows are plotted as individuals and
            columns as time steps.
        title: Figure title.
    """
    plt.figure(figsize=(12, 6))
    plt.imshow(image_grid, aspect='auto', cmap="tab10")
    plt.colorbar()
    # Apply the three text labels in one pass.
    for set_text, text in (
        (plt.title, title),
        (plt.xlabel, 'Time Steps'),
        (plt.ylabel, 'Individuals'),
    ):
        set_text(text)
    plt.show()
plot_image_grid(image_grid)

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim

class ActivityDataset(TensorDataset):
    """CBOW training pairs built from activity-index sequences.

    For every position ``i`` that has ``context_size`` neighbours on both
    sides, one sample ``(context, target)`` is stored, where ``target`` is
    ``sequence[i]`` and ``context`` holds the ``2 * context_size``
    surrounding values in order (left neighbours first, the centre excluded).

    Note: the class keeps its own ``self.data`` list and overrides
    ``__len__``/``__getitem__``; ``TensorDataset.__init__`` is not called.

    Args:
        sequences: Iterable of 1-D sequences (tensor rows or lists).
        context_size: Number of neighbours taken on each side.
    """

    def __init__(self, sequences, context_size=2):
        window = 2 * context_size + 1
        self.data = []
        for sequence in sequences:
            seq = torch.as_tensor(sequence)
            if seq.size(0) < window:
                continue  # too short to hold a single full window
            # All sliding windows at once: shape (len - window + 1, window).
            windows = seq.unfold(0, window, 1)
            contexts = torch.cat(
                (windows[:, :context_size], windows[:, context_size + 1:]), dim=1
            )
            # clone() so stored targets do not alias the caller's tensor.
            targets = windows[:, context_size].clone()
            self.data.extend(zip(contexts, targets))

    def __len__(self):
        """Number of (context, target) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(context, target)`` tensor pair at position ``idx``."""
        return self.data[idx]

class CBOWModel(nn.Module):
    """Continuous bag-of-words model over activity indices.

    The context indices are embedded, averaged along the context axis and
    projected to one score per vocabulary entry.

    Args:
        vocab_size: Number of distinct activity indices.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Score every vocabulary entry for each context window.

        Args:
            inputs: Integer tensor of shape ``(batch, context_len)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` with unnormalised scores.
        """
        # Follow the model's own device rather than a notebook-level global,
        # so the module works regardless of cell execution order.
        inputs = inputs.to(self.embeddings.weight.device)
        embedded = self.embeddings(inputs).mean(dim=1)
        return self.linear(embedded)

def train_model(dataset, model, optimizer, epochs=10, device=None):
    """Train ``model`` on ``(context, target)`` pairs with cross-entropy loss.

    Prints the mean batch loss after every epoch.

    Args:
        dataset: Map-style dataset yielding ``(context, target)`` tensors.
        model: Module mapping a context batch to per-class scores.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``dataset``.
        device: Device the batches are moved to. ``None`` (default) uses the
            device of the model's first parameter, so the function also runs
            on machines without CUDA.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    loss_function = nn.CrossEntropyLoss()
    # The loader reshuffles on every iteration, so build it once.
    loader = DataLoader(dataset, batch_size=128, shuffle=True)
    for epoch in range(epochs):
        total_loss = 0.0
        for context, target in loader:
            context, target = context.to(device), target.to(device)  # Ensure data is on the correct device
            model.zero_grad()
            logits = model(context)
            loss = loss_function(logits, target.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # loss.item() is already a per-batch mean, so average over batches
        # (not over samples) to report a meaningful value.
        print(f'Epoch {epoch}, Loss: {total_loss / max(len(loader), 1)}')

# Assuming tensor_image_grid is already defined and is a tensor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = ActivityDataset(tensor_image_grid.long(), context_size=2)
# Size the embedding table from the encoder's vocabulary instead of a
# hard-coded 8, so every activity index present in the data has a row.
CBOW = CBOWModel(vocab_size=len(encoder.index_to_acts), embedding_dim=6).to(device)
optimizer = optim.Adam(CBOW.parameters(), lr=0.001)

# Train the model
# Pass the device explicitly so batches land on the same device as the model
# (this also makes the cell run on CPU-only machines).
train_model(dataset, CBOW, optimizer, epochs=10, device=device)

In [None]:
embedding_weights = CBOW.embeddings.weight.data

# Use the encoder's own label -> index mapping. It is derived from the data
# (order of first appearance), so a hand-written table could attach the wrong
# label to a vector.
activities_to_idx = dict(encoder.acts_to_index)

# Print the vector for each activity
for activity, idx in activities_to_idx.items():
    vector = embedding_weights[idx]
    print(f"Vector for {activity}: {vector}")

In [None]:
tensor_image_grid = tensor_image_grid.to(device)
tensor_image_grid = tensor_image_grid.long()

# Apply embeddings
# The embeddings are fixed inputs for the diffusion model, so compute them
# without autograd. Otherwise every batch would stay attached to the CBOW
# graph and a second backward pass through it would fail.
with torch.no_grad():
    embedded_sequences = CBOW.embeddings(tensor_image_grid)


dataset = TensorDataset(embedded_sequences)

# Initialize DataLoader
batch_size = 64
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


# Sanity check: print the shape of one batch.
for embedded_batch in dataloader:
    print(embedded_batch[0].shape)  #([batch_size, 144, embedding_dim])
    break


# Diffusion

In [None]:
class Diffusion:
    """Gaussian diffusion process over sequences of embedding vectors.

    Holds a linear ``beta`` schedule of ``noise_steps`` values and the derived
    ``alpha = 1 - beta`` and ``alpha_hat = cumprod(alpha)`` tensors, all stored
    on ``device``.

    Args:
        noise_steps: Number of values in the noise schedule.
        beta_start: First value of the linear beta schedule.
        beta_end: Last value of the linear beta schedule.
        sequence_length: Length of the sequences produced by ``sample``.
        device: Device on which schedule tensors and samples are created.
    """

    def __init__(self, noise_steps=25, beta_start=0.0001, beta_end=0.02, sequence_length=144, device="cuda"):
        self.noise_steps = noise_steps
        self.beta_start = beta_start
        self.beta_end = beta_end
        self.sequence_length = sequence_length
        self.device = device

        self.beta = self.schedule_noise().to(device)
        self.alpha = 1. - self.beta
        self.alpha_hat = torch.cumprod(self.alpha, dim=0)

    def schedule_noise(self):
        """Return the linear beta schedule as a 1-D tensor of ``noise_steps`` values."""
        return torch.linspace(self.beta_start, self.beta_end, self.noise_steps)

    def noise_embeddings(self, embeddings, t):
        """Add noise to ``embeddings`` at the timesteps ``t``.

        Computes ``sqrt(alpha_hat[t]) * x + sqrt(1 - alpha_hat[t]) * eps`` with
        ``eps`` drawn from a standard normal distribution.

        Args:
            embeddings: Tensor of shape ``(batch, seq_len, dim)`` (the
                per-sample factors are broadcast over the last two axes).
            t: Integer tensor of shape ``(batch,)`` indexing the schedule.

        Returns:
            Tuple ``(noisy_embeddings, epsilon)``, both cast to float32.
        """
        # Index with a tensor on the schedule's device.
        t = t.to(self.alpha_hat.device)
        sqrt_alpha_hat = torch.sqrt(self.alpha_hat[t])[:, None, None]
        sqrt_one_minus_alpha_hat = torch.sqrt(1. - self.alpha_hat[t])[:, None, None]

        epsilon = torch.randn_like(embeddings)

        noisy_embeddings = sqrt_alpha_hat * embeddings + sqrt_one_minus_alpha_hat * epsilon

        # Debug: Check for NaNs
        if torch.isnan(noisy_embeddings).any():
            print("NaNs detected in noisy_embeddings")
            print("sqrt_alpha_hat:", sqrt_alpha_hat)
            print("sqrt_one_minus_alpha_hat:", sqrt_one_minus_alpha_hat)
            print("epsilon:", epsilon)

        return noisy_embeddings.float(), epsilon.float()

    def sample_timesteps(self, n):
        """Draw ``n`` random timesteps in ``[1, noise_steps)`` on ``self.device``."""
        # Create on self.device so the result can be passed straight to a
        # model living on that device.
        return torch.randint(low=1, high=self.noise_steps, size=(n,), device=self.device)

    def sample(self, model, n, embedding_dim):
        """Generate ``n`` sequences by iteratively denoising Gaussian noise.

        Starts from standard-normal noise and applies the reverse update for
        ``i = noise_steps - 1, ..., 1``, using ``model(x, t)`` as the predicted
        noise. No fresh noise is added on the last update (``i == 1``).

        Args:
            model: Callable module taking ``(x, t)`` and returning a tensor
                shaped like ``x``.
            n: Number of sequences to generate.
            embedding_dim: Size of the last axis of the generated tensor.

        Returns:
            Tensor of shape ``(n, sequence_length, embedding_dim)``.
        """
        logging.info(f"Sampling {n} new sequences...")
        # Remember the mode so sampling does not silently flip an eval-mode
        # model into training mode.
        was_training = getattr(model, "training", True)
        model.eval()
        with torch.no_grad():
            x = torch.randn((n, self.sequence_length, embedding_dim)).to(self.device)
            for i in tqdm(reversed(range(1, self.noise_steps)), position=0):
                t = torch.full((n,), i, dtype=torch.long, device=self.device)
                predicted_noise = model(x, t)

                alpha = self.alpha[t][:, None, None]
                alpha_hat = self.alpha_hat[t][:, None, None]
                beta = self.beta[t][:, None, None]

                if i > 1:
                    noise = torch.randn_like(x)
                else:
                    noise = torch.zeros_like(x)

                x = (1 / torch.sqrt(alpha)) * (x - ((1 - alpha) / torch.sqrt(1 - alpha_hat)) * predicted_noise) + torch.sqrt(beta) * noise

        model.train(was_training)
        return x

![Diffusion Sampling Algorithm](sampling.png)


In [None]:
from models.transformer import TransformerWithPositionalEncoding

def setup_logging(run_name):
    """Configure logging and create the output folders for ``run_name``.

    This definition replaces the earlier ``setup_logging`` in the notebook, so
    it also has to create ``models/<run_name>`` and ``results/<run_name>``;
    otherwise the folder setup would be silently lost.

    Args:
        run_name: Name of the run; used as the sub-folder name.
    """
    # No-op if the root logger has already been configured.
    logging.basicConfig(level=logging.INFO)
    for root in ("models", "results"):
        # makedirs also creates the missing parent folder.
        os.makedirs(os.path.join(root, run_name), exist_ok=True)

def train(model, diffusion, device):
    """Train ``model`` to predict the noise added by ``diffusion``.

    For every batch of the notebook-level ``dataloader``, random timesteps are
    drawn, the embeddings are noised with ``diffusion.noise_embeddings`` and
    the model output is regressed (MSE) onto the noise that was added. This
    matches ``Diffusion.sample``, which uses the model output as predicted
    noise. Losses are written to TensorBoard under ``runs/first_run``.

    Args:
        model: Module called as ``model(noisy_x, timesteps)``; expected to
            already live on ``device``.
        diffusion: Object providing ``noise_steps`` and ``noise_embeddings``.
        device: Device the batches are moved to. ``None`` selects CUDA when
            available, else CPU.
    """
    run_name = "first_run"
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    learning_rate = 0.002
    epochs = 150

    setup_logging(run_name)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    mse = nn.MSELoss()
    logger = SummaryWriter(os.path.join("runs", run_name))

    try:
        for epoch in range(epochs):
            logging.info(f"Starting epoch {epoch}")
            epoch_loss = 0.0
            for batch_idx, (x,) in enumerate(tqdm(dataloader)):
                # The embeddings are fixed training data: detach them so the
                # backward pass stops at the diffusion model and no
                # retain_graph workaround is needed.
                embedded_x = x.to(device).float().detach()

                timesteps = torch.randint(0, diffusion.noise_steps, (embedded_x.size(0),), device=device)
                noisy_x, noise = diffusion.noise_embeddings(embedded_x, timesteps)
                predicted_noise = model(noisy_x, timesteps)

                # Target is the added noise, not the noisy input itself.
                loss = mse(predicted_noise, noise)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()

                # Log training loss to TensorBoard every 10 batches
                if batch_idx % 10 == 0:
                    logger.add_scalar("Loss/train", loss.item(), epoch * len(dataloader) + batch_idx)

            avg_epoch_loss = epoch_loss / len(dataloader)
            logging.info(f"Epoch {epoch} Average Loss: {avg_epoch_loss}")
            logger.add_scalar("Loss/epoch_avg_train", avg_epoch_loss, epoch)
    finally:
        # Flush and close the writer even if training is interrupted.
        logger.close()

device = "cuda" if torch.cuda.is_available() else "cpu"
# feature_size matches the CBOW embedding width; max_time_steps matches the
# 144-step sequences.
model = TransformerWithPositionalEncoding(feature_size=6, num_layers=3, max_time_steps=144).to(device)
# Use the detected device rather than a hard-coded "cuda", so the schedule
# tensors live next to the model and the cell also runs on CPU-only machines.
diffusion = Diffusion(noise_steps=25, beta_start=0.0001, beta_end=0.02, sequence_length=144, device=device)

train(model, diffusion, device=device)

In [None]:
def map_embeddings_to_activities(embedded_sequences, CBOW):
    """Assign each embedding vector to its most similar activity index.

    Both the vocabulary embeddings and the input vectors are L2-normalised,
    so the dot product used for the match is the cosine similarity.

    Args:
        embedded_sequences: Tensor of shape ``(n, sequence_length, embedding_dim)``.
        CBOW: Object exposing ``embeddings.weight.data`` of shape
            ``(num_activities, embedding_dim)``.

    Returns:
        Integer tensor of shape ``(n, sequence_length)`` with activity indices.
    """
    vocab_unit = F.normalize(CBOW.embeddings.weight.data, p=2, dim=1)
    sequences_unit = F.normalize(embedded_sequences, p=2, dim=-1)

    n, sequence_length, embedding_dim = sequences_unit.size()

    # Flatten to (n * sequence_length, embedding_dim) and score every vector
    # against every vocabulary entry.
    similarities = torch.matmul(sequences_unit.view(-1, embedding_dim), vocab_unit.T)

    # Keep the index of the best-scoring entry, then restore the
    # (n, sequence_length) layout.
    best_index = torch.max(similarities, dim=1)[1]
    return best_index.view(n, sequence_length)


In [None]:
# After generating new sequences using the diffusion model
# sample() starts from Gaussian noise and returns a tensor of shape
# (n, diffusion.sequence_length, embedding_dim); embedding_dim is set to 6 here,
# the same width used for the CBOW embeddings in this notebook.
generated_sequences = diffusion.sample(model = model, n=32, embedding_dim=6)

# Map the generated sequences back to discrete activities
# Each generated vector is assigned the activity whose CBOW embedding has the
# highest cosine similarity; the result has shape (n, sequence_length).
discrete_activities = map_embeddings_to_activities(generated_sequences, CBOW)


In [None]:
# Decode each activity index back to its label
# tolist() yields plain Python ints, which are the keys of index_to_acts.
decoded_sequences = [
    [encoder.index_to_acts[activity_index] for activity_index in row]
    for row in discrete_activities.tolist()
]


In [None]:
# Tabulate the decoded labels: one row per sampled sequence and one "Step i"
# column per time step (the column count is taken from the first sequence).
df_decoded = pd.DataFrame(decoded_sequences, columns=[f"Step {i}" for i in range(len(decoded_sequences[0]))])
# Show the first five rows as the cell output.
df_decoded.head(5) 

In [None]:
# Plot the sampled activity indices; .cpu() moves the tensor to host memory
# before it is handed to matplotlib.
plot_image_grid(discrete_activities.cpu(), title = 'Activity Sequence (Sampled)')