In [409]:
import copy
import os
from copy import deepcopy

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

Transformation Function

In [410]:
def get_tta_transforms(p=0.1):
    """Build the random feature-masking augmentation used for teacher predictions.

    Args:
        p: Probability, in [0, 1], that any single entry of the input is
            zeroed. Defaults to 0.1.

    Returns:
        A callable taking a tensor and returning a tensor of the same shape in
        which every entry is either unchanged or replaced by 0.

    Raises:
        ValueError: If ``p`` is outside [0, 1].
    """
    if not 0.0 <= p <= 1.0:
        raise ValueError(f"masking probability must be in [0, 1], got {p}")

    class RandomFeatureMasking:
        """Zero each entry of a tensor independently with probability ``p``."""

        def __init__(self, p=0.1):
            self.p = p

        def __call__(self, x):
            # Draw the uniform noise directly with x's shape/device instead of
            # materialising a float copy of x just to call rand_like on it.
            keep = (torch.rand(x.shape, device=x.device) > self.p).long()
            # NOTE(review): for integer feature ids a masked entry becomes id 0;
            # confirm id 0 is an acceptable stand-in for "feature missing".
            return x * keep

    return RandomFeatureMasking(p=p)

CoTTA Utility Functions

In [411]:
def copy_model_and_optimizer(model, optimizer):
    """Snapshot a model/optimizer pair and create the CoTTA helper copies.

    Returns:
        A 4-tuple ``(model_state, optimizer_state, ema_model, model_anchor)``:
        deep copies of both state dicts, a deep copy of the model whose
        parameters have been detached in place (the EMA teacher), and a second
        untouched deep copy of the model (the anchor).
    """
    # Teacher copy: detach every parameter in place.
    ema_model = deepcopy(model)
    for ema_param in ema_model.parameters():
        ema_param.detach_()

    # Anchor copy: left exactly as the model is at call time.
    model_anchor = deepcopy(model)

    return (
        deepcopy(model.state_dict()),
        deepcopy(optimizer.state_dict()),
        ema_model,
        model_anchor,
    )

def load_model_and_optimizer(model, optimizer, model_state, optimizer_state):
    """Load saved state dicts back into ``model`` (strictly) and ``optimizer``."""
    # Model first, then optimizer; only the model load takes the strict flag.
    restore_plan = (
        (model, model_state, {"strict": True}),
        (optimizer, optimizer_state, {}),
    )
    for target, snapshot, options in restore_plan:
        target.load_state_dict(snapshot, **options)

def configure_model(model):
    """Prepare ``model`` in place for CoTTA adaptation and return it.

    The model is put in train mode and every parameter is frozen; parameters
    of ``nn.Embedding`` and ``nn.BatchNorm1d`` modules are then re-enabled for
    gradients. BatchNorm1d modules additionally stop tracking running
    statistics and have their running mean/variance cleared.
    """
    model.train()
    model.requires_grad_(False)

    for module in model.modules():
        is_batchnorm = isinstance(module, nn.BatchNorm1d)
        if not (is_batchnorm or isinstance(module, nn.Embedding)):
            continue

        module.requires_grad_(True)
        if is_batchnorm:
            # With no stored running stats, forward passes use batch statistics.
            module.track_running_stats = False
            module.running_mean = None
            module.running_var = None

    return model

def collect_params(model):
    """Collect every trainable parameter of ``model`` exactly once.

    Iterating ``named_modules()`` and then each module's (recursive)
    ``named_parameters()`` yields the same parameter once per ancestor module,
    which hands duplicate tensors to the optimizer. Walking
    ``model.named_parameters()`` directly visits each parameter a single time
    under its fully qualified name.

    Args:
        model: The ``nn.Module`` to inspect.

    Returns:
        ``(params, names)``: the parameters with ``requires_grad=True`` and
        their qualified names, in ``named_parameters()`` order.
    """
    params = []
    names = []
    for name, param in model.named_parameters():
        if param.requires_grad:
            params.append(param)
            names.append(name)
            print(f"Parameter to adapt: {name}")
    return params, names

Symmetric Binary Cross-Entropy (Student–Teacher Consistency Loss)

In [412]:
def binary_entropy(logits, logits_ema):
    """Symmetric binary cross-entropy between two sets of logits.

    Both inputs are passed through a sigmoid. The result is the average of the
    cross-entropy of the first prediction against the second and of the second
    against the first, computed element-wise with a 1e-8 offset inside every
    logarithm. Singleton dimensions are squeezed out of the returned tensor.
    """
    eps = 1e-8
    student = torch.sigmoid(logits)
    teacher = torch.sigmoid(logits_ema)

    # Teacher probabilities as targets for the student prediction.
    teacher_as_target = (
        teacher * torch.log(student + eps)
        + (1 - teacher) * torch.log(1 - student + eps)
    )
    # Student probabilities as targets for the teacher prediction.
    student_as_target = (
        student * torch.log(teacher + eps)
        + (1 - student) * torch.log(1 - teacher + eps)
    )

    loss = -0.5 * teacher_as_target
    loss = loss - 0.5 * student_as_target
    return loss.squeeze()

CoTTA Class

In [413]:
class CoTTA(nn.Module):
    """Continual test-time adaptation wrapper for a model that emits binary logits.

    On every forward pass the wrapped (student) model is updated with one or
    more gradient steps on a symmetric cross-entropy between its own logits
    and those of an exponential-moving-average (EMA) teacher. The teacher is
    then moved towards the student, and a small random fraction of the
    student's trainable weights is reset to the values captured at
    construction time (stochastic restore).

    Args:
        model: Student model; expected to be already configured for adaptation.
        optimizer: Optimizer over the student's trainable parameters.
        steps: Number of adaptation steps per forward call (must be > 0).
        episodic: If True, reset to the saved state before every forward call.
        num_augmentations: Number of augmented teacher passes to average when
            augmentation is triggered.
        aug_threshold: Augmentation is triggered when the mean anchor
            probability of the batch is below this value.
        alpha_teacher: EMA decay used for the teacher update.
        restore_prob: Per-element probability of restoring a trainable weight
            to its saved value after each step.
    """
    def __init__(self, model, optimizer, steps=1, episodic=False,
                 num_augmentations=32, aug_threshold=0.1,
                 alpha_teacher=0.999, restore_prob=0.001):
        super().__init__()
        assert steps > 0, "CoTTA requires >= 1 step(s) to forward and update"
        if num_augmentations < 1:
            raise ValueError("num_augmentations must be >= 1")
        self.model = model
        self.optimizer = optimizer
        self.steps = steps
        self.episodic = episodic
        self.num_augmentations = num_augmentations
        self.aug_threshold = aug_threshold
        self.alpha_teacher = alpha_teacher
        self.restore_prob = restore_prob

        self.model_state, self.optimizer_state, self.model_ema, self.model_anchor = \
            copy_model_and_optimizer(self.model, self.optimizer)
        self.transform = get_tta_transforms()

    def forward(self, x):
        """Adapt on batch ``x`` and return the teacher logits from the last step."""
        if self.episodic:
            self.reset()
        for _ in range(self.steps):
            outputs = self.forward_and_adapt(x, self.model, self.optimizer)
        return outputs

    def reset(self):
        """Restore student and optimizer to the saved state and rebuild the copies.

        Raises:
            Exception: If no saved model/optimizer state is available.
        """
        if self.model_state is None or self.optimizer_state is None:
            raise Exception("Cannot reset without saved model/optimizer state")
        load_model_and_optimizer(self.model, self.optimizer,
                                 self.model_state, self.optimizer_state)
        self.model_state, self.optimizer_state, self.model_ema, self.model_anchor = \
            copy_model_and_optimizer(self.model, self.optimizer)

    @staticmethod
    def _update_ema(ema_model, model, alpha_teacher):
        """Move each teacher parameter towards the student: ema = a*ema + (1-a)*student.

        Kept inside the class because no ``update_ema_variables`` helper is
        defined alongside it, so the teacher update would otherwise fail on
        the first adaptation step.
        """
        with torch.no_grad():
            for ema_param, param in zip(ema_model.parameters(), model.parameters()):
                ema_param.mul_(alpha_teacher).add_(param.detach(), alpha=1.0 - alpha_teacher)
        return ema_model

    @torch.enable_grad()
    def forward_and_adapt(self, x, model, optimizer):
        """Run one adaptation step on ``x`` and return the teacher logits.

        ``model`` is accepted for interface compatibility; the student that is
        run, EMA-tracked and restored is always ``self.model``.
        """
        outputs = self.model(x)
        self.model_ema.train()

        # Teacher/anchor predictions are only ever used as targets, so no
        # autograd graph is needed for them.
        with torch.no_grad():
            anchor_prob = torch.sigmoid(self.model_anchor(x))
            # NOTE(review): this triggers augmentation when the *mean predicted
            # probability* is low, not when confidence max(p, 1-p) is low —
            # confirm this is the intended binary analogue of the confidence test.
            to_aug = anchor_prob.mean() < self.aug_threshold
            if to_aug:
                # Augmentation-averaged teacher prediction.
                outputs_ema = torch.stack(
                    [self.model_ema(self.transform(x)) for _ in range(self.num_augmentations)]
                ).mean(0)
            else:
                outputs_ema = self.model_ema(x)

        # Student update
        loss = binary_entropy(outputs, outputs_ema).mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Teacher update
        self.model_ema = self._update_ema(self.model_ema, self.model, self.alpha_teacher)

        # Stochastic restore. named_parameters() on the root visits every
        # parameter exactly once; nesting it inside named_modules() would
        # repeat the restore once per ancestor module and inflate the
        # effective restore probability.
        with torch.no_grad():
            for name, p in self.model.named_parameters():
                if not p.requires_grad:
                    continue
                restore = torch.rand(p.shape, device=p.device) < self.restore_prob
                p.copy_(torch.where(restore, self.model_state[name], p))
        return outputs_ema

Data Loading and Processing

In [414]:
def load_data(data_path):
    """Load the train/test tables and the feature count from ``data_path``.

    Reads ``feat.bid.txt`` (only its first line, which must hold the integer
    feature count), ``train.bid.txt`` and ``test.bid.txt`` from the directory.
    The two data files are parsed with ``pandas.read_csv`` without a header
    row.

    Args:
        data_path: Directory containing the three files.

    Returns:
        ``(train_data, test_data, feature_nums)``: two DataFrames and an int.

    Raises:
        ValueError: If the first line of ``feat.bid.txt`` is empty or not an
            integer.
    """
    feat_index_path = os.path.join(data_path, 'feat.bid.txt')
    # Only the first line is needed; avoid reading the whole index into memory.
    with open(feat_index_path, 'r') as f:
        first_line = f.readline().strip()
    if not first_line:
        raise ValueError(
            f"{feat_index_path} is empty; expected the feature count on its first line"
        )
    feature_nums = int(first_line)

    # Features are assumed to be already mapped to integers in the data files.
    train_data = pd.read_csv(os.path.join(data_path, 'train.bid.txt'), header=None)
    test_data = pd.read_csv(os.path.join(data_path, 'test.bid.txt'), header=None)

    return train_data, test_data, feature_nums

Dataset Class

In [415]:
class CTRDataset(Dataset):
    """Map-style dataset over a positional-column table.

    Column 0 is used as the float32 label and columns 5 onward as int64
    feature values; columns 1-4 are ignored. Each item is a
    ``(features, label)`` pair of tensors.
    """

    def __init__(self, data):
        # Column layout per the source data: click, winning price, hour,
        # time_fraction, timestamp, then the features.
        click_column = data.iloc[:, 0]
        feature_columns = data.iloc[:, 5:]
        self.labels = click_column.values.astype(np.float32)
        self.features = feature_columns.values.astype(np.int64)

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        return (
            torch.tensor(self.features[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float32),
        )

## Model Implementations
### DeepFM Model

In [416]:
class DeepFM(nn.Module):
    """DeepFM: first-order term + factorization-machine term + MLP, summed into one logit.

    ``forward`` takes integer feature ids and returns raw logits (no sigmoid)
    with one value per row.
    """

    def __init__(self, feature_nums, field_nums, embed_dim, mlp_dims, dropout):
        super().__init__()
        self.field_nums = field_nums

        # First-order weights: one scalar per feature id.
        self.linear = nn.Embedding(feature_nums, 1)

        # Second-order (FM) latent vectors.
        self.fm_embedding = nn.Embedding(feature_nums, embed_dim)

        # Separate embedding table feeding the MLP.
        self.deep_embedding = nn.Embedding(feature_nums, embed_dim)
        widths = [field_nums * embed_dim, *mlp_dims]
        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            blocks += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
        self.mlp = nn.Sequential(*blocks)
        self.fc = nn.Linear(widths[-1], 1)

    def forward(self, x):
        # First-order term, summed over fields -> (batch_size, 1)
        first_order = self.linear(x).sum(1)

        # FM term: 0.5 * ((sum v)^2 - sum v^2), reduced over the embedding axis.
        latent = self.fm_embedding(x)
        square_of_sum = latent.sum(dim=1).pow(2)
        sum_of_squares = latent.pow(2).sum(dim=1)
        second_order = 0.5 * (square_of_sum - sum_of_squares).sum(1, keepdim=True)

        # Deep term: flatten the per-field embeddings and run the MLP.
        flat_width = self.field_nums * latent.size(2)
        deep_input = self.deep_embedding(x).view(-1, flat_width)
        deep_logit = self.fc(self.mlp(deep_input))

        # Raw logits, shape (batch_size, 1)
        return first_order + second_order + deep_logit

### FNN Model

In [417]:
class FNN(nn.Module):
    """Embedding table followed by an MLP that ends in a single-logit layer.

    ``forward`` takes integer feature ids and returns raw logits (no sigmoid).
    """

    def __init__(self, feature_nums, field_nums, embed_dim, mlp_dims, dropout):
        super().__init__()
        self.field_nums = field_nums

        # Shared embedding table for all fields.
        self.embedding = nn.Embedding(feature_nums, embed_dim)

        # Hidden stack (Linear -> ReLU -> Dropout per entry of mlp_dims),
        # closed by the 1-unit output layer inside the same Sequential.
        widths = [field_nums * embed_dim, *mlp_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
        stack.append(nn.Linear(widths[-1], 1))
        self.mlp = nn.Sequential(*stack)

    def forward(self, x):
        flat_width = self.field_nums * self.embedding.embedding_dim
        flattened = self.embedding(x).view(-1, flat_width)
        # Raw logits, shape (batch_size, 1)
        return self.mlp(flattened)

### DCN Model

In [418]:
class CrossNetwork(nn.Module):
    """Stack of cross layers computing ``x <- x0 * W_i(x) + b_i + x``.

    Each layer owns a bias-free square ``nn.Linear`` and a separate
    zero-initialised bias vector.
    """

    def __init__(self, input_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers

        projections = []
        for _ in range(num_layers):
            projections.append(nn.Linear(input_dim, input_dim, bias=False))
        self.cross_layers = nn.ModuleList(projections)

        biases = []
        for _ in range(num_layers):
            biases.append(nn.Parameter(torch.zeros(input_dim)))
        self.cross_bias = nn.ParameterList(biases)

    def forward(self, x0):
        state = x0
        for layer_idx in range(self.num_layers):
            projected = self.cross_layers[layer_idx](state)
            # Element-wise interaction with the original input plus a residual.
            state = x0 * projected + self.cross_bias[layer_idx] + state
        return state

class DCN(nn.Module):
    """Deep & Cross network: a cross tower and an MLP tower over shared embeddings.

    The outputs of both towers are concatenated and mapped to a single raw
    logit per row (no sigmoid).
    """

    def __init__(self, feature_nums, field_nums, embed_dim, num_layers, mlp_dims, dropout):
        super().__init__()
        self.field_nums = field_nums

        # Shared embedding table.
        self.embedding = nn.Embedding(feature_nums, embed_dim)

        # Cross tower operates on the flattened embeddings.
        flat_dim = field_nums * embed_dim
        self.cross_network = CrossNetwork(flat_dim, num_layers)

        # Deep tower: Linear -> ReLU -> Dropout per entry of mlp_dims.
        widths = [flat_dim, *mlp_dims]
        tower = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            tower += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
        self.deep_network = nn.Sequential(*tower)

        # Output layer over [cross output, deep output].
        self.fc = nn.Linear(widths[-1] + flat_dim, 1)

    def forward(self, x):
        flat_width = self.field_nums * self.embedding.embedding_dim
        flattened = self.embedding(x).view(-1, flat_width)
        towers = [self.cross_network(flattened), self.deep_network(flattened)]
        # Raw logits, shape (batch_size, 1)
        return self.fc(torch.cat(towers, dim=1))

### AFM Model

In [419]:
class AFM(nn.Module):
    """Attentional factorization machine producing one raw logit per row.

    Pairwise element-wise products of field embeddings are weighted by a small
    attention MLP (softmax over pairs), summed, passed through dropout and a
    linear layer, and added to a first-order term.

    Args:
        feature_nums: Size of the embedding vocabularies.
        field_nums: Number of fields whose pairwise interactions are modelled.
        embed_dim: Embedding width.
        attn_size: Hidden width of the attention MLP.
        dropout: Dropout probability applied to the attention-pooled vector.
    """

    def __init__(self, feature_nums, field_nums, embed_dim, attn_size, dropout):
        super(AFM, self).__init__()
        self.field_nums = field_nums
        self.embed_dim = embed_dim

        # Embedding layer
        self.embedding = nn.Embedding(feature_nums, embed_dim)

        # Attention mechanism
        self.attention_fc = nn.Sequential(
            nn.Linear(embed_dim, attn_size),
            nn.ReLU(),
            nn.Linear(attn_size, 1)
        )
        self.dropout = nn.Dropout(dropout)

        # Prediction layer
        self.fc = nn.Linear(embed_dim, 1)

        # Linear part
        self.linear = nn.Embedding(feature_nums, 1)

        # Index pairs (i, j) with i < j, in the same row-major order a nested
        # ``for i / for j in range(i + 1, ...)`` loop would produce. They are
        # non-persistent buffers so they follow .to(device) without adding
        # keys to the state_dict (existing checkpoints keep loading).
        pair_left, pair_right = torch.triu_indices(field_nums, field_nums, offset=1)
        self.register_buffer('_pair_left', pair_left, persistent=False)
        self.register_buffer('_pair_right', pair_right, persistent=False)

    def forward(self, x):
        x_embed = self.embedding(x)
        linear_out = self.linear(x).sum(1)

        # Pairwise interactions, gathered in one indexing op instead of
        # F*(F-1)/2 Python-level slices. Shape: (batch_size, num_pairs, embed_dim)
        interactions = x_embed[:, self._pair_left] * x_embed[:, self._pair_right]

        # Attention mechanism
        attn_scores = self.attention_fc(interactions).squeeze(-1)  # (batch_size, num_pairs)
        attn_scores = torch.softmax(attn_scores, dim=1)
        attn_output = torch.sum(attn_scores.unsqueeze(-1) * interactions, dim=1)  # (batch_size, embed_dim)
        attn_output = self.dropout(attn_output)

        # Prediction without sigmoid
        logits = linear_out + self.fc(attn_output)  # (batch_size, 1)
        return logits

## Training and Testing with CoTTA
### Main Function

In [420]:
def main(data_path=None, model_name='AFM', seed=42):
    """Train a CTR model with early stopping, then evaluate it on the test set under CoTTA.

    Args:
        data_path: Directory with ``feat.bid.txt``, ``train.bid.txt`` and
            ``test.bid.txt``. When None, the ``IPINYOU_DATA_DIR`` environment
            variable is used, falling back to the original hard-coded path.
        model_name: One of 'DeepFM', 'FNN', 'DCN', 'AFM'.
        seed: Seed for numpy, torch and the train/validation split.

    Raises:
        ValueError: If ``model_name`` is not recognised.
    """
    if data_path is None:
        data_path = os.environ.get(
            'IPINYOU_DATA_DIR',
            '/home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/data/ipinyou/1458/',
        )

    # Seed every RNG involved so training and the random masking are repeatable.
    np.random.seed(seed)
    torch.manual_seed(seed)

    train_data, test_data, feature_nums = load_data(data_path)
    field_nums = train_data.shape[1] - 5  # Subtract non-feature columns

    # Split train data into training and validation sets
    train_df, val_df = train_test_split(train_data, test_size=0.2, random_state=seed)

    # Create datasets and loaders
    batch_size = 1024
    train_loader = DataLoader(CTRDataset(train_df), batch_size=batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(CTRDataset(val_df), batch_size=batch_size, num_workers=4)
    test_loader = DataLoader(CTRDataset(test_data), batch_size=batch_size, num_workers=4)

    # Model parameters
    embed_dim = 10
    mlp_dims = [300, 300, 300]
    dropout = 0.2
    num_layers = 5  # For DCN
    attn_size = 32  # For AFM
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    epochs = 100  # Upper bound; early stopping normally ends training sooner

    # Lazily-evaluated constructors so only the requested model is built.
    builders = {
        'DeepFM': lambda: DeepFM(feature_nums, field_nums, embed_dim, mlp_dims, dropout),
        'FNN': lambda: FNN(feature_nums, field_nums, embed_dim, mlp_dims, dropout),
        'DCN': lambda: DCN(feature_nums, field_nums, embed_dim, num_layers, mlp_dims, dropout),
        'AFM': lambda: AFM(feature_nums, field_nums, embed_dim, attn_size, dropout),
    }
    if model_name not in builders:
        raise ValueError('Unknown model name')
    model = builders[model_name]().to(device)

    # Loss and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    # Training with early stopping
    train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs, early_stopping_patience=5)

    # Load best model; map_location keeps this working when the checkpoint
    # was written on a different device than the current one.
    model.load_state_dict(torch.load('best_model.pth', map_location=device))

    # Configure model for CoTTA
    model = configure_model(model)
    params, param_names = collect_params(model)
    optimizer = torch.optim.SGD(params, lr=0.0001)
    cotta_model = CoTTA(model, optimizer, steps=1, episodic=False)

    # Testing with CoTTA
    # NOTE(review): configure_model() put the student in train mode; this call
    # switches it back to eval while the teacher/anchor copies made inside
    # CoTTA (deep-copied before this line) stay in train mode — confirm intended.
    model.eval()
    y_true = []
    y_scores = []

    for x_batch, y_batch in test_loader:
        x_batch = x_batch.to(device)

        outputs = cotta_model(x_batch)  # Outputs are logits
        probabilities = torch.sigmoid(outputs).detach().cpu().numpy().flatten()

        # Labels are only consumed on the CPU, so they never need to visit the device.
        y_true.extend(y_batch.numpy())
        y_scores.extend(probabilities)

    test_auc = roc_auc_score(y_true, y_scores)
    print(f'Test AUC with CoTTA: {test_auc:.4f}')

Training Function

In [421]:
def train_model(model, train_loader, valid_loader, criterion, optimizer, device, epochs,
                early_stopping_patience=10, checkpoint_path='best_model.pth'):
    """Train ``model`` with early stopping on validation loss.

    After every epoch the model is evaluated on ``valid_loader``; whenever the
    mean validation loss improves, the model's state dict is written to
    ``checkpoint_path``. Training stops once the loss has failed to improve
    for ``early_stopping_patience`` consecutive epochs.

    Args:
        model: Module returning one logit per row.
        train_loader: Iterable of ``(features, labels)`` training batches.
        valid_loader: Iterable of ``(features, labels)`` validation batches.
        criterion: Loss taking ``(logits, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device batches are moved to.
        epochs: Maximum number of epochs.
        early_stopping_patience: Epochs without improvement before stopping.
        checkpoint_path: Where the best state dict is saved.
    """
    best_loss = float('inf')
    epochs_no_improve = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x_batch, y_batch in train_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            optimizer.zero_grad()
            # reshape(-1) rather than squeeze(): a final batch of size 1 must
            # stay 1-D to match the label shape expected by the criterion.
            logits = model(x_batch).reshape(-1)
            loss = criterion(logits, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)

        # Validation
        model.eval()
        val_total_loss = 0
        y_true = []
        y_scores = []
        with torch.no_grad():
            for x_val, y_val in valid_loader:
                x_val = x_val.to(device)
                y_val = y_val.to(device)
                logits = model(x_val).reshape(-1)
                loss = criterion(logits, y_val)
                val_total_loss += loss.item()
                y_pred = torch.sigmoid(logits)
                y_true.extend(y_val.cpu().numpy())
                y_scores.extend(y_pred.cpu().numpy())
        val_avg_loss = val_total_loss / len(valid_loader)
        # AUC is undefined when only one class is present; report NaN instead
        # of letting the metric raise and abort training.
        if len(np.unique(y_true)) < 2:
            val_auc = float('nan')
        else:
            val_auc = roc_auc_score(y_true, y_scores)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Val Loss: {val_avg_loss:.4f}, Validation AUC: {val_auc:.4f}')

        # Check for early stopping
        if val_avg_loss < best_loss:
            best_loss = val_avg_loss
            epochs_no_improve = 0
            # Save best model
            torch.save(model.state_dict(), checkpoint_path)
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= early_stopping_patience:
                print(f"Early stopping after {epoch+1} epochs")
                break

    print(f'Best Validation Loss: {best_loss:.4f}')

In [422]:
# Entry point: run main() when this cell/module is executed directly
# (i.e. when __name__ is '__main__'), not when it is imported.
if __name__ == '__main__':
    main()

Epoch 1/100, Loss: 0.3521, Val Loss: 0.0304, Validation AUC: 0.4908
Epoch 2/100, Loss: 0.0204, Val Loss: 0.0107, Validation AUC: 0.4961
Epoch 3/100, Loss: 0.0115, Val Loss: 0.0097, Validation AUC: 0.5011
Epoch 4/100, Loss: 0.0104, Val Loss: 0.0095, Validation AUC: 0.5073
Epoch 5/100, Loss: 0.0098, Val Loss: 0.0091, Validation AUC: 0.5141
Epoch 6/100, Loss: 0.0093, Val Loss: 0.0087, Validation AUC: 0.5204
Epoch 7/100, Loss: 0.0088, Val Loss: 0.0083, Validation AUC: 0.5260
Epoch 8/100, Loss: 0.0083, Val Loss: 0.0080, Validation AUC: 0.5339
Epoch 9/100, Loss: 0.0080, Val Loss: 0.0077, Validation AUC: 0.5414
Epoch 10/100, Loss: 0.0077, Val Loss: 0.0075, Validation AUC: 0.5508
Epoch 11/100, Loss: 0.0075, Val Loss: 0.0073, Validation AUC: 0.5616
Epoch 12/100, Loss: 0.0073, Val Loss: 0.0072, Validation AUC: 0.5720
Epoch 13/100, Loss: 0.0071, Val Loss: 0.0070, Validation AUC: 0.5827
Epoch 14/100, Loss: 0.0069, Val Loss: 0.0069, Validation AUC: 0.5968
Epoch 15/100, Loss: 0.0068, Val Loss: 0.006

KeyboardInterrupt: 