In [49]:
import os
import pickle
import copy
from copy import deepcopy
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F

# 1. Utility Functions

In [50]:
def weight_init(layers):
    for layer in layers:
        if isinstance(layer, nn.BatchNorm1d):
            layer.weight.data.fill_(1)
            layer.bias.data.zero_()
        elif isinstance(layer, nn.Linear):
            n = layer.in_features
            y = 1.0 / np.sqrt(n)
            layer.weight.data.uniform_(-y, y)
            layer.bias.data.fill_(0)
            # nn.init.kaiming_normal_(layer.weight.data, nonlinearity='relu')

# 2. Data Handling Functions

In [51]:
def parse_line(line):
    parts = line.strip().split('\t')
    label = int(parts[0])
    # Skip parts[1] (the second element)
    feature_ids = []
    for item in parts[2:]:
        feature_id = int(item.split(':')[0])
        feature_ids.append(feature_id)
    return label, feature_ids

def build_feature_mapping(file_paths):
    feature_set = set()
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            for line in f:
                _, feature_ids = parse_line(line)
                feature_set.update(feature_ids)
    feature_list = sorted(feature_set)
    feature_id_map = {feature_id: idx + 1 for idx, feature_id in enumerate(feature_list)}  # Start from 1
    return feature_id_map, len(feature_id_map) + 1  # +1 to account for padding index 0

def compute_max_length(file_paths):
    max_length = 0
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            for line in f:
                _, feature_ids = parse_line(line)
                max_length = max(max_length, len(feature_ids))
    return max_length

# 3. Custom IterableDataset

In [52]:
class YOYIDataset(torch.utils.data.IterableDataset):
    def __init__(self, file_path, max_length, feature_id_map):
        super(YOYIDataset).__init__()
        self.file_path = file_path
        self.max_length = max_length
        self.feature_id_map = feature_id_map

    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            # Single-process data loading
            yield from self._data_generator(self.file_path)
        else:
            # Multi-process data loading
            total_workers = worker_info.num_workers
            worker_id = worker_info.id
            # Split workload among workers
            yield from self._data_generator(self.file_path, worker_id, total_workers)

    def _data_generator(self, file_path, worker_id=0, total_workers=1):
        with open(file_path, 'r') as f:
            for idx, line in enumerate(f):
                if idx % total_workers != worker_id:
                    continue
                label, feature_ids = parse_line(line)
                # Map feature IDs
                mapped_feature_ids = [self.feature_id_map.get(fid, 0) for fid in feature_ids]
                # Pad feature_ids to max_length
                padded_feature_ids = mapped_feature_ids + [0] * (self.max_length - len(mapped_feature_ids))
                x = torch.tensor(padded_feature_ids, dtype=torch.long)
                y = torch.tensor(label, dtype=torch.float32)
                yield x, y

def create_data_loader(file_path, max_length, batch_size, num_workers, feature_id_map):
    dataset = YOYIDataset(file_path, max_length, feature_id_map)
    data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)
    return data_loader

# 4. Model Definitions
### DeepFM Model

In [53]:
class DeepFM(nn.Module):
    def __init__(self, feature_nums, field_nums, latent_dims, output_dim=1):
        super(DeepFM, self).__init__()
        self.field_nums = field_nums
        self.latent_dims = latent_dims
        self.feature_nums = feature_nums

        # Linear part
        self.linear = nn.Embedding(self.feature_nums, output_dim, padding_idx=0)

        # FM embedding
        self.feature_embedding = nn.Embedding(self.feature_nums, self.latent_dims, padding_idx=0)
        nn.init.xavier_uniform_(self.feature_embedding.weight.data)

        # MLP
        deep_input_dims = self.field_nums * self.latent_dims
        layers = []

        neuron_nums = [300, 300, 300]
        for neuron_num in neuron_nums:
            layers.append(nn.Linear(deep_input_dims, neuron_num))
            # layers.append(nn.BatchNorm1d(neuron_num))  # Uncomment if needed
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(p=0.2))
            deep_input_dims = neuron_num

        layers.append(nn.Linear(deep_input_dims, 1))

        # Initialize weights
        weight_init(layers)

        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        # Linear Part
        linear_out = torch.sum(self.linear(x), dim=1)  # Shape: (batch_size, output_dim)

        # FM Part
        embedding_x = self.feature_embedding(x)
        square_of_sum = torch.sum(embedding_x, dim=1) ** 2
        sum_of_square = torch.sum(embedding_x ** 2, dim=1)
        ix = 0.5 * torch.sum(square_of_sum - sum_of_square, dim=1, keepdim=True)  # Shape: (batch_size, 1)

        # Deep Part
        deep_out = self.mlp(embedding_x.view(-1, self.field_nums * self.latent_dims))  # Shape: (batch_size, 1)

        # Output
        out = linear_out + ix + deep_out  # Shape: (batch_size, 1)

        return out  # Return logits

FNN Model

In [54]:
class FNN(nn.Module):
    def __init__(self, feature_nums, field_nums, latent_dims):
        super(FNN, self).__init__()
        self.field_nums = field_nums
        self.latent_dims = latent_dims

        # Embedding layer
        self.feature_embedding = nn.Embedding(feature_nums, latent_dims, padding_idx=0)
        nn.init.xavier_uniform_(self.feature_embedding.weight.data)

        # MLP
        deep_input_dims = field_nums * latent_dims
        layers = []

        neuron_nums = [300, 300, 300]
        for neuron_num in neuron_nums:
            layers.append(nn.Linear(deep_input_dims, neuron_num))
            # layers.append(nn.BatchNorm1d(neuron_num))  # Uncomment if needed
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            deep_input_dims = neuron_num

        layers.append(nn.Linear(deep_input_dims, 1))

        # Initialize weights
        weight_init(layers)

        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        embedding_x = self.feature_embedding(x)
        out = self.mlp(embedding_x.view(-1, self.field_nums * self.latent_dims))
        return out  # Return logits

DCN Model

In [55]:
class DCN(nn.Module):
    def __init__(self, feature_nums, field_nums, latent_dims, output_dim=1):
        super(DCN, self).__init__()
        self.field_nums = field_nums
        self.latent_dims = latent_dims

        # Embedding layer
        self.feature_embedding = nn.Embedding(feature_nums, latent_dims, padding_idx=0)
        nn.init.xavier_uniform_(self.feature_embedding.weight.data)

        # Deep Network
        deep_input_dims = field_nums * latent_dims
        deep_net_layers = []
        neural_nums = [300, 300, 300]
        self.num_neural_layers = 5  # Number of layers in the cross network

        for neural_num in neural_nums:
            deep_net_layers.append(nn.Linear(deep_input_dims, neural_num))
            # deep_net_layers.append(nn.BatchNorm1d(neural_num))  # Uncomment if needed
            deep_net_layers.append(nn.ReLU())
            deep_net_layers.append(nn.Dropout(0.2))
            deep_input_dims = neural_num

        # Initialize weights
        weight_init(deep_net_layers)

        self.DN = nn.Sequential(*deep_net_layers)

        # Cross Network
        cross_input_dims = field_nums * latent_dims
        self.cross_net_w = nn.ModuleList([
            nn.Linear(cross_input_dims, cross_input_dims) for _ in range(self.num_neural_layers)
        ])

        # Initialize weights for cross network
        weight_init(self.cross_net_w)

        self.cross_net_b = nn.ParameterList([
            nn.Parameter(torch.zeros(cross_input_dims)) for _ in range(self.num_neural_layers)
        ])

        # Final Linear Layer
        self.linear = nn.Linear(deep_input_dims + cross_input_dims, output_dim)
        # nn.init.xavier_normal_(self.linear.weight)

    def forward(self, x):
        embedding_x = self.feature_embedding(x).view(-1, self.field_nums * self.latent_dims)

        # Cross Network
        cn_x0 = embedding_x
        cn_x = embedding_x
        for i in range(self.num_neural_layers):
            cn_x_w = self.cross_net_w[i](cn_x)
            cn_x = cn_x0 * cn_x_w + self.cross_net_b[i] + cn_x

        # Deep Network
        dn_x = self.DN(embedding_x)

        # Concatenate
        x_stack = torch.cat([cn_x, dn_x], dim=1)

        # Final output
        out = self.linear(x_stack)

        return out  # Return logits

AFM Model

In [56]:
class AFM(nn.Module):
    def __init__(self, feature_nums, field_nums, latent_dims, output_dim=1):
        super(AFM, self).__init__()
        self.field_nums = field_nums
        self.latent_dims = latent_dims

        # Embedding layer
        self.feature_embedding = nn.Embedding(feature_nums, latent_dims, padding_idx=0)
        nn.init.xavier_uniform_(self.feature_embedding.weight.data)

        # Prepare index pairs for interactions
        self.row, self.col = [], []
        for i in range(self.field_nums - 1):
            for j in range(i + 1, self.field_nums):
                self.row.append(i)
                self.col.append(j)

        attention_factor = self.latent_dims

        # Attention network
        self.attention_net = nn.Linear(self.latent_dims, attention_factor)
        n = self.attention_net.in_features
        y = 1.0 / np.sqrt(n)
        self.attention_net.weight.data.uniform_(-y, y)
        self.attention_net.bias.data.fill_(0)

        self.attention_softmax = nn.Linear(attention_factor, 1)

        # Output layers
        self.fc = nn.Linear(self.latent_dims, output_dim)

        # Linear part
        self.linear = nn.Embedding(feature_nums, output_dim, padding_idx=0)
        self.bias = nn.Parameter(torch.zeros((output_dim,)))

    def forward(self, x):
        embedding_x = self.feature_embedding(x)

        # Pairwise interactions
        row_emb = embedding_x[:, self.row]  # Shape: (batch_size, num_pairs, latent_dims)
        col_emb = embedding_x[:, self.col]  # Shape: (batch_size, num_pairs, latent_dims)
        inner_product = row_emb * col_emb  # Element-wise multiplication

        # Attention mechanism
        attn_scores = F.relu(self.attention_net(inner_product))  # Shape: (batch_size, num_pairs, attention_factor)
        attn_scores = F.softmax(self.attention_softmax(attn_scores), dim=1)  # Shape: (batch_size, num_pairs, 1)
        attn_scores = F.dropout(attn_scores, p=0.2)

        # Weighted sum of interactions
        attn_output = torch.sum(attn_scores * inner_product, dim=1)  # Shape: (batch_size, latent_dims)
        attn_output = F.dropout(attn_output, p=0.2)

        # Output
        linear_part = self.bias + torch.sum(self.linear(x), dim=1)  # Shape: (batch_size, output_dim)
        out = linear_part + self.fc(attn_output)  # Shape: (batch_size, output_dim)

        return out  # Return logits

# 5. Training Function

In [57]:
def train_model(model, train_loader, valid_loader, criterion, optimizer, device, epochs, early_stopping_patience=2):
    best_loss = float('inf')
    epochs_no_improve = 0
    early_stop = False

    for epoch in range(epochs):
        if early_stop:
            print("Early stopping")
            break

        model.train()
        total_loss = 0
        num_batches = 0  # Keep track of the number of batches
        for x_batch, y_batch in train_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            optimizer.zero_grad()
            logits = model(x_batch).squeeze()
            loss = criterion(logits, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        avg_loss = total_loss / num_batches if num_batches > 0 else 0
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

        # Validation
        if valid_loader is not None:
            model.eval()
            val_total_loss = 0
            num_val_batches = 0  # Keep track of the number of validation batches
            y_true = []
            y_scores = []
            with torch.no_grad():
                for x_val, y_val in valid_loader:
                    x_val = x_val.to(device)
                    y_val = y_val.to(device)
                    logits = model(x_val).squeeze()
                    loss = criterion(logits, y_val)
                    val_total_loss += loss.item()
                    num_val_batches += 1
                    y_pred = torch.sigmoid(logits)
                    y_true.extend(y_val.cpu().numpy())
                    y_scores.extend(y_pred.cpu().numpy())
            val_avg_loss = val_total_loss / num_val_batches if num_val_batches > 0 else 0
            val_auc = roc_auc_score(y_true, y_scores)
            print(f'Val Loss: {val_avg_loss:.4f}, Validation AUC: {val_auc:.4f}')

            # Check for early stopping
            if val_avg_loss < best_loss:
                best_loss = val_avg_loss
                epochs_no_improve = 0
                # Save best model
                torch.save(model.state_dict(), 'base_best_model_AFM.pth')
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= early_stopping_patience:
                    print(f"Early stopping after {epoch+1} epochs")
                    early_stop = True
        else:
            # Save model every epoch if no validation set
            torch.save(model.state_dict(), 'base_best_model_AFM.pth')

    if valid_loader is not None:
        print(f'Best Validation Loss: {best_loss:.4f}')

# 6. Testing Function

In [58]:
def test_model(model, test_loader, device):
    model.eval()
    y_true = []
    y_scores = []

    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(x_batch).squeeze()
            probabilities = torch.sigmoid(logits).cpu().numpy().flatten()

            y_true.extend(y_batch.cpu().numpy())
            y_scores.extend(probabilities)

    test_auc = roc_auc_score(y_true, y_scores)
    print(f'Test AUC: {test_auc:.4f}')
    return test_auc


# 7. Main Function

In [59]:
def main():
    data_path = '/home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/make-ipinyou-data/cikm2016-yoyi-dataset/'  # Adjust this path as needed

    train_file = os.path.join(data_path, 'train_set.txt')
    test_file = os.path.join(data_path, 'test_set.txt')

    # Paths to save feature mapping and max_length
    feature_map_path = os.path.join(data_path, 'feature_id_map.pkl')
    max_length_path = os.path.join(data_path, 'max_length.pkl')

    # Check if feature mapping and max_length files exist
    if os.path.exists(feature_map_path) and os.path.exists(max_length_path):
        print("Loading feature mapping and max_length from disk...")
        with open(feature_map_path, 'rb') as f:
            feature_id_map, feature_nums = pickle.load(f)
        with open(max_length_path, 'rb') as f:
            max_length = pickle.load(f)
    else:
        # Build feature mapping and get feature_nums
        print("Building feature mapping...")
        feature_id_map, feature_nums = build_feature_mapping([train_file, test_file])
        print(f"Total number of features: {feature_nums}")

        # Compute max_length
        print("Computing maximum feature length...")
        max_length = compute_max_length([train_file, test_file])
        print(f"Maximum feature length: {max_length}")

        # Save feature mapping and max_length to disk
        with open(feature_map_path, 'wb') as f:
            pickle.dump((feature_id_map, feature_nums), f)
        with open(max_length_path, 'wb') as f:
            pickle.dump(max_length, f)

    field_nums = max_length  # Since we've padded features to max_length

    batch_size = 2048
    num_workers = 12  # Adjust based on your system

    # Create data loaders
    print("Creating data loaders...")
    train_loader = create_data_loader(train_file, max_length, batch_size, num_workers, feature_id_map)
    test_loader = create_data_loader(test_file, max_length, batch_size, num_workers, feature_id_map)

    # Since we cannot split the data for validation easily, we'll set val_loader to None
    val_loader = None  # Set to None if not using validation

    # Model parameters
    model_name = 'AFM'  # Change this to 'DeepFM', 'FNN', 'DCN', or 'AFM' as needed
    latent_dims = 10
    dropout = 0.2
    num_layers = 5  # For DCN
    attn_size = 32  # For AFM
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    epochs = 1  # Adjust as needed

    # Get model
    print(f"Initializing model: {model_name}")
    if model_name == 'DeepFM':
        model = DeepFM(feature_nums, field_nums, latent_dims).to(device)
    elif model_name == 'FNN':
        model = FNN(feature_nums, field_nums, latent_dims).to(device)
    elif model_name == 'DCN':
        model = DCN(feature_nums, field_nums, latent_dims).to(device)
    elif model_name == 'AFM':
        model = AFM(feature_nums, field_nums, latent_dims).to(device)
    else:
        raise ValueError('Unknown model name')

    # Loss and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    # Training
    print("Starting training...")
    train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs, early_stopping_patience=2)

    # Load best model
    #model.load_state_dict(torch.load('base_best_model.pth'))

    # Testing
    print("Starting testing...")
    test_auc = test_model(model, test_loader, device)

In [60]:
if __name__ == '__main__':
    main()

Loading feature mapping and max_length from disk...
Creating data loaders...
Initializing model: AFM
Starting training...
Epoch 1/1, Loss: 0.0305
Starting testing...
Test AUC: 0.8409
