In [19]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset

# Device configuration
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [20]:
import torch.nn.functional as F

# RMSE Loss
def rmse_loss(predictions, targets):
    return torch.sqrt(F.mse_loss(predictions, targets))

In [21]:
# Data preparation
def load_and_process_data():
    # Load the Ratings data
    data = pd.read_csv('ml-100k/u.data', sep="\t", header=None)
    data.columns = ['user id', 'movie id', 'rating', 'timestamp']
    
    # Load the User data
    users = pd.read_csv('ml-100k/u.user', sep="|", encoding='latin-1', header=None)
    users.columns = ['user id', 'age', 'gender', 'occupation', 'zip code']
    
    # Load Movie data
    items = pd.read_csv('ml-100k/u.item', sep="|", encoding='latin-1', header=None)
    items.columns = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 
                     'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 
                     'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
                     'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

    # Merge datasets
    dataset = data.merge(users, on='user id', how='left').merge(items, on='movie id', how='left')

    # Encode categorical features
    label_encoder = LabelEncoder()
    dataset['gender'] = (dataset['gender'] == 'M').astype(int)
    dataset['occupation'] = label_encoder.fit_transform(dataset['occupation'])

    # Convert age into intervals (bins) and encode as integers
    bins = [0, 18, 25, 35, 45, 50, 60, 100]
    labels = [0, 1, 2, 3, 4, 5, 6]
    dataset['age'] = pd.cut(dataset['age'], bins=bins, labels=labels).astype(int)

    # Normalize ratings
    dataset['rating'] = dataset['rating'] / dataset['rating'].max()

    # Drop irrelevant columns
    dataset.drop(['zip code', 'movie title', 'release date', 'IMDb URL', 'timestamp'], axis=1, inplace=True)

    # Split into train, validation, and test sets
    train_data, temp_data = train_test_split(dataset, test_size=0.3, random_state=42)
    valid_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

    return train_data, valid_data, test_data

In [22]:
class MovieLensDataset(Dataset):
    def __init__(self, data, field_dims, device):
        self.data = data
        self.field_dims = field_dims
        self.numerical_cols = data.columns.difference(['rating'])
        self.target_col = 'rating'
        self.device = device

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        features = torch.tensor(row[self.numerical_cols].values, dtype=torch.long, device=self.device)  # Long type
        target = torch.tensor(row[self.target_col], dtype=torch.float32, device=self.device)  # Float type
        return features, target

In [23]:
class DeepFM(nn.Module):
    def __init__(self, field_dims, num_factors, mlp_dims, drop_rate=0.1):
        """
        Args:
            field_dims (list): List of integers where each entry is the number of unique values for a feature field.
            num_factors (int): Size of the embedding vector for each field.
            mlp_dims (list): List of integers defining the number of units in each MLP layer.
            drop_rate (float): Dropout rate for MLP layers.
        """
        super(DeepFM, self).__init__()
        self.num_fields = len(field_dims)
        self.num_factors = num_factors

        # Embedding layers for each field
        self.embeddings = nn.ModuleList([
            nn.Embedding(field_dim, num_factors) for field_dim in field_dims
        ])

        # Linear part (first-order terms)
        self.linear_layers = nn.ModuleList([
            nn.Embedding(field_dim, 1) for field_dim in field_dims
        ])

        # Multi-Layer Perceptron (MLP) for deep part
        input_dim = self.num_fields * num_factors  # Corrected input dimension
        mlp_layers = []
        for dim in mlp_dims:
            mlp_layers.append(nn.Linear(input_dim, dim))
            mlp_layers.append(nn.ReLU())
            mlp_layers.append(nn.Dropout(drop_rate))
            input_dim = dim
        self.mlp = nn.Sequential(*mlp_layers)

        # Final layer for deep part output
        self.mlp_output = nn.Linear(input_dim, 1)

    def forward(self, x):
        """
        Forward pass of DeepFM.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, num_fields).

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, 1).
        """
        # Embedding lookup
        embed_x = torch.cat([
            embed(x[:, i]).unsqueeze(1) for i, embed in enumerate(self.embeddings)
        ], dim=1)  # Shape: (batch_size, num_fields, num_factors)

        # FM: Second-order interactions
        square_of_sum = torch.sum(embed_x, dim=1) ** 2
        sum_of_square = torch.sum(embed_x ** 2, dim=1)
        fm_second_order = 0.5 * torch.sum(square_of_sum - sum_of_square, dim=1, keepdim=True)

        # Linear part (first-order terms)
        linear_part = torch.cat([
            linear(x[:, i]).unsqueeze(1) for i, linear in enumerate(self.linear_layers)
        ], dim=1).sum(1)  # Shape: (batch_size, 1)

        # Deep part (MLP)
        deep_input = embed_x.view(embed_x.size(0), -1)  # Flatten embeddings to (batch_size, num_fields * num_factors)
        deep_output = self.mlp(deep_input)
        deep_output = self.mlp_output(deep_output)

        # Final output
        output = linear_part + fm_second_order + deep_output
        return torch.sigmoid(output)

In [24]:
def train_deepfm(model, train_loader, valid_loader, test_loader, num_epochs=10, lr=1e-3):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    best_valid_loss = float('inf')
    best_model_state = None

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        for features, target in train_loader:
            optimizer.zero_grad()
            predictions = model(features)
            loss = rmse_loss(predictions, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        train_loss /= len(train_loader)

        # Validation
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for features, target in valid_loader:
                predictions = model(features)
                loss = rmse_loss(predictions, target)
                valid_loss += loss.item()
        
        valid_loss /= len(valid_loader)

        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}")

        # Save the best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_model_state = model.state_dict()

    print(f"Training completed. Best Validation Loss: {best_valid_loss:.4f}")

    # Load the best model state
    model.load_state_dict(best_model_state)

    # Test the model
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for features, target in test_loader:
            predictions = model(features)
            loss = rmse_loss(predictions, target)
            test_loss += loss.item()
    
    test_loss /= len(test_loader)
    print(f"Test Loss: {test_loss:.4f}")

In [25]:
train_data, valid_data, test_data = load_and_process_data()

In [26]:
train_data.iloc[1]

user id               622.0
movie id              206.0
rating                  0.2
age                     1.0
gender                  1.0
occupation             14.0
video release date      NaN
unknown                 0.0
Action                  0.0
Adventure               1.0
Animation               1.0
Children's              0.0
Comedy                  0.0
Crime                   0.0
Documentary             0.0
Drama                   0.0
Fantasy                 0.0
Film-Noir               0.0
Horror                  0.0
Musical                 0.0
Mystery                 0.0
Romance                 0.0
Sci-Fi                  1.0
Thriller                1.0
War                     0.0
Western                 0.0
Name: 60406, dtype: float64

In [27]:
# Compute field_dims
field_dims = [
    train_data['user id'].nunique(),
    train_data['movie id'].nunique(),
    train_data['gender'].nunique(),
    train_data['occupation'].nunique(),
    train_data['age'].nunique(),
    train_data['Action'].nunique(),
    train_data['Adventure'].nunique(),
    train_data['Animation'].nunique(),
    train_data['Children\'s'].nunique(),
    train_data['Comedy'].nunique(),
    train_data['Crime'].nunique(),
    train_data['Documentary'].nunique(),
    train_data['Drama'].nunique(),
    train_data['Fantasy'].nunique(),
    train_data['Film-Noir'].nunique(),
    train_data['Horror'].nunique(),
    train_data['Musical'].nunique(),
    train_data['Mystery'].nunique(),
    train_data['Romance'].nunique(),
    train_data['Sci-Fi'].nunique(),
    train_data['Thriller'].nunique(),
    train_data['War'].nunique(),
    train_data['Western'].nunique(),
    train_data['unknown'].nunique()
]

# Initialize datasets and loaders with device handling
train_dataset = MovieLensDataset(train_data, field_dims, device=device)
valid_dataset = MovieLensDataset(valid_data, field_dims, device=device)
test_dataset = MovieLensDataset(test_data, field_dims, device=device)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [29]:
field_dims

[943, 1631, 2, 21, 7, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

In [28]:
# Initialize and train DeepFM model
deepfm_model = DeepFM(
    field_dims=field_dims,
    num_factors=32,  # Embedding size
    mlp_dims=[128, 64, 32],  # MLP layers
    drop_rate=0.1  # Dropout rate
).to(device)

train_deepfm(
    deepfm_model,
    train_loader,
    valid_loader,
    test_loader,
    num_epochs=10,
    lr=5e-4
)

IndexError: index out of range in self