## 1. Setup and Imports

We start by importing PyTorch and the supporting libraries (pandas, NumPy, scikit-learn, SciPy). The user–item interaction matrix itself is built from the training data in Section 2.


In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix


## Load and clean all the required files

*`eval` is applied to each entry in the corresponding column, converting string representations of Python literals into their actual Python objects.*

Here, cleaning is quite straightforward: it is just about removing null values and duplicates.

In [1]:
# Project-local helper; its implementation lives in utils and is not shown in this notebook.
from utils import load_and_clean_data

# Unpack the six objects returned by the loader. Later cells copy big_matrix as the
# training interactions and small_matrix as the test interactions.
big_matrix, small_matrix, social_network, item_categories, user_features, item_daily_features = load_and_clean_data()

Loading datasets...


: 

## 2. Prepare User-Item Interaction Matrix

We create a dense matrix `users × items` with binary entries:  
1 if the user had a positive interaction (`watch_ratio ≥ 2`), 0 otherwise.


In [147]:
# Load and binarize data: an interaction is positive when watch_ratio >= 2
train_df = big_matrix.copy()
train_df['interaction'] = (train_df['watch_ratio'] >= 2).astype(int)

# Build user/item index mappings (raw id -> contiguous row/column index)
user_ids = train_df['user_id'].unique().tolist()
item_ids = train_df['video_id'].unique().tolist()
user2idx = {u: i for i, u in enumerate(user_ids)}
item2idx = {v: i for i, v in enumerate(item_ids)}

n_users = len(user_ids)
n_items = len(item_ids)

# Attach the matrix coordinates of every interaction
train_df['u_idx'] = train_df['user_id'].map(user2idx)
train_df['i_idx'] = train_df['video_id'].map(item2idx)

# Build the matrix through a sparse constructor, reusing the 'interaction' column
# computed above instead of recomputing the threshold.
interaction_matrix = csr_matrix(
    (train_df['interaction'],
     (train_df['u_idx'], train_df['i_idx'])),
    shape=(n_users, n_items)
).toarray()

# csr_matrix sums the values of duplicate (row, col) entries, so a user with several
# positive views of the same video would get a count > 1. Cap in place at 1 so the
# matrix is truly binary, as the `== 1` checks in later cells assume.
np.minimum(interaction_matrix, 1, out=interaction_matrix)


## 3. Define the Autoencoder Architecture

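We use a small symmetric architecture:  
**Input → Encoder (two Linear + ReLU layers, with dropout in between) → Decoder (Linear + ReLU with dropout, then Linear) → Output (Sigmoid)**  
This allows the model to compress user preferences into a latent space and then decode predictions over all items.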


In [148]:
class DeepAutoEncoder(nn.Module):
    """Symmetric fully connected autoencoder with a sigmoid output layer.

    The encoder maps ``input_dim`` through each size in ``hidden_dims``
    (Linear + ReLU, with Dropout between consecutive layers); the decoder
    mirrors those sizes back to ``input_dim`` and ends with a Sigmoid, so
    every output lies in [0, 1].

    Args:
        input_dim: Size of the input (and reconstructed output) vector.
        hidden_dims: Sequence of hidden layer sizes, from the widest encoder
            layer to the bottleneck. Any length >= 1 is supported; with two
            sizes the layer layout is Linear-ReLU-Dropout-Linear-ReLU for the
            encoder and Linear-ReLU-Dropout-Linear-Sigmoid for the decoder.
        dropout: Dropout probability applied between hidden layers.

    Raises:
        ValueError: If ``hidden_dims`` is empty.
    """

    def __init__(self, input_dim, hidden_dims=(512, 256), dropout=0.3):
        super().__init__()
        # Copy into a fresh list: accepts lists or tuples and never aliases the caller's object.
        hidden_dims = list(hidden_dims)
        if not hidden_dims:
            raise ValueError("hidden_dims must contain at least one layer size")

        widths = [input_dim] + hidden_dims
        last = len(hidden_dims) - 1

        # Encoder: Linear + ReLU per hidden size, Dropout after every layer but the bottleneck.
        encoder_layers = []
        for depth, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            encoder_layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
            if depth < last:
                encoder_layers.append(nn.Dropout(dropout))
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: mirror of the encoder widths; the final Linear feeds the Sigmoid directly.
        mirrored = widths[::-1]
        decoder_layers = []
        for depth, (fan_in, fan_out) in enumerate(zip(mirrored[:-1], mirrored[1:])):
            decoder_layers.append(nn.Linear(fan_in, fan_out))
            if depth < last:
                decoder_layers += [nn.ReLU(), nn.Dropout(dropout)]
        decoder_layers.append(nn.Sigmoid())
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and decode it back to ``input_dim`` scores."""
        z = self.encoder(x)
        return self.decoder(z)


## 4. Create Dataset and DataLoader

Each sample is a binary vector of a user's interactions over all items.


In [149]:
class InteractionDataset(Dataset):
    """Dataset whose samples are the rows of a user-item matrix.

    The matrix is converted to a float32 tensor and every value is clipped
    into [0, 1] before being stored.
    """

    def __init__(self, user_item_matrix):
        rows = torch.as_tensor(user_item_matrix, dtype=torch.float32)
        # Clip so that every stored entry is a valid BCE target.
        self.data = torch.clamp(rows, min=0, max=1)

    def __len__(self):
        """Number of rows (users) in the matrix."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the stored row at position ``idx``."""
        return self.data[idx]

# Wrap the dense interaction matrix; each dataset sample is one row of it.
dataset = InteractionDataset(interaction_matrix)
# Mini-batches of 128 rows, with the row order shuffled by the loader.
train_loader = DataLoader(dataset, batch_size=128, shuffle=True)


## 5. Train the Autoencoder

We use binary cross-entropy loss, and optimize using Adam.


In [150]:
# Autoencoder over the full item set: one input unit and one output unit per item.
model = DeepAutoEncoder(input_dim=n_items, hidden_dims=[256, 128], dropout=0.3)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
# BCELoss expects probabilities; the decoder ends with a Sigmoid, so outputs are in [0, 1].
criterion = nn.BCELoss()

# Reuse the train_loader built in the Dataset/DataLoader cell
epochs = 20
model.train()  # training mode, so the Dropout layers are active
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        output = model(batch)
        # Reconstruction objective: the input batch is also the target.
        loss = criterion(output, batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # NOTE: total_loss is the SUM of the per-batch losses over the epoch, not their average.
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f}")


Epoch 1/20 - Loss: 39.3665
Epoch 2/20 - Loss: 38.3540
Epoch 3/20 - Loss: 33.8918
Epoch 4/20 - Loss: 24.2534
Epoch 5/20 - Loss: 15.3866
Epoch 6/20 - Loss: 11.0040
Epoch 7/20 - Loss: 8.9717
Epoch 8/20 - Loss: 7.8537
Epoch 9/20 - Loss: 7.1200
Epoch 10/20 - Loss: 6.5308
Epoch 11/20 - Loss: 6.1354
Epoch 12/20 - Loss: 5.6918
Epoch 13/20 - Loss: 5.3319
Epoch 14/20 - Loss: 5.0464
Epoch 15/20 - Loss: 4.6999
Epoch 16/20 - Loss: 4.5002
Epoch 17/20 - Loss: 4.2833
Epoch 18/20 - Loss: 4.0819
Epoch 19/20 - Loss: 3.9691
Epoch 20/20 - Loss: 3.8534


## 6. Generate Recommendations

For a given user, we score all items, mask the items the user already interacted with positively in the training data, and keep the top-K of the remaining ones.


In [151]:
def recommend_top_k(user_idx, K=10):
    """Return the raw ids of the K highest-scored unseen items for one user.

    Reads the notebook-level ``model``, ``interaction_matrix`` and ``item_ids``.

    Args:
        user_idx: Row index of the user in ``interaction_matrix``.
        K: Maximum number of items to return.

    Returns:
        A list of entries of ``item_ids``, best first. It has fewer than K
        entries when the user has fewer than K unseen items, and is empty
        when ``K <= 0``.
    """
    if K <= 0:
        # Without this guard, scores[-0:] would select every item instead of none.
        return []

    user_row = np.asarray(interaction_matrix[user_idx])
    # `> 0` rather than `== 1`: the matrix may hold counts > 1 when the same
    # (user, item) pair occurs several times in the training data.
    seen_mask = user_row > 0

    model.eval()
    with torch.no_grad():
        # Clamp to [0, 1] so inference sees the same input range as InteractionDataset.
        input_vec = torch.as_tensor(user_row, dtype=torch.float32).clamp(0, 1).unsqueeze(0)
        # squeeze(0) drops only the batch axis, so a single-item catalogue stays 1-D.
        scores = model(input_vec).squeeze(0).numpy()

    # Exclude seen items
    scores[seen_mask] = -np.inf

    # Never return a masked item: cap K at the number of unseen candidates.
    k = min(K, int((~seen_mask).sum()))
    if k == 0:
        return []

    # Get top K item indices, highest score first
    top_items = np.argsort(scores)[-k:][::-1]
    return [item_ids[i] for i in top_items]

# Example: recommendations (default K=10) for the user stored at row 0 of interaction_matrix
user_index = 0
print("Recommended items for user:", recommend_top_k(user_index))


Recommended items for user: [314, 8366, 3400, 2894, 4123, 2130, 1305, 5434, 3723, 5525]


## 7. Evaluation: Top-K Metrics

We evaluate our autoencoder-based recommender using standard ranking metrics:
- **Precision@K**: how many of the top-K predictions are relevant?
- **Recall@K**: how much of the user's true positives are recovered?
- **NDCG@K**: do the true positives appear early in the ranking?
- **MAP@K**: average precision across positions where hits occur.

We only evaluate users who have **at least one positive item in the test set**.


In [152]:
def precision_at_k(recs, actual, k):
    """Fraction of the first ``k`` recommendations that are relevant.

    Args:
        recs: Ranked sequence of recommended items, best first.
        actual: Collection of relevant items.
        k: Cut-off rank.

    Returns:
        Number of distinct relevant items among ``recs[:k]`` divided by ``k``,
        or 0.0 when ``k <= 0`` (instead of raising ZeroDivisionError).
    """
    if k <= 0:
        return 0.0
    return len(set(recs[:k]) & set(actual)) / k

def recall_at_k(recs, actual, k):
    """Share of the relevant items found among the first ``k`` recommendations.

    Returns 0 when ``actual`` is empty.
    """
    if not actual:
        return 0
    found = set(recs[:k]) & set(actual)
    return len(found) / len(actual)

def dcg_at_k(recs, actual, k):
    """Discounted cumulative gain of the first ``k`` recommendations.

    Each relevant item at zero-based position ``i`` contributes
    ``1 / log2(i + 2)``; non-relevant items contribute 0.
    """
    score = 0
    for position, item in enumerate(recs[:k]):
        gain = 1 if item in actual else 0
        score += gain / np.log2(position + 2)
    return score

def ndcg_at_k(recs, actual, k):
    """DCG of the first ``k`` recommendations, normalized by the ideal DCG.

    The ideal DCG places ``min(len(actual), k)`` relevant items at the top
    ranks. Returns 0 when that ideal value is zero.
    """
    ideal = 0
    for position in range(min(len(actual), k)):
        ideal += 1 / np.log2(position + 2)
    if not ideal:
        return 0
    return dcg_at_k(recs, actual, k) / ideal

def map_at_k(recs, actual, k):
    """Average precision of the first ``k`` recommendations.

    Precision is accumulated at every rank where a relevant item occurs and
    the sum is divided by ``min(len(actual), k)``.

    Args:
        recs: Ranked sequence of recommended items, best first.
        actual: Collection of relevant items.
        k: Cut-off rank.

    Returns:
        The average precision, or 0 when ``actual`` is empty or ``k <= 0``
        (the latter used to raise ZeroDivisionError).
    """
    if not actual or k <= 0:
        return 0

    hits, sum_prec = 0, 0.0
    for i, r in enumerate(recs[:k]):
        if r in actual:
            hits += 1
            # precision at this rank = hits so far / items inspected so far
            sum_prec += hits / (i + 1)
    return sum_prec / min(len(actual), k)


### Load and prepare the test set

We filter the test interactions (from `small_matrix.csv`) to users/items present in training, and keep only the positives (watch_ratio ≥ 2).


In [153]:
# Keep only the test rows whose user and video were both indexed during training.
known_users = small_matrix['user_id'].isin(user2idx)
known_items = small_matrix['video_id'].isin(item2idx)
test_df = small_matrix[known_users & known_items].copy()

# Same positive-interaction rule (watch_ratio >= 2) and index mappings as the training set.
test_df = test_df.assign(
    interaction=(test_df['watch_ratio'] >= 2).astype(int),
    u_idx=test_df['user_id'].map(user2idx),
    i_idx=test_df['video_id'].map(item2idx),
)

# Ground truth: for each user index, the set of item indices with a positive test interaction.
positives = test_df[test_df['interaction'] == 1]
ground_truth = positives.groupby('u_idx')['i_idx'].apply(set).to_dict()


## Evaluate the Autoencoder on Top-K Recommendation

We compute scores using the model, exclude already seen items, and keep the top-K per user.


In [154]:
K = 10
model.eval()  # inference mode: dropout disabled
metrics = {'prec': [], 'rec': [], 'ndcg': [], 'map': []}

with torch.no_grad():
    for uidx, actual in ground_truth.items():
        user_row = interaction_matrix[uidx]

        # Clamp to [0, 1] so the model sees the same input range it was trained on
        # (InteractionDataset clamps its rows).
        input_vec = torch.FloatTensor(user_row).clamp(0, 1).unsqueeze(0)
        # squeeze(0) removes only the batch axis
        output_vec = model(input_vec).squeeze(0).numpy()

        # Mask items with a positive training interaction. `> 0` rather than `== 1`:
        # the matrix may hold counts > 1 when a (user, item) pair is repeated.
        seen = np.flatnonzero(user_row > 0)
        output_vec[seen] = -1  # sigmoid scores are >= 0, so -1 ranks below every unseen item

        # Indices of the K highest scores, best first
        top_k = np.argsort(output_vec)[-K:][::-1]

        metrics['prec'].append(precision_at_k(top_k, actual, K))
        metrics['rec'].append(recall_at_k(top_k, actual, K))
        metrics['ndcg'].append(ndcg_at_k(top_k, actual, K))
        metrics['map'].append(map_at_k(top_k, actual, K))

# Report each metric averaged over the evaluated users
print("=== Deep Autoencoder @K=10 ===")
print(f"Precision@10: {np.mean(metrics['prec']):.4f}")
print(f"Recall@10   : {np.mean(metrics['rec']):.4f}")
print(f"NDCG@10     : {np.mean(metrics['ndcg']):.4f}")
print(f"MAP@10      : {np.mean(metrics['map']):.4f}")


=== Deep Autoencoder @K=10 ===
Precision@10: 0.5227
Recall@10   : 0.0915
NDCG@10     : 0.5586
MAP@10      : 0.3996
