<a href="https://colab.research.google.com/github/RFA863/VAE_models_for_reconstruct_dataset/blob/main/VAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 1. Load and preprocess data
interactions = pd.read_excel(
    '/content/drive/My Drive/dataset/Tourism_Dataset/Transaction.xlsx',
    engine='openpyxl'
)
interactions.rename(columns={'UserId': 'user_id', 'AttractionId': 'item_id'}, inplace=True)
users = interactions['user_id'].unique()
items = interactions['item_id'].unique()
user2idx = {u: i for i, u in enumerate(users)}
item2idx = {it: i for i, it in enumerate(items)}
interactions['u_idx'] = interactions['user_id'].map(user2idx)
interactions['i_idx'] = interactions['item_id'].map(item2idx)
n_users = len(users)
n_items = len(items)
ratings = interactions['Rating'].astype(np.float32).values
row = interactions['u_idx'].values
col = interactions['i_idx'].values
sparse_ui = csr_matrix((ratings, (row, col)), shape=(n_users, n_items))

In [5]:
# 2. Dataset & DataLoader
def to_dense(idx, sparse):
    vec = sparse[idx].toarray().squeeze().astype(np.float32)
    max_rating = sparse.data.max() if sparse.data.size>0 else 1.0
    return vec / max_rating

class InteractionDataset(Dataset):
    def __init__(self, sparse_matrix):
        self.sparse = sparse_matrix.tocsr()
    def __len__(self):
        return self.sparse.shape[0]
    def __getitem__(self, idx):
        return to_dense(idx, self.sparse)

dataset = InteractionDataset(sparse_ui)
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_ds, val_ds, test_ds = random_split(dataset, [train_size, val_size, test_size])
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=128)
test_loader = DataLoader(test_ds, batch_size=128)


In [6]:
# 3. VAE definition
class VAE(nn.Module):
    def __init__(self, n_items, hidden_dim=600, latent_dim=200):
        super().__init__()
        self.enc_fc1 = nn.Linear(n_items, hidden_dim)
        self.enc_mu = nn.Linear(hidden_dim, latent_dim)
        self.enc_logvar = nn.Linear(hidden_dim, latent_dim)
        self.dec_fc1 = nn.Linear(latent_dim, hidden_dim)
        self.dec_out = nn.Linear(hidden_dim, n_items)
    def encode(self, x):
        h = F.relu(self.enc_fc1(x))
        return self.enc_mu(h), self.enc_logvar(h)
    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std
    def decode(self, z):
        h = F.relu(self.dec_fc1(z))
        return torch.sigmoid(self.dec_out(h))
    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

In [7]:
# 4. Loss: MSE + KL
def loss_fn(recon_x, x, mu, logvar, beta=1.0):
    mse = F.mse_loss(recon_x, x, reduction='sum')
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return mse + beta * kld

In [8]:
# 5. Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VAE(n_items).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
n_epochs = 30
for epoch in range(1, n_epochs + 1):
    model.train()
    train_loss = 0.0
    beta = min(1.0, epoch / 10)
    for batch in train_loader:
        x = batch.to(device)
        optimizer.zero_grad()
        recon, mu, logvar = model(x)
        loss = loss_fn(recon, x, mu, logvar, beta)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    avg_train = train_loss / train_size
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            x = batch.to(device)
            recon, mu, logvar = model(x)
            val_loss += loss_fn(recon, x, mu, logvar, beta).item()
    avg_val = val_loss / val_size
    print(f"Epoch {epoch:02d}: Train Loss={avg_train:.4f}, Val Loss={avg_val:.4f}")

Epoch 01: Train Loss=0.2833, Val Loss=0.0066
Epoch 02: Train Loss=0.0040, Val Loss=0.0027
Epoch 03: Train Loss=0.0023, Val Loss=0.0020
Epoch 04: Train Loss=0.0019, Val Loss=0.0017
Epoch 05: Train Loss=0.0017, Val Loss=0.0016
Epoch 06: Train Loss=0.0016, Val Loss=0.0015
Epoch 07: Train Loss=0.0016, Val Loss=0.0015
Epoch 08: Train Loss=0.0016, Val Loss=0.0015
Epoch 09: Train Loss=0.0016, Val Loss=0.0014
Epoch 10: Train Loss=0.0016, Val Loss=0.0014
Epoch 11: Train Loss=0.0015, Val Loss=0.0014
Epoch 12: Train Loss=0.0015, Val Loss=0.0015
Epoch 13: Train Loss=0.0015, Val Loss=0.0014
Epoch 14: Train Loss=0.0015, Val Loss=0.0015
Epoch 15: Train Loss=0.0015, Val Loss=0.0014
Epoch 16: Train Loss=0.0015, Val Loss=0.0014
Epoch 17: Train Loss=0.0015, Val Loss=0.0014
Epoch 18: Train Loss=0.0015, Val Loss=0.0014
Epoch 19: Train Loss=0.0015, Val Loss=0.0014
Epoch 20: Train Loss=0.0015, Val Loss=0.0014
Epoch 21: Train Loss=0.0015, Val Loss=0.0014
Epoch 22: Train Loss=0.0015, Val Loss=0.0014
Epoch 23: 

In [9]:
# 6. Save model and user embeddings
torch.save(model.state_dict(), 'vae_tourism_cf.pth')
model.eval()
embeddings = []
for batch in DataLoader(dataset, batch_size=128):
    batch = batch.to(device)
    mu, _ = model.encode(batch)
    embeddings.append(mu.detach().cpu().numpy())
user_latent = np.vstack(embeddings)
np.save('user_latent.npy', user_latent)

In [10]:
# 7. Reconstruct full matrix and export to Excel
recon_data = []
model.eval()
with torch.no_grad():
    for batch in DataLoader(dataset, batch_size=128):
        recon_batch, _, _ = model(batch.to(device))
        recon_data.append(recon_batch.cpu().numpy())
dense_ui_hat = np.vstack(recon_data)
max_rating = interactions['Rating'].max()
dense_ui_hat = dense_ui_hat * max_rating
df_hat = pd.DataFrame(dense_ui_hat, index=users, columns=items)
df_hat.to_excel('reconstructed_transactions.xlsx')
print("Reconstructed matrix saved to 'reconstructed_transactions.xlsx'.")


Reconstructed matrix saved to 'reconstructed_transactions.xlsx'.


In [11]:
# 8. Evaluation on test set
all_preds = []
all_truths = []
with torch.no_grad():
    for batch in test_loader:
        x = batch.to(device)
        recon, _, _ = model(x)
        recon = recon.cpu().numpy()
        x_true = x.cpu().numpy()
        all_preds.append(recon)
        all_truths.append(x_true)

y_true = np.vstack(all_truths) * max_rating
y_pred = np.vstack(all_preds) * max_rating
rmse = np.sqrt(mean_squared_error(y_true.flatten(), y_pred.flatten()))
mae = mean_absolute_error(y_true.flatten(), y_pred.flatten())
print(f"\nTest RMSE: {rmse:.4f}, MAE: {mae:.4f}")


Test RMSE: 0.0363, MAE: 0.0116
