In [1]:
# Mount Google Drive inside the Colab runtime so that the paths under
# /content/drive used by the cells below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import pandas as pd
import gzip
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
import pandas as pd
import gzip
# Gzip-compressed CSV on the mounted Drive that holds the review data.
file_path = '/content/drive/MyDrive/cmpe256_hotel_recommendation_system/data/processed/hotelrec_2013_2017_cleaned.csv.gz'

# Decompress on the fly in text mode and let pandas parse the stream.
with gzip.open(file_path, mode='rt') as csv_stream:
    df = pd.read_csv(csv_stream)

In [6]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
class HotelDataset(Dataset):
    """Tensor-backed dataset pairing integer feature rows with float targets.

    Both inputs are converted to tensors once at construction time, so item
    access is a plain tensor index.
    """

    def __init__(self, X, y):
        """Store ``X.values`` as an int64 tensor and ``y.values`` as float32.

        Args:
            X: Object exposing ``.values`` with the integer feature rows.
            y: Object exposing ``.values`` with the numeric targets.
        """
        feature_array, target_array = X.values, y.values
        self.X = torch.tensor(feature_array, dtype=torch.int64)
        self.y = torch.tensor(target_array, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples, measured on the target tensor."""
        n_samples = len(self.y)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [28]:
class HotelNeuMF(nn.Module):
    """Neural matrix-factorisation rating predictor over (hotel, author) id pairs.

    Two branches share one pair of embedding tables:

    * GMF: element-wise product of the two embeddings, reduced to a scalar
      by a linear layer.
    * MLP: concatenation of the two embeddings passed through a small
      feed-forward stack, also ending in a scalar.

    A final linear layer fuses the two scalars into the predicted rating.

    Submodule names and their ``nn.Sequential`` wrappers are part of the
    ``state_dict`` key layout; keep them unchanged so saved checkpoints
    continue to load.
    """

    def __init__(self, n_hotels, n_authors, embed_size=32):
        """
        Args:
            n_hotels: Number of distinct hotel ids; the table gets one extra row.
            n_authors: Number of distinct author ids; the table gets one extra row.
            embed_size: Width of each embedding vector.
        """
        super().__init__()

        # One spare row per table. NOTE(review): forward() sends out-of-range
        # ids to row 0, not to this spare row, so with 0-based ids the spare
        # row is only hit by an id equal to n_hotels / n_authors.
        self.hotel_emb = nn.Embedding(n_hotels + 1, embed_size)
        self.author_emb = nn.Embedding(n_authors + 1, embed_size)

        # GMF block: linear reduction of the element-wise product to a scalar.
        self.gmf = nn.Sequential(
            nn.Linear(embed_size, 1)
        )

        # MLP block: operates on the concatenated embeddings only.
        self.mlp = nn.Sequential(
            nn.Linear(embed_size * 2, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

        # Fusion layer: combines the GMF scalar and the MLP scalar.
        self.final = nn.Sequential(
            nn.Linear(2, 1)
        )

    @staticmethod
    def _fallback_unseen(ids, table):
        """Replace ids at or beyond the table size with index 0."""
        # zeros_like keeps dtype and device aligned with ``ids`` and avoids
        # building a fresh scalar tensor on every forward pass.
        return torch.where(ids >= table.num_embeddings, torch.zeros_like(ids), ids)

    def forward(self, hotel_ids, author_ids):
        """Predict one rating per (hotel, author) pair.

        Args:
            hotel_ids: 1-D integer tensor of hotel indices, length ``B``.
            author_ids: 1-D integer tensor of author indices, length ``B``.

        Returns:
            Tensor of shape ``(B,)``. The batch axis is kept even when
            ``B == 1`` so callers can always iterate or concatenate the result.
        """
        hotel_ids = self._fallback_unseen(hotel_ids, self.hotel_emb)
        author_ids = self._fallback_unseen(author_ids, self.author_emb)

        hotel_emb = self.hotel_emb(hotel_ids)
        author_emb = self.author_emb(author_ids)

        # GMF branch: element-wise interaction followed by a linear reduction.
        gmf_out = self.gmf(hotel_emb * author_emb)

        # MLP branch: concatenated embeddings through the feed-forward stack.
        mlp_out = self.mlp(torch.cat([hotel_emb, author_emb], dim=1))

        # Fuse the two (B, 1) branch outputs into a single (B, 1) rating.
        rating = self.final(torch.cat([gmf_out, mlp_out], dim=1))

        # Drop only the trailing feature axis. A bare squeeze() would also
        # drop the batch axis for B == 1 and hand back a 0-d tensor.
        return rating.squeeze(-1)

In [36]:
class HotelRecommendationAnalysis:
    """Evaluate a rating model on a held-out split of a hotel-review CSV.

    Construction loads the gzipped CSV, label-encodes the ``author`` and
    ``hotel_id`` columns, takes a stratified 20% test split on ``rating``,
    wraps the test rows in a DataLoader, and builds the model (restoring
    weights from ``model_path`` when one is given).

    Relies on the notebook-level names ``device``, ``HotelDataset`` and
    ``HotelNeuMF`` being defined before an instance is created.
    """

    def __init__(self, file_path, model_type="neumf", model_path=None):
        """
        Args:
            file_path: Path to a gzip-compressed CSV with ``author``,
                ``hotel_id`` and ``rating`` columns.
            model_type: Model family; only ``"neumf"`` (case-insensitive)
                is supported.
            model_path: Optional path to a saved ``state_dict``. When falsy,
                the model keeps its random initialisation.

        Raises:
            ValueError: If ``model_type`` is not supported.
        """
        self.model_type = model_type.lower()
        self.model_path = model_path

        # Load gzipped CSV
        with gzip.open(file_path, 'rt') as f:
            self.data = pd.read_csv(f)

        # Label encoding: map raw author / hotel identifiers to 0-based ints.
        self.label_encoder_author = LabelEncoder()
        self.label_encoder_hotel = LabelEncoder()
        self.data['author_id'] = self.label_encoder_author.fit_transform(self.data['author'])
        self.data['hotel_id'] = self.label_encoder_hotel.fit_transform(self.data['hotel_id'])

        self.n_authors = len(self.label_encoder_author.classes_)
        self.n_hotels = len(self.label_encoder_hotel.classes_)

        # Stratified split on the rating column, with a fixed seed.
        self.sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        train_idx, test_idx = next(self.sss.split(self.data, self.data['rating']))
        self.train_data = self.data.iloc[train_idx]
        self.test_data = self.data.iloc[test_idx]

        # Column order matters: neumf_evaluation reads hotel ids from
        # column 0 and author ids from column 1.
        feature_cols = ['hotel_id', 'author_id']
        X_test = self.test_data[feature_cols]
        y_test = self.test_data['rating']

        self.test_ds = HotelDataset(X_test, y_test)
        self.test_dl = DataLoader(self.test_ds, batch_size=4000)

        # Load model
        self._load_model()

        self.results = {}

    def _load_model(self):
        """Build the model for ``self.model_type`` and optionally restore weights.

        Raises:
            ValueError: If ``self.model_type`` is not ``"neumf"``.
        """
        if self.model_type == "neumf":
            self.model = HotelNeuMF(self.n_hotels, self.n_authors).to(device)
            if self.model_path:
                # Load from the path the caller supplied. map_location lets a
                # checkpoint saved on a GPU be restored on a CPU-only runtime.
                # NOTE: torch.load unpickles the file, so only load
                # checkpoints from a trusted source.
                state_dict = torch.load(self.model_path, map_location=device)
                self.model.load_state_dict(state_dict)
            self.model.eval()
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

    def neumf_evaluation(self):
        """Score the model on the test split and record RMSE and MAE.

        Returns:
            ``self.results``, with a ``'NeuMF'`` entry holding ``'RMSE'``
            and ``'MAE'``.
        """
        self.model.eval()
        pred_chunks, target_chunks = [], []

        with torch.no_grad():
            for X, y in self.test_dl:
                X = X.to(device)

                # Unpack features
                hotel_ids, author_ids = X[:, 0], X[:, 1]

                # reshape(-1) guarantees a 1-D result even when the final
                # batch holds a single row and the model returns a 0-d tensor.
                preds = self.model(hotel_ids, author_ids).reshape(-1)

                pred_chunks.append(preds.cpu().numpy())
                target_chunks.append(y.cpu().numpy())

        # One concatenation per side instead of per-element list growth.
        predictions = np.concatenate(pred_chunks)
        true_values = np.concatenate(target_chunks)

        rmse = np.sqrt(mean_squared_error(true_values, predictions))
        mae = mean_absolute_error(true_values, predictions)
        self.results['NeuMF'] = {'RMSE': rmse, 'MAE': mae}
        print(f"NeuMF Evaluation: RMSE = {rmse:.4f}, MAE = {mae:.4f}")
        return self.results

In [37]:
# Build the evaluator from the dataset path, the model type name, and the
# checkpoint path; the model instance itself is created inside the class.
analysis = HotelRecommendationAnalysis(
    file_path="/content/drive/MyDrive/cmpe256_hotel_recommendation_system/data/processed/hotelrec_2013_2017_cleaned.csv.gz",
    model_type="neumf",
    model_path="/content/drive/MyDrive/cmpe256_hotel_recommendation_system/models/neumf_model.pt"
)

# Run the test-split evaluation; prints RMSE / MAE and returns the results dict.
results = analysis.neumf_evaluation()

NeuMF Evaluation: RMSE = 0.9542, MAE = 0.7135
