In [2]:
import pandas as pd
import numpy as np
import ast
import json
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

class CoffeeRecommender:
    """Hybrid recipe recommender optimized for NDCG@5.

    Only recipes whose required equipment and products are all owned by the
    user are ranked. Two scoring strategies are used:

    - Users absent from the SVD rating matrix ("cold"): taste-based content
      score plus a small popularity boost.
    - Users present in the rating matrix ("warm"): weighted blend of a
      bias + truncated-SVD rating estimate and the content score.

    The content score compares recipe taste vectors with the user's taste
    vector: the mean taste of recipes the user rated >= 4.0 when available,
    otherwise the user's stated ``taste_pref_*`` columns.
    """

    # Class-level defaults so the scoring code has values even on instances
    # that were not built through __init__.
    alpha = 0.6
    n_components = 20

    def __init__(self, recipes_path, users_path, train_path, cold_users_path,
                 alpha=0.6, n_components=20):
        """Load the data files, derive features and fit the hybrid model.

        Args:
            recipes_path: CSV of recipes.
            users_path: CSV of users.
            train_path: CSV of training interactions.
            cold_users_path: JSON file listing cold user ids; loading is
                best-effort and falls back to an empty set.
            alpha: Weight of the collaborative (SVD) score for warm users;
                the content score gets ``1 - alpha``.
            n_components: Number of latent factors for ``TruncatedSVD``.
        """
        self.alpha = alpha
        self.n_components = n_components

        # Load Data
        self.recipes = pd.read_csv(recipes_path)
        self.users = pd.read_csv(users_path)
        self.interactions = pd.read_csv(train_path)

        # Load Cold Users
        self.cold_users = self._load_cold_users(cold_users_path)

        self._preprocess_features()
        self._train_hybrid_model()

    @staticmethod
    def _load_cold_users(path):
        """Return the cold-user ids stored in ``path``, or an empty set on failure."""
        try:
            with open(path, 'r') as f:
                data = json.load(f)
            # A top-level list is used as-is; anything else is indexed with [0].
            return set(data if isinstance(data, list) else data[0])
        except (OSError, ValueError, KeyError, IndexError, TypeError):
            # Missing/unreadable file, invalid JSON or unexpected layout:
            # deliberately best-effort, the model works without this set.
            return set()

    @staticmethod
    def _parse_set(raw, is_dict=False):
        """Parse a Python-literal string into a set (dict keys when ``is_dict``).

        Returns an empty set for anything that cannot be parsed (e.g. NaN).
        """
        try:
            val = ast.literal_eval(raw)
            return set(val.keys()) if is_dict else set(val)
        except (ValueError, SyntaxError, TypeError, AttributeError):
            return set()

    def _preprocess_features(self):
        """Derive requirement sets, the taste matrix, lookups and global stats."""
        # 1. Parse Equipment and Products safely
        self.recipes['required_equipment_set'] = self.recipes['required_equipment'].apply(
            self._parse_set)
        self.recipes['required_products_set'] = self.recipes['required_products'].apply(
            lambda raw: self._parse_set(raw, True))
        self.users['owned_equipment_set'] = self.users['owned_equipment'].apply(
            self._parse_set)
        self.users['available_products_set'] = self.users['available_products'].apply(
            self._parse_set)

        # 2. Taste Features Matrix
        self.taste_features = ['bitterness', 'sweetness', 'acidity', 'body']
        self.recipe_taste_matrix = self.recipes[['taste_' + f for f in self.taste_features]].values

        # 3. Indexing
        recipe_ids = self.recipes['recipe_id'].tolist()
        # recipe_id -> row position in recipe_taste_matrix (first occurrence
        # wins), replacing a full DataFrame scan per lookup.
        self._recipe_pos = {}
        for pos, rid in enumerate(recipe_ids):
            self._recipe_pos.setdefault(rid, pos)
        # Flat (recipe_id, equipment, products) triples so the feasibility
        # filter does not need DataFrame.iterrows on every recommend() call.
        self._recipe_requirements = list(zip(
            recipe_ids,
            self.recipes['required_equipment_set'],
            self.recipes['required_products_set'],
        ))
        self.users.set_index('user_id', inplace=True, drop=False)

        # 4. Global Stats
        self.global_mean = self.interactions['rating'].mean()
        self.recipe_popularity = self.interactions.groupby('recipe_id')['rating'].mean().to_dict()

    def _train_hybrid_model(self):
        """Fit user/item biases, the residual SVD and history taste profiles."""
        # --- A. Collaborative Filtering (SVD) ---
        # Use only explicit ratings for cleaner signal. Copy so that adding
        # the residual column never writes into a view of self.interactions.
        train_data = self.interactions.dropna(subset=['rating']).copy()

        # Calculate Biases
        user_means = train_data.groupby('user_id')['rating'].mean()
        item_means = train_data.groupby('recipe_id')['rating'].mean()

        self.user_bias = (user_means - self.global_mean).to_dict()
        self.item_bias = (item_means - self.global_mean).to_dict()

        # Calculate Residuals
        train_data['residual'] = (train_data['rating'] - self.global_mean -
                                  train_data['user_id'].map(self.user_bias) -
                                  train_data['recipe_id'].map(self.item_bias))

        # Factorization (unobserved user/recipe pairs are treated as residual 0)
        self.rating_matrix = train_data.pivot_table(
            index='user_id', columns='recipe_id', values='residual'
        ).fillna(0)

        self.svd = TruncatedSVD(n_components=self.n_components, random_state=42)
        self.user_factors = self.svd.fit_transform(self.rating_matrix)
        self.item_factors = self.svd.components_

        # Mappings
        self.user_id_map = {uid: i for i, uid in enumerate(self.rating_matrix.index)}
        self.recipe_ids_cf = self.rating_matrix.columns.tolist()
        self.cf_recipe_indices = {rid: i for i, rid in enumerate(self.recipe_ids_cf)}

        # --- B. History-Based Content Profiling ---
        # Instead of trusting sign-up preferences, build profile from what they actually liked (>= 4.0)
        good_interactions = train_data[train_data['rating'] >= 4.0]
        merged = good_interactions.merge(self.recipes, on='recipe_id')
        self.user_history_profiles = merged.groupby('user_id')[
            ['taste_' + f for f in self.taste_features]
        ].mean()

    def _get_content_scores(self, user_id, feasible_ids):
        """Return ``{recipe_id: score}`` from taste similarity for ``feasible_ids``.

        Scores map cosine similarity in [-1, 1] linearly onto [1, 5]. An empty
        dict is returned when no taste vector can be found for the user or
        when ``feasible_ids`` is empty.
        """
        # 1. Determine User Taste Vector
        if user_id in self.user_history_profiles.index:
            # Learned profile from highly rated recipes
            u_vec = self.user_history_profiles.loc[user_id].values.reshape(1, -1)
        else:
            # Fall back to stated preference
            pref_cols = ['taste_pref_' + f for f in self.taste_features]
            try:
                u_vec = self.users.loc[user_id, pref_cols].values.reshape(1, -1)
            except KeyError:
                return {}  # User (or preference columns) not found

        if not feasible_ids:
            return {}

        # 2. Get Recipe Vectors for Feasible Items
        positions = [self._recipe_pos[rid] for rid in feasible_ids]
        r_vecs = self.recipe_taste_matrix[positions]

        # 3. Calculate Similarity & Scale to Rating (1-5)
        sims = cosine_similarity(u_vec, r_vecs)[0]
        # Formula: 1 + 2 * (sim + 1) -> -1=>1, 0=>3, 1=>5
        scores = 1 + 2 * (sims + 1)

        return dict(zip(feasible_ids, scores))

    def recommend(self, user_id, n_recommendations=5):
        """Return up to ``n_recommendations`` feasible recipe ids, best first.

        Returns an empty list for an unknown user or when no recipe is
        feasible with the user's equipment and products.
        """
        # --- 1. Feasibility Filter ---
        try:
            user_row = self.users.loc[user_id]
        except KeyError:
            return []

        u_equip = user_row['owned_equipment_set']
        u_prods = user_row['available_products_set']

        feasible_ids = [
            rid for rid, equipment, products in self._recipe_requirements
            if equipment.issubset(u_equip) and products.issubset(u_prods)
        ]
        if not feasible_ids:
            return []

        # --- 2. Scoring ---
        scores = {}
        is_warm = user_id in self.user_id_map

        # Base Content Score (Calculated for everyone)
        content_scores = self._get_content_scores(user_id, feasible_ids)

        if is_warm:
            # --- Warm Strategy: Hybrid (SVD + Content) ---
            u_idx = self.user_id_map[user_id]
            pred_residuals = np.dot(self.user_factors[u_idx], self.item_factors)
            u_bias = self.user_bias.get(user_id, 0)
            alpha = self.alpha

            for rid in feasible_ids:
                # Baseline estimate; the latent residual is added only for
                # recipes that appear in the rating matrix.
                svd_val = self.global_mean + u_bias + self.item_bias.get(rid, 0)
                if rid in self.cf_recipe_indices:
                    svd_val += pred_residuals[self.cf_recipe_indices[rid]]

                # Blend (3.0 is the neutral content score)
                c_val = content_scores.get(rid, 3.0)
                scores[rid] = alpha * svd_val + (1 - alpha) * c_val
        else:
            # --- Cold Strategy: Content + Popularity Boost ---
            for rid in feasible_ids:
                c_val = content_scores.get(rid, 3.0)
                pop_val = self.recipe_popularity.get(rid, 3.0)  # Default to 3.0 (neutral)

                # Small boost for popular items: mean rating scaled by 1/5,
                # weighted 0.1.
                scores[rid] = c_val + 0.1 * (pop_val / 5.0)

        # --- 3. Ranking ---
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return [rid for rid, _ in ranked[:n_recommendations]]

# --- Evaluation Script ---
def calculate_ndcg(recommender, val_df, k=5):
    """Calculate the average NDCG@k over the users in a validation set.

    Ratings are used directly as gains. Recommended recipes that have no
    ground-truth rating contribute a gain of 0.

    Args:
        recommender: Object exposing ``recommend(user_id, n_recommendations=k)``
            that returns a ranked list of recipe ids.
        val_df: DataFrame with ``user_id``, ``recipe_id`` and ``rating`` columns.
        k: Cut-off rank.

    Returns:
        Mean NDCG@k over users with at least one non-null rating. Users with
        no recommendations score 0. Returns 0.0 when no user can be evaluated.
    """
    ndcg_scores = []

    # Iterate the groups directly instead of groupby(...).apply(...).
    for uid, group in val_df.groupby('user_id'):
        # Ground truth for this user; ratings that are NaN are ignored.
        ratings = {
            rid: rating
            for rid, rating in zip(group['recipe_id'], group['rating'])
            if pd.notnull(rating)
        }
        if not ratings:
            continue

        # Get Recommendations
        recs = recommender.recommend(uid, n_recommendations=k)
        if not recs:
            ndcg_scores.append(0)
            continue

        # Relevance of the recommended items, in ranked order
        rel = [ratings.get(rid, 0) for rid in recs]

        # Ideal Relevance (Top K ratings from ground truth)
        ideal = sorted(ratings.values(), reverse=True)[:k]

        # DCG & IDCG with the log2(rank + 1) discount
        dcg = sum(r / np.log2(i + 2) for i, r in enumerate(rel))
        idcg = sum(r / np.log2(i + 2) for i, r in enumerate(ideal))

        ndcg_scores.append(dcg / idcg if idcg > 0 else 0)

    # np.mean([]) would be nan (with a RuntimeWarning); report 0.0 instead.
    if not ndcg_scores:
        return 0.0
    return float(np.mean(ndcg_scores))

# --- Execution ---
if __name__ == "__main__":
    # Build the recommender: the constructor reads the three CSV files and the
    # cold-user JSON, then preprocesses features and fits the hybrid model.
    rec_engine = CoffeeRecommender('recipes.csv', 'users.csv', 'interactions_train.csv', 'cold_users.json')

    # Load the warm-user and cold-user validation interactions.
    val_warm = pd.read_csv('interactions_val.csv')
    val_cold = pd.read_csv('interactions_val_cold.csv')

    print("--- Evaluation Results ---")

    # Warm Evaluation
    # NOTE: .head(5000) keeps the FIRST 5000 rows (not a random sample) for
    # speed; remove .head() to evaluate on the full validation set.
    score_warm = calculate_ndcg(rec_engine, val_warm.head(5000), k=5)
    print(f"Warm Users NDCG@5: {score_warm:.4f}")

    # Cold Evaluation (same first-5000-rows truncation as above)
    score_cold = calculate_ndcg(rec_engine, val_cold.head(5000), k=5)
    print(f"Cold Users NDCG@5: {score_cold:.4f}")

--- Evaluation Results ---
Warm Users NDCG@5: 0.3079
Cold Users NDCG@5: 0.3539


In [1]:
import pandas as pd
import numpy as np
import ast
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

class TwoTowerNCF(nn.Module):
    """Two-tower scoring model over id embeddings plus dense side features.

    Each tower concatenates an id embedding with that side's feature vector,
    passes it through a two-layer MLP, and the score is the dot product of
    the two tower outputs.
    """

    def __init__(self, num_users, num_items, embedding_dim=32,
                 num_user_features=4, num_item_features=4,
                 hidden_dim=64, output_dim=32):
        """
        Args:
            num_users: Size of the user embedding table.
            num_items: Size of the item embedding table.
            embedding_dim: Width of both id embeddings.
            num_user_features: Width of the dense user feature vector.
            num_item_features: Width of the dense item feature vector.
            hidden_dim: Hidden width of each tower.
            output_dim: Output width of each tower (shared, because the two
                outputs are multiplied element-wise).
        """
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        self.user_tower = nn.Sequential(
            nn.Linear(embedding_dim + num_user_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

        self.item_tower = nn.Sequential(
            nn.Linear(embedding_dim + num_item_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, user_ids, user_features, item_ids, item_features):
        """Return one score per (user, item) row.

        ``user_ids``/``item_ids`` are 1-D index tensors; the feature tensors
        are 2-D with one row per id, since they are concatenated with the
        embeddings along ``dim=1``.
        """
        u_input = torch.cat([self.user_embedding(user_ids), user_features], dim=1)
        u_vec = self.user_tower(u_input)

        i_input = torch.cat([self.item_embedding(item_ids), item_features], dim=1)
        i_vec = self.item_tower(i_input)

        # Row-wise dot product of the two tower outputs.
        return torch.sum(u_vec * i_vec, dim=1)

class DeepCoffeeRecommender:
    """Recipe recommender that ranks feasible recipes with a two-tower model.

    A recipe is feasible for a user when all of its required equipment and
    products are contained in the user's owned equipment / available products.
    """

    def __init__(self, recipes_path, users_path, train_path, cold_users_path):
        """Load the CSV inputs, derive lookup structures and build the model.

        ``cold_users_path`` is accepted for signature parity but is not read
        by this class.
        """
        self.recipes = pd.read_csv(recipes_path)
        self.users = pd.read_csv(users_path)
        self.interactions = pd.read_csv(train_path)

        self._prepare_data()
        self._build_model()

    @staticmethod
    def _parse_set(raw, is_dict=False):
        """Parse a Python-literal string into a set (dict keys when ``is_dict``).

        Returns an empty set for anything that cannot be parsed (e.g. NaN).
        """
        try:
            value = ast.literal_eval(raw)
            return set(value.keys()) if is_dict else set(value)
        except (ValueError, SyntaxError, TypeError, AttributeError):
            return set()

    def _prepare_data(self):
        """Add ``*_set`` columns and build id maps and cached lookups."""
        for df, col, is_dict in [(self.recipes, 'required_equipment', False),
                                 (self.recipes, 'required_products', True),
                                 (self.users, 'owned_equipment', False),
                                 (self.users, 'available_products', False)]:
            # Bind is_dict as a default so the callable does not depend on the
            # loop variable's later value.
            df[f'{col}_set'] = df[col].apply(
                lambda raw, is_dict=is_dict: self._parse_set(raw, is_dict))

        self.user_map = {uid: i for i, uid in enumerate(self.users['user_id'].unique())}
        self.item_map = {rid: i for i, rid in enumerate(self.recipes['recipe_id'].unique())}
        self.inv_item_map = {i: rid for rid, i in self.item_map.items()}

        self.taste_cols = ['bitterness', 'sweetness', 'acidity', 'body']
        self.u_taste_cols = ['taste_pref_' + c for c in self.taste_cols]
        self.i_taste_cols = ['taste_' + c for c in self.taste_cols]

        # Cached once: id-indexed frames for feature lookups and flat
        # (recipe_id, equipment, products) triples for the feasibility filter.
        self._users_by_id = self.users.set_index('user_id')
        self._recipes_by_id = self.recipes.set_index('recipe_id')
        self._recipe_requirements = list(zip(
            self.recipes['recipe_id'].tolist(),
            self.recipes['required_equipment_set'],
            self.recipes['required_products_set'],
        ))

    def _build_model(self):
        """Create the two-tower model and its Adam optimizer."""
        self.model = TwoTowerNCF(len(self.user_map), len(self.item_map))
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)

    def _user_feature_tensor(self, u_row):
        """Return the user's taste preferences as a ``(1, n_features)`` float tensor.

        A row taken from the mixed-type users frame has object dtype, which
        ``torch.tensor`` cannot convert, so the values are cast to float32
        explicitly first.
        """
        values = np.asarray(u_row[self.u_taste_cols], dtype=np.float32).reshape(1, -1)
        return torch.tensor(values, dtype=torch.float)

    def train_step(self, epochs=5):
        """Run ``epochs`` full-batch MSE updates on the rated interactions.

        Returns:
            List with the loss value of each epoch (empty when there are no
            rated interactions).
        """
        train_df = self.interactions.dropna(subset=['rating'])
        if train_df.empty:
            return []

        # The training tensors do not change between epochs: build them once.
        u_ids = torch.tensor([self.user_map[uid] for uid in train_df['user_id']], dtype=torch.long)
        i_ids = torch.tensor([self.item_map[rid] for rid in train_df['recipe_id']], dtype=torch.long)
        u_feats = torch.tensor(
            self._users_by_id.loc[train_df['user_id'], self.u_taste_cols].values,
            dtype=torch.float)
        i_feats = torch.tensor(
            self._recipes_by_id.loc[train_df['recipe_id'], self.i_taste_cols].values,
            dtype=torch.float)
        ratings = torch.tensor(train_df['rating'].values, dtype=torch.float)

        self.model.train()
        losses = []
        for _ in range(epochs):
            self.optimizer.zero_grad()
            preds = self.model(u_ids, u_feats, i_ids, i_feats)
            loss = F.mse_loss(preds, ratings)
            loss.backward()
            self.optimizer.step()
            losses.append(loss.item())
        return losses

    def recommend(self, user_id, n=5):
        """Return up to ``n`` feasible recipe ids ranked by model score.

        Returns an empty list for an unknown user or when no recipe is
        feasible.
        """
        matches = self.users[self.users['user_id'] == user_id]
        if matches.empty:
            return []
        u_row = matches.iloc[0]

        owned = u_row['owned_equipment_set']
        available = u_row['available_products_set']
        feasible = [
            rid for rid, equipment, products in self._recipe_requirements
            if equipment.issubset(owned) and products.issubset(available)
        ]
        if not feasible:
            return []

        self.model.eval()
        with torch.no_grad():
            u_idx = torch.tensor([self.user_map[user_id]], dtype=torch.long)
            u_feat = self._user_feature_tensor(u_row)

            i_idxs = torch.tensor([self.item_map[rid] for rid in feasible], dtype=torch.long)
            i_feats = torch.tensor(
                self._recipes_by_id.loc[feasible, self.i_taste_cols].values,
                dtype=torch.float)

            # Broadcast user to match feasible items
            u_idx_rep = u_idx.repeat(len(feasible))
            u_feat_rep = u_feat.repeat(len(feasible), 1)

            scores = self.model(u_idx_rep, u_feat_rep, i_idxs, i_feats)

        ranked = sorted(zip(feasible, scores.numpy()), key=lambda pair: pair[1], reverse=True)
        return [rid for rid, _ in ranked[:n]]

def evaluate_ndcg(rec_sys, val_df, k=5):
    """Return the mean NDCG@k of ``rec_sys`` over the users in ``val_df``.

    Ratings are used directly as gains. Recommended recipes that have no
    ground-truth rating contribute a gain of 0.

    Args:
        rec_sys: Object exposing ``recommend(user_id, n=k)`` that returns a
            ranked list of recipe ids.
        val_df: DataFrame with ``user_id``, ``recipe_id`` and ``rating`` columns.
        k: Cut-off rank.

    Returns:
        Mean NDCG@k over users with at least one non-null rating. Users with
        no recommendations score 0. Returns 0.0 when no user can be evaluated.
    """
    scores = []

    # Iterate the groups directly instead of groupby(...).apply(...).
    for uid, group in val_df.groupby('user_id'):
        # Ground truth for this user; ratings that are NaN are ignored.
        ratings = {
            rid: rating
            for rid, rating in zip(group['recipe_id'], group['rating'])
            if pd.notnull(rating)
        }
        if not ratings:
            continue

        recs = rec_sys.recommend(uid, n=k)
        if not recs:
            scores.append(0)
            continue

        rel = [ratings.get(rid, 0) for rid in recs]
        ideal = sorted(ratings.values(), reverse=True)[:k]

        # DCG & IDCG with the log2(rank + 1) discount
        dcg = sum(r / np.log2(i + 2) for i, r in enumerate(rel))
        idcg = sum(r / np.log2(i + 2) for i, r in enumerate(ideal))
        scores.append(dcg / idcg if idcg > 0 else 0)

    # np.mean([]) would be nan (with a RuntimeWarning); report 0.0 instead.
    if not scores:
        return 0.0
    return float(np.mean(scores))
