In [1]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

In [2]:
# Resolve the three KuaiRec CSV locations under a single data directory
base_path = "../data_final_project/KuaiRec 2.0/data"
interactions_big_path, interactions_small_path, item_features_path = (
    os.path.join(base_path, csv_name)
    for csv_name in ("big_matrix.csv", "small_matrix.csv", "item_categories.csv")
)

# Load each file into its own DataFrame
interactions_big_df = pd.read_csv(interactions_big_path)
interactions_small_df = pd.read_csv(interactions_small_path)
item_features_df = pd.read_csv(item_features_path)

# Report the (rows, columns) of every table as a quick sanity check
print(f"Interactions shape: {interactions_big_df.shape}")
print(f"Interactions small shape: {interactions_small_df.shape}")
print(f"Item features shape: {item_features_df.shape}")

Interactions shape: (12530806, 8)
Interactions small shape: (4676570, 8)
Item features shape: (10728, 2)


In [3]:
def _standardize_interactions(interactions):
    """Return a copy of an interaction table without date/time and with userId/itemId names.

    Parameters
    ----------
    interactions : pd.DataFrame
        Interaction table as loaded from the CSV files.

    Returns
    -------
    pd.DataFrame
        New frame; the input is not modified. Columns that are already absent
        or already renamed are left alone, so the cell can be re-run safely
        (the previous in-place drop raised KeyError on a second run).
    """
    return (
        interactions
        .drop(columns=["date", "time"], errors="ignore")
        .rename(columns={'user_id': 'userId', 'video_id': 'itemId'})
    )


interactions_big_df = _standardize_interactions(interactions_big_df)
interactions_small_df = _standardize_interactions(interactions_small_df)

In [None]:
# Parse the stringified category lists. literal_eval only accepts Python literals, unlike
# eval, which would execute whatever text the CSV contains. The isinstance guard keeps the
# cell re-runnable once the column already holds parsed lists.
item_features_df["feat"] = item_features_df["feat"].map(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
all_categories = [i for i in range(31)]
# Category ids that are deliberately left out of the one-hot table.
dropped_categories = {14, 23, 27, 21, 0, 30, 22, 24, 29}

# One 0/1 column per kept category id, one row per item.
items_preproccesed = pd.DataFrame(index=item_features_df.index)
for category in sorted(all_categories):
    if category in dropped_categories:
        continue
    items_preproccesed[category] = item_features_df["feat"].map(
        lambda cats: 1 if category in cats else 0
    )

# Key the table by item id. The one-hot frame never contained a video_id column, so the
# former rename + set_index('itemId') raised KeyError; take the ids from the source table.
# Assumes item_categories.csv exposes the id as `video_id`, as the original rename implied.
items_preproccesed.index = pd.Index(item_features_df["video_id"].to_numpy(), name="itemId")

In [None]:
class ContentBasedRecommender:
    """XGBoost model over item features, plus a mean-feature profile per user.

    Parameters
    ----------
    item_features_df : pd.DataFrame
        Item feature table keyed by item id, either through an ``itemId`` /
        ``video_id`` column or through an index carrying one of those names.
    params : dict, optional
        XGBoost parameters that override the defaults set in ``__init__``.

    Attributes
    ----------
    model : xgb.Booster or None
        Trained booster; ``None`` until ``fit`` has run.
    user_profiles : dict
        Maps each training user id to the mean of that user's feature rows.
    feature_cols : list or None
        Feature columns used by the last ``fit`` call.
    """

    # Column holding the training target.
    _LABEL_COL = 'rating'
    # Columns never picked up as features when they are auto-selected. The label is
    # listed so that it cannot leak into the feature matrix.
    _NON_FEATURE_COLS = ('userId', 'itemId', 'watch_ratio', 'rating')
    # Accepted names for the item key on the item feature table.
    _ITEM_KEYS = ('itemId', 'video_id')

    def __init__(self, item_features_df, params=None):
        self.item_features_df = item_features_df
        self.model = None
        self.user_profiles = {}
        self.feature_cols = None

        # Default XGBoost parameters
        self.params = {
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'eta': 0.1,
            'max_depth': 6,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 1
        }

        if params:
            self.params.update(params)

    def _keyed_item_features(self):
        """Return the item feature table indexed by item id.

        Raises
        ------
        ValueError
            If no item key column/index is found, or if item ids are duplicated
            (a duplicated key would multiply interaction rows in the join).
        """
        items = self.item_features_df
        for key in self._ITEM_KEYS:
            if key in items.columns:
                items = items.set_index(key)
                break
        else:
            if items.index.name not in self._ITEM_KEYS:
                raise ValueError(
                    "item_features_df must expose the item id as an 'itemId' or "
                    "'video_id' column, or as an index with one of those names"
                )
        if not items.index.is_unique:
            raise ValueError("item_features_df contains duplicated item ids")
        return items

    def prepare_features(self, interactions_df):
        """Attach the item feature columns to every interaction row.

        Parameters
        ----------
        interactions_df : pd.DataFrame
            Interactions with an ``itemId`` column.

        Returns
        -------
        pd.DataFrame
            New frame with the same rows, in the same order, as
            ``interactions_df`` plus the item feature columns. Items without a
            feature row get NaN. Feature columns whose name already exists in
            ``interactions_df`` are not added again.
        """
        items = self._keyed_item_features()
        extra_cols = [col for col in items.columns if col not in interactions_df.columns]
        return interactions_df.join(items[extra_cols], on='itemId', how='left')

    def _resolve_feature_cols(self, train_data, feature_cols):
        """Pick or validate the feature columns used for training.

        With ``feature_cols=None`` every int64/float64 column is used except the
        id columns, ``watch_ratio`` and the ``rating`` label.

        Raises
        ------
        KeyError
            If an explicitly requested column is absent from ``train_data``.
        ValueError
            If no feature column remains.
        """
        if feature_cols is None:
            numeric_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
            feature_cols = [col for col in numeric_cols if col not in self._NON_FEATURE_COLS]
        else:
            feature_cols = list(feature_cols)
            missing = [col for col in feature_cols if col not in train_data.columns]
            if missing:
                raise KeyError(f"Feature columns missing from the training data: {missing}")
        if not feature_cols:
            raise ValueError("No feature columns available for training")
        return feature_cols

    @staticmethod
    def _ordered_by_user(train_data):
        """Return the rows grouped contiguously by ascending ``userId``.

        The sort is stable, so each user's rows keep their relative order, and
        it is skipped (no copy) when the data is already ordered.
        """
        if train_data['userId'].is_monotonic_increasing:
            return train_data
        return train_data.sort_values('userId', kind='mergesort')

    def fit(self, interactions_df, feature_cols=None, num_boost_round=10):
        """Train an XGBoost model for content-based recommendations.

        Parameters
        ----------
        interactions_df : pd.DataFrame
            Interactions with ``userId``, ``itemId`` and a ``rating`` label.
        feature_cols : list, optional
            Columns used as model inputs; auto-selected when omitted.
        num_boost_round : int, default 10
            Number of boosting rounds passed to ``xgb.train``.

        Returns
        -------
        ContentBasedRecommender
            ``self``, with ``model``, ``feature_cols`` and ``user_profiles`` set.

        Raises
        ------
        KeyError
            If the ``rating`` label or a requested feature column is missing.
        ValueError
            If no feature column is available.
        """
        print("Preparing training data...")
        # Group sizes handed to set_group describe consecutive rows, so the rows have to be
        # contiguous per user and in the same order as the grouped sizes below.
        train_data = self._ordered_by_user(self.prepare_features(interactions_df))

        if self._LABEL_COL not in train_data.columns:
            raise KeyError(
                f"Training data needs a '{self._LABEL_COL}' label column; "
                f"found columns {train_data.columns.tolist()}"
            )
        feature_cols = self._resolve_feature_cols(train_data, feature_cols)
        self.feature_cols = feature_cols

        print(f"Using features: {feature_cols}")

        print("Training XGBoost model...")
        X = train_data[feature_cols].values
        y = train_data[self._LABEL_COL].values

        # Create DMatrix for XGBoost, with one group per user
        dtrain = xgb.DMatrix(X, label=y)
        dtrain.set_group(train_data.groupby('userId').size().values)

        # Train the model
        self.model = xgb.train(self.params, dtrain, num_boost_round=num_boost_round)

        print("Creating user profiles...")
        # Create user profiles (average features of items each user has interacted with)
        profile_table = train_data.groupby('userId')[feature_cols].mean()
        self.user_profiles.update(zip(profile_table.index, profile_table.to_numpy()))

        return self

    # NOTE(review): the draft below is disabled by being wrapped in a string literal, so the
    # class currently has no recommend(). It cannot simply be re-enabled: it reads
    # `interactions_df`, which is neither a parameter nor an attribute; it filters on
    # 'user_id' / 'click_history' while the rest of the class uses 'userId' / 'itemId'; and it
    # stacks the user profile next to the item features, giving the model more columns than
    # fit() trains it on.
    """
    def recommend(self, user_id, n=10, filter_viewed=True):
        if user_id not in self.user_profiles:
            print(f"User {user_id} not found in training data")
            return []
        
        # Get all items
        items = self.item_features_df.copy()
        
        # Get feature columns
        feature_cols = [col for col in items.columns 
                       if col not in ['userId', 'itemId'] and items[col].dtype in ['int64', 'float64']]
        
        # Add user profile features to each item
        user_profile = np.tile(self.user_profiles[user_id], (len(items), 1))
        item_features = items[feature_cols].values
        
        # Combine user profile with item features
        features = np.hstack([user_profile, item_features])
        
        # Make predictions
        dtest = xgb.DMatrix(features)
        items['score'] = self.model.predict(dtest)
        
        # Filter out items the user has already interacted with
        if filter_viewed:
            viewed_items = interactions_df[interactions_df['user_id'] == user_id]['click_history']
            viewed_items = [item for sublist in viewed_items for item in sublist]  # Flatten list
            items = items[~items['itemId'].isin(viewed_items)]
        
        # Sort by score and return top N
        top_items = items.sort_values('score', ascending=False).head(n)
        return top_items[['itemId', 'score']]
    """

In [None]:
# Initialize and train the recommender
# The one-hot table is passed as the item features because `feature_cols` below names its
# columns; `item_features_df` only holds the item id and the raw category lists.
# NOTE(review): `train_data` is not defined in any cell above — presumably the big
# interaction matrix with a `rating` label added; confirm and define it before this cell.
recommender = ContentBasedRecommender(items_preproccesed)
recommender.fit(train_data, feature_cols=items_preproccesed.columns.tolist())