In [1]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

In [None]:
# Directory that holds the three CSV inputs read below.
base_path = "/kaggle/input/kuairec-content-based"

# Resolve every input file relative to the dataset directory.
interactions_big_path, interactions_small_path, item_features_path = (
    os.path.join(base_path, csv_name)
    for csv_name in ("big_matrix.csv", "small_matrix.csv", "item_categories.csv")
)

# Load the two interaction matrices and the per-item category table.
interactions_big_df = pd.read_csv(interactions_big_path)
interactions_small_df = pd.read_csv(interactions_small_path)
item_features_df = pd.read_csv(item_features_path)

# Quick sanity check on what was loaded.
print(f"Interactions shape: {interactions_big_df.shape}")
print(f"Interactions small shape: {interactions_small_df.shape}")
print(f"Item features shape: {item_features_df.shape}")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Drop the date/time columns and align the key names with the rest of the
# notebook ('userId' / 'itemId'). Re-assigning (instead of inplace=True) and
# errors="ignore" keep this cell safe to re-run on an already-cleaned frame.
interactions_big_df = (
    interactions_big_df
    .drop(columns=["date", "time"], errors="ignore")
    .rename(columns={'user_id': 'userId', 'video_id': 'itemId'})
)
interactions_small_df = (
    interactions_small_df
    .drop(columns=["date", "time"], errors="ignore")
    .rename(columns={'user_id': 'userId', 'video_id': 'itemId'})
)

# 'feat' is read from CSV as text; parse it into a Python container.
# ast.literal_eval only accepts literals, so unlike eval() it cannot execute
# arbitrary code embedded in the data file. Values that are already parsed
# (cell re-run) are passed through untouched.
item_features_df["feat"] = item_features_df["feat"].map(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
all_categories = list(range(31))

# Category ids excluded from the feature matrix.
# NOTE(review): the reason for excluding exactly these ids is not recorded
# in the notebook -- presumably rare/uninformative categories; confirm.
DROPPED_CATEGORIES = [14, 23, 27, 21, 0, 30, 22, 24, 29]

# One 0/1 indicator column per kept category id (column labels stay ints).
items_preproccesed = pd.DataFrame(
    {
        category: item_features_df['feat'].map(
            lambda cats, wanted=category: int(wanted in cats)
        )
        for category in sorted(all_categories)
        if category not in DROPPED_CATEGORIES
    },
    index=item_features_df.index,
)

# Index the indicator table by item id so it can be joined on 'itemId'.
items_preproccesed['itemId'] = item_features_df["video_id"]
items_preproccesed = items_preproccesed.set_index('itemId')

In [None]:
# Attach the binary category-indicator columns to every interaction row.
# A left join keeps interactions whose item has no entry in the item table.
train_data = interactions_big_df.merge(items_preproccesed, how='left', on='itemId')
test_data = interactions_small_df.merge(items_preproccesed, how='left', on='itemId')

In [None]:
# Keep only interactions whose watch_ratio is at most this cap; it matches
# the top bin edge used when watch_ratio is discretised for training.
MAX_WATCH_RATIO = 5

# .copy() turns each filtered view into an independent frame, so the later
# column drop / column assignment on these frames does not raise
# SettingWithCopyWarning or act on an ambiguous view.
train_data = train_data.loc[train_data['watch_ratio'] <= MAX_WATCH_RATIO].copy()
test_data = test_data.loc[test_data['watch_ratio'] <= MAX_WATCH_RATIO].copy()

In [None]:
class LearningToRankRecommender:
    """Learning-to-rank recommender built on ``xgb.XGBRanker``.

    ``fit`` bins ``watch_ratio`` into integer relevance grades, trains the
    ranker with one query group per ``userId`` and remembers which feature
    columns were used. ``predict`` / ``evaluate`` score new interaction rows
    with the trained model; ``evaluate`` additionally reports RMSE, MAE,
    Precision@k, Hit Rate@k, MRR@k and NDCG@k.

    All frames passed in are expected to carry ``userId``, ``itemId`` and
    ``watch_ratio`` columns next to the feature columns.
    """

    # Identifier / target columns that must never be fed to the model.
    _NON_FEATURE_COLS = ('userId', 'itemId', 'watch_ratio')

    def __init__(self, item_features_df, params=None):
        """Store the item table and merge ``params`` over the defaults.

        Args:
            item_features_df: Item feature table; kept on the instance for
                reference only (it is not read by any method of this class).
            params: Optional dict of ``xgb.XGBRanker`` keyword arguments that
                override or extend the defaults below.
        """
        self.item_features_df = item_features_df
        self.model = None
        self.user_profiles = {}
        # Feature columns used by the last fit(); default for predict/evaluate.
        self.feature_cols = None

        self.params = {
            'objective': 'rank:ndcg',
            'learning_rate': 0.05,
            'max_depth': 6,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 1,
            'gamma': 0.1,
            'tree_method': 'hist',
            'n_estimators': 200
        }

        if params:
            self.params.update(params)

    def _resolve_feature_cols(self, data, feature_cols):
        """Return the feature columns to use for ``data``.

        Priority: explicit ``feature_cols`` argument, then the columns
        remembered from ``fit``, then every column of ``data`` that is not an
        identifier, the target, or a previously written ``prediction``.
        """
        if feature_cols is not None:
            return list(feature_cols)
        if getattr(self, 'feature_cols', None) is not None:
            return list(self.feature_cols)
        return [
            col for col in data.columns
            if col not in self._NON_FEATURE_COLS and col != 'prediction'
        ]

    def _require_fitted(self):
        """Raise a clear error when the model has not been trained yet."""
        if getattr(self, 'model', None) is None:
            raise RuntimeError("Model is not fitted; call fit() first.")

    @staticmethod
    def _rank_top_k(frame, by, k):
        """Return each user's top-``k`` rows of ``frame`` ordered by ``by``.

        The result carries a 1-based ``rank`` column (1 = highest ``by``).
        A stable sort is used so ties keep their original row order, and the
        input frame is never modified.
        """
        ordered = frame.sort_values(by, ascending=False, kind='stable')
        ranks = ordered.groupby('userId').cumcount() + 1
        return ordered.assign(rank=ranks).loc[(ranks <= k).to_numpy()]

    def fit(self, train_data, feature_cols=None):
        """Train an XGBoost ranking model for personalized recommendations.

        Args:
            train_data: Interaction frame with ``userId``, ``watch_ratio`` and
                the feature columns. It is not modified.
            feature_cols: Columns to train on. Defaults to every
                int64/float64/bool column that is not an id or the target.

        Returns:
            ``self``, to allow chaining.

        Raises:
            ValueError: if ``watch_ratio`` contains missing values.
        """
        if feature_cols is None:
            feature_cols = train_data.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()
            feature_cols = [col for col in feature_cols if col not in self._NON_FEATURE_COLS]
        feature_cols = list(feature_cols)
        self.feature_cols = feature_cols

        print(f"Using features: {feature_cols}")

        if train_data['watch_ratio'].isna().any():
            raise ValueError("watch_ratio contains missing values; cannot derive relevance labels.")

        # The `group` argument describes consecutive row blocks, so the rows
        # of each user must be contiguous and in the same order as the group
        # sizes. Sorting by userId guarantees both.
        train_sorted = train_data.sort_values('userId', kind='stable')

        print("Converting watch_ratio to discrete relevance levels...")

        bins = [0, 1, 2, 3, 4, 5]
        labels = [0, 1, 2, 3, 4]

        # Clip into the binned range so out-of-range ratios land in the
        # first/last grade instead of becoming NaN and breaking astype(int).
        relevance = pd.cut(
            train_sorted['watch_ratio'].clip(lower=bins[0], upper=bins[-1]),
            bins=bins,
            labels=labels,
            include_lowest=True
        ).astype(int)

        print("Training XGBRanker model...")
        X = train_sorted[feature_cols].values
        y = relevance.values

        # Group sizes in ascending-userId order, matching the row order above.
        groups = train_sorted.groupby('userId', sort=True).size().values

        # Forward the whole params dict so user-supplied extras are honoured.
        self.model = xgb.XGBRanker(**self.params)

        print(f"Training with {len(groups)} user groups...")
        self.model.fit(X, y, group=groups, verbose=True)

        print("Creating user profiles...")
        # Mean feature vector per user, computed in one vectorised pass.
        profile_frame = train_data.groupby('userId')[feature_cols].mean()
        self.user_profiles.update(zip(profile_frame.index, profile_frame.to_numpy()))

        return self

    def predict(self, test_data, feature_cols=None):
        """Generate ranking scores for test data.

        Args:
            test_data: Frame containing the feature columns.
            feature_cols: Columns to score on; defaults to those used in fit.

        Returns:
            Array of model scores, one per row of ``test_data``.

        Raises:
            RuntimeError: if the model has not been fitted.
        """
        self._require_fitted()
        feature_cols = self._resolve_feature_cols(test_data, feature_cols)
        return self.model.predict(test_data[feature_cols].values)

    def evaluate(self, test_data, feature_cols=None, k=10, relevance_threshold=0.8):
        """Evaluate the ranking model using multiple metrics.

        Scores are written to an internal working frame; ``test_data`` itself
        is left untouched.

        Args:
            test_data: Frame with ``userId``, ``itemId``, ``watch_ratio`` and
                the feature columns.
            feature_cols: Columns to score on; defaults to those used in fit.
            k: Cut-off for the ranking metrics.
            relevance_threshold: Minimum ``watch_ratio`` for an item to count
                as relevant in Hit Rate and MRR.

        Returns:
            Dict with keys ``rmse``, ``mae``, ``precision``, ``hit_rate``,
            ``mrr`` and ``ndcg``.

        Raises:
            RuntimeError: if the model has not been fitted.
        """
        self._require_fitted()
        feature_cols = self._resolve_feature_cols(test_data, feature_cols)

        print("Generating predictions for evaluation...")
        scores = self.model.predict(test_data[feature_cols].values)
        y_true = test_data['watch_ratio'].values

        # Slim working copy: only the columns the metric helpers read.
        scored = test_data[['userId', 'itemId', 'watch_ratio']].assign(prediction=scores)

        # NOTE: ranking scores are not on the watch_ratio scale, so RMSE/MAE
        # are only loosely interpretable; they are kept for continuity.
        rmse = np.sqrt(mean_squared_error(y_true, scores))
        mae = np.mean(np.abs(y_true - scores))

        print(f"\nCalculating ranking metrics at k={k}...")

        precision = self._calculate_precision(scored, k=k)
        hit_rate = self._calculate_hit_rate(scored, k=k, relevance_threshold=relevance_threshold)
        mrr = self._calculate_mrr(scored, k=k, relevance_threshold=relevance_threshold)
        ndcg = self._calculate_ndcg(scored, k=k)

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.scatter(y_true, scores, alpha=0.3)
        ax.plot([0, 5], [0, 5], 'r--')
        ax.set_xlabel('Actual Watch Ratio')
        ax.set_ylabel('Predicted Ranking Score')
        ax.set_title('Actual vs Predicted Ranking Score')
        ax.grid(True, linestyle='--', alpha=0.7)
        fig.tight_layout()
        plt.show()

        print(f"RMSE: {rmse:.4f}")
        print(f"MAE: {mae:.4f}")
        print(f"Precision@{k}: {precision:.4f}")
        print(f"Hit Rate@{k}: {hit_rate:.4f}")
        print(f"MRR@{k}: {mrr:.4f}")
        print(f"NDCG@{k}: {ndcg:.4f}")

        return {
            "rmse": rmse,
            "mae": mae,
            "precision": precision,
            "hit_rate": hit_rate,
            "mrr": mrr,
            "ndcg": ndcg
        }

    def _calculate_precision(self, test_data, k=10):
        """Calculate Precision@k.

        Defined here as the share of each user's top-k predicted items that
        are also among that user's top-k items by actual ``watch_ratio``,
        pooled over all users.
        """
        top_k_preds = self._rank_top_k(test_data, 'prediction', k)
        top_k_actual = self._rank_top_k(test_data, 'watch_ratio', k)

        if len(top_k_preds) == 0:
            return 0.0

        overlap = pd.merge(
            top_k_preds[['userId', 'itemId']],
            top_k_actual[['userId', 'itemId']],
            on=['userId', 'itemId'],
            how='inner'
        )
        return len(overlap) / len(top_k_preds)

    def _calculate_hit_rate(self, test_data, k=10, relevance_threshold=0.8):
        """Calculate Hit Rate@k.

        Fraction of users (among those with at least one relevant item) whose
        top-k predictions contain a relevant item. An item is relevant when
        its ``watch_ratio`` is at least ``relevance_threshold``.
        """
        relevant_items = test_data.loc[
            test_data['watch_ratio'] >= relevance_threshold, ['userId', 'itemId']
        ]
        num_users_with_relevant_items = relevant_items['userId'].nunique()
        if num_users_with_relevant_items == 0:
            return 0.0

        top_k_preds = self._rank_top_k(test_data, 'prediction', k)[['userId', 'itemId']]
        hits = top_k_preds.merge(relevant_items, on=['userId', 'itemId'], how='inner')

        return hits['userId'].nunique() / num_users_with_relevant_items

    def _calculate_mrr(self, test_data, k=10, relevance_threshold=0.8):
        """Calculate MRR@k.

        For each user the reciprocal rank of the first relevant item inside
        the top-k predictions is taken (0 when there is none); the sum is
        divided by the number of users that have any relevant item.
        """
        relevant_items = test_data.loc[
            test_data['watch_ratio'] >= relevance_threshold, ['userId', 'itemId']
        ]
        num_users_with_relevant_items = relevant_items['userId'].nunique()
        if num_users_with_relevant_items == 0:
            return 0.0

        ranked_preds = self._rank_top_k(test_data, 'prediction', k)
        hits_with_rank = ranked_preds[['userId', 'itemId', 'rank']].merge(
            relevant_items, on=['userId', 'itemId'], how='inner'
        )
        if len(hits_with_rank) == 0:
            return 0.0

        first_hit_rank = hits_with_rank.groupby('userId')['rank'].min()
        return float((1.0 / first_hit_rank).sum()) / num_users_with_relevant_items

    def _calculate_ndcg(self, test_data, k=10):
        """Calculate NDCG@k (Normalized Discounted Cumulative Gain).

        ``watch_ratio`` is the gain and the item at rank r is discounted by
        ``log2(r + 1)``. Users whose ideal DCG is not positive score 0. The
        result is the mean over users, or 0.0 when there are none.
        """
        pred_top = self._rank_top_k(test_data, 'prediction', k)
        ideal_top = self._rank_top_k(test_data, 'watch_ratio', k)

        def discounted_gain(top_rows):
            # Per-user sum of gain / log2(rank + 1).
            gains = top_rows['watch_ratio'] / np.log2(top_rows['rank'] + 1)
            return gains.groupby(top_rows['userId']).sum()

        dcg = discounted_gain(pred_top)
        idcg = discounted_gain(ideal_top)
        if len(dcg) == 0:
            return 0.0

        ndcg_per_user = (dcg / idcg).where(idcg > 0, 0.0)
        return float(ndcg_per_user.mean())

In [None]:
# Remove the raw timestamp column from the training frame. Re-assigning with
# errors='ignore' (instead of an in-place drop) keeps this cell re-runnable.
train_data = train_data.drop(columns=['timestamp'], errors='ignore')

In [None]:
# Train and evaluate on exactly the category-indicator columns.
ranking_feature_cols = items_preproccesed.columns.tolist()

ltr_recommender = LearningToRankRecommender(item_features_df)
ltr_recommender.fit(train_data, feature_cols=ranking_feature_cols)

# Ranking metrics at a cut-off of 20; items watched at >= 0.8 of their
# length count as relevant for Hit Rate and MRR.
ltr_metrics = ltr_recommender.evaluate(
    test_data,
    feature_cols=ranking_feature_cols,
    k=20,
    relevance_threshold=0.8,
)

In [None]:
# xgb.plot_importance creates its own figure when no axes are passed, which
# would leave a blank 12x6 figure behind and ignore the requested size.
# Create the axes explicitly and hand them over instead.
# Features are labelled f0, f1, ... because the model is fit on a bare
# NumPy array (.values) rather than a named DataFrame.
fig, ax = plt.subplots(figsize=(12, 6))
xgb.plot_importance(ltr_recommender.model, max_num_features=15, ax=ax)
ax.set_title('Feature Importance')
plt.show()