# Expirements - Collaborative Filtering (e.g., ALS, Matrix Factorization)

The base model will be a simple collaborative filtering system based on the interractions matrix

Learns latent representations from the interaction matrix:

* **User Features**:
    - Past interactions history
    - User’s preferred tags, categories, or creators

* **Item Features**:
    - Hashtags
    - Categories
    - Creator ID
    - Video length
    - Caption sentiment or topic modeling on text

* **Interaction Features**:
    - Binary or implicit feedback (e.g., like, rewatch, watch completed)
    - Interaction strength: frequency of likes or total watch time per user-item pair

* **Temporal Features**: Not Needed

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from cycler import cycler
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Any
from sklearn.metrics.pairwise import cosine_similarity
import random
from collections import defaultdict
import math


# set plot size
plt.rcParams["figure.figsize"] = (20, 13)
colors = plt.get_cmap('tab10').colors
plt.rc('axes', prop_cycle=cycler('color', colors))
%matplotlib inline
%config InlineBackend.figure_format = "retina"

### Base Model - User Based Collaborative Filtering

The base model will simply get all interactions from all users (that has watched the video) and recommend the video that their similar user watched

User's similarity will be computed from interaction matrix and be used to weighten recommended videos

In [2]:
interactions_train = pd.read_csv("data_final_project/KuaiRec 2.0/data/big_matrix.csv")
interactions_train

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,0,3649,13838,10867,2020-07-05 00:08:23.438,20200705,1.593879e+09,1.273397
1,0,9598,13665,10984,2020-07-05 00:13:41.297,20200705,1.593879e+09,1.244082
2,0,5262,851,7908,2020-07-05 00:16:06.687,20200705,1.593879e+09,0.107613
3,0,1963,862,9590,2020-07-05 00:20:26.792,20200705,1.593880e+09,0.089885
4,0,8234,858,11000,2020-07-05 00:43:05.128,20200705,1.593881e+09,0.078000
...,...,...,...,...,...,...,...,...
12530801,7175,1281,34618,140017,2020-09-05 15:07:10.576,20200905,1.599290e+09,0.247241
12530802,7175,3407,12619,21888,2020-09-05 15:08:45.228,20200905,1.599290e+09,0.576526
12530803,7175,10360,2407,7067,2020-09-05 19:10:29.041,20200905,1.599304e+09,0.340597
12530804,7175,10360,6455,7067,2020-09-05 19:10:36.995,20200905,1.599304e+09,0.913400


### Preprocessing

In [3]:
def preprocess(data):
    data.drop_duplicates(['user_id', 'video_id'], keep='first', inplace=True)
    data.drop(columns=['play_duration', 'video_duration', 'time', 'date', 'timestamp'])
    return data.pivot(index='user_id', columns='video_id', values='watch_ratio').fillna(0)

data_preprocessed = preprocess(interactions_train)

In [4]:
data_preprocessed

video_id,0,1,2,3,4,5,6,7,8,9,...,10718,10719,10720,10721,10722,10723,10724,10725,10726,10727
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.037248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.452116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Model

In [5]:
class CollaborativeFiltering:
    def __init__(self):
        self.user_similarity = None
        self.user_item_matrix = None

    def fit(self, data):
        self.user_item_matrix = data
        self.user_similarity = pd.DataFrame(
            cosine_similarity(self.user_item_matrix),
            index=self.user_item_matrix.index,
            columns=self.user_item_matrix.index
        )
        print(self.user_similarity.iloc[0])

    def predict(self, user_id, top_n=5):
        if user_id not in self.user_item_matrix.index:
            return []

        similar_users = self.user_similarity[user_id].drop(user_id).sort_values(ascending=False)

        weighted_scores = pd.Series(dtype=np.float64)
        for sim_user, sim_score in similar_users.items():
            user_videos = self.user_item_matrix.loc[sim_user]
            weighted_scores = weighted_scores.add(user_videos * sim_score, fill_value=0)

        pred = weighted_scores.sort_values(ascending=False).head(top_n).index.tolist()
        return pred

In [6]:
cf_model = CollaborativeFiltering()
cf_model.fit(data_preprocessed)
recommendations = cf_model.predict(user_id=0, top_n=10)
print(recommendations)

user_id
0       1.000000
1       0.085002
2       0.078711
3       0.167374
4       0.057428
          ...   
7171    0.060846
7172    0.114287
7173    0.020428
7174    0.121861
7175    0.047943
Name: 0, Length: 7176, dtype: float64
[314, 7383, 8298, 211, 3338, 8366, 3344, 3211, 10500, 2629]


### Evaluation

In [23]:
interactions_test = pd.read_csv("data_final_project/KuaiRec 2.0/data/small_matrix.csv")
interactions_test

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1.593898e+09,0.722103
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1.593898e+09,1.907377
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1.593898e+09,2.063311
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1.593898e+09,0.566388
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1.593899e+09,0.418364
...,...,...,...,...,...,...,...,...
4676565,7162,2267,11908,5467,,,,2.178160
4676566,7162,2065,11919,6067,,,,1.964562
4676567,7162,1296,16690,19870,,,,0.839960
4676568,7162,4822,11862,24400,,,,0.486148


In [24]:
def build_ground_truth(pivot_matrix: pd.DataFrame):
    ground_truth = defaultdict(list)
    for user_id, row in pivot_matrix.iterrows():
        watched_videos = row[row > 0].index.tolist()
        ground_truth[user_id] = watched_videos
    return dict(ground_truth)

In [33]:
test_data_preprocessed = preprocess(interactions_test)
test_ground_truth = build_ground_truth(test_data_preprocessed)
ground_truth = build_ground_truth(data_preprocessed)

In [38]:
K = 1000
N_USERS = 20

random.seed(42)

###################

sample_user_ids = random.sample(list(data_preprocessed.index), min(N_USERS, len(data_preprocessed)))
sample_recommendations = {}
for user_id in sample_user_ids:
    sample_recommendations[user_id] = cf_model.predict(user_id=user_id, top_n=K)

sample_ground_truth = {user: ground_truth.get(user, []) for user in sample_user_ids}

###################

test_sample_user_ids = random.sample(list(data_preprocessed.index), min(N_USERS, len(data_preprocessed)))
test_sample_recommendations = {}
for user_id in test_sample_user_ids:
    test_sample_recommendations[user_id] = cf_model.predict(user_id=user_id, top_n=K)

test_sample_ground_truth = {user: test_ground_truth.get(user, []) for user in sample_user_ids}

#### NDCG@K

In [39]:
from metrics import ndcg

ndcg_score = ndcg(recommendations=sample_recommendations, ground_truth=sample_ground_truth, k=K)
test_ndcg_score = ndcg(recommendations=test_sample_recommendations, ground_truth=test_sample_ground_truth, k=K)

print(f"Train NDCG@{K} ({N_USERS} users): {ndcg_score:.4f}")
print(f"Test NDCG@{K} ({N_USERS} users): {test_ndcg_score:.4f}")

Train NDCG@1000 (20 users): 0.5889
Test NDCG@1000 (20 users): 0.0000


#### Precision@K

In [40]:
from metrics import precision_at_k

precision_score = precision_at_k(recommendations=sample_recommendations, ground_truth=sample_ground_truth, k=K)
test_precision_score = precision_at_k(recommendations=test_sample_recommendations, ground_truth=test_sample_ground_truth, k=K)

print(f"Train Precision@{K} ({N_USERS} users): {precision_score:.4f}")
print(f"Test Precision@{K} ({N_USERS} users): {test_precision_score:.4f}")

Train Precision@1000 (20 users): 0.5268
Test Precision@1000 (20 users): 0.0000


#### Novelty

In [None]:
item_popularity = data_preprocessed[data_preprocessed > 1e-3].count(axis=0) / len(data_preprocessed)

In [None]:
from metrics import novelty

novelty_score = novelty(sample_recommendations, item_popularity)
test_novelty_score = novelty(test_sample_recommendations, item_popularity)

print(f"Train Novelty: {novelty_score:.4f}")
print(f"Test Novelty: {test_novelty_score:.4f}")