In [16]:
import numpy as np

In [17]:
ratings = np.array([
    [0, 1, 2, 2, 5, 0, 4, 3, 5], 
    [1, 5, 3, 0, 2, 3, 4, 3, 0], 
    [1, 1, 2, 0, 2, 4, 4, 5, 0], 
    [3, 2, 2, 3, 0, 1, 3, 2, 0], 
    [5, 1, 5, 5, 4, 4, 5, 2, 0], 
])


In [20]:
ratings[1][5]

np.int64(3)

---


# Toy Data


In [1]:
import numpy as np

In [None]:
ratings = np.array([
    [0, 1, 2, 2, 5, 0, 4, 3, 5], 
    [1, 5, 3, 0, 2, 3, 4, 3, 0], 
    [1, 1, 2, 0, 2, 4, 4, 5, 0], 
    [3, 2, 2, 3, 0, 1, 3, 2, 0], 
    [5, 1, 5, 5, 4, 4, 5, 2, 0], 
])


def compute_priors(ratings, alpha=0.01, R=5):
    num_users = len(ratings)
    num_items = len(ratings[0])
    rating_values = list(range(1, R + 1))

    prior_userbased = [[0 for _ in range(num_items)] for _ in rating_values]
    prior_itembased = [[0 for _ in range(num_users)] for _ in rating_values]

    for y in rating_values:
        y_index = y - 1

        # Prior user-based (per item j)
        for j in range(num_items):
            count_y = 0
            count_nonzero = 0
            for u in range(num_users):
                r = ratings[u][j]
                if r != 0:
                    count_nonzero += 1
                    if r == y:
                        count_y += 1
            prior_userbased[y_index][j] = (count_y + alpha) / (count_nonzero + alpha * R)

        # Prior item-based (per user u)
        for u in range(num_users):
            count_y = 0
            count_nonzero = 0
            for j in range(num_items):
                r = ratings[u][j]
                if r != 0:
                    count_nonzero += 1
                    if r == y:
                        count_y += 1
            prior_itembased[y_index][u] = (count_y + alpha) / (count_nonzero + alpha * R)

    return prior_userbased, prior_itembased



In [None]:
prior_userbased, prior_itembased = compute_priors(ratings)
prior_userbased

[[0.4962962962962963,
  0.596039603960396,
  0.0019801980198019802,
  0.0032786885245901644,
  0.0024691358024691358,
  0.24938271604938272,
  0.0019801980198019802,
  0.0019801980198019802,
  0.009523809523809523],
 [0.0024691358024691358,
  0.2,
  0.596039603960396,
  0.3311475409836066,
  0.4962962962962963,
  0.0024691358024691358,
  0.0019801980198019802,
  0.398019801980198,
  0.009523809523809523],
 [0.24938271604938272,
  0.0019801980198019802,
  0.2,
  0.3311475409836066,
  0.0024691358024691358,
  0.24938271604938272,
  0.2,
  0.398019801980198,
  0.009523809523809523],
 [0.0024691358024691358,
  0.0019801980198019802,
  0.0019801980198019802,
  0.0032786885245901644,
  0.24938271604938272,
  0.4962962962962963,
  0.596039603960396,
  0.0019801980198019802,
  0.009523809523809523],
 [0.24938271604938272,
  0.2,
  0.2,
  0.3311475409836066,
  0.24938271604938272,
  0.0024691358024691358,
  0.2,
  0.2,
  0.9619047619047618]]

In [4]:
def compute_likelihood_userbased(ratings, u, i, y, alpha=0.01, R=5):
    num_users = len(ratings)
    num_items = len(ratings[0])
    Iu = [j for j in range(num_items) if j != i and ratings[u][j] != 0]
    product = 1.0

    for j in Iu:
        k = ratings[u][j]
        count_joint = 0
        count_cond = 0
        for v in range(num_users):
            if ratings[v][i] == y:
                if ratings[v][j] != 0:
                    count_cond += 1
                    if ratings[v][j] == k:
                        count_joint += 1
        prob = (count_joint + alpha) / (count_cond + alpha * R)
        print(prob)
        product *= prob
    print("======")
    print(product, end="\n\n")

    return product


In [5]:
def compute_likelihood_itembased(ratings, u, i, y, alpha=0.01, R=5):
    num_users = len(ratings)
    num_items = len(ratings[0])
    Ui = [v for v in range(num_users) if v != u and ratings[v][i] != 0]
    product = 1.0

    for v in Ui:
        k = ratings[v][i]
        count_joint = 0
        count_cond = 0
        for j in range(num_items):
            if ratings[u][j] == y:
                if ratings[v][j] != 0:
                    count_cond += 1
                    if ratings[v][j] == k:
                        count_joint += 1
        prob = (count_joint + alpha) / (count_cond + alpha * R)
        product *= prob

    return product


In [6]:
# for y in range(1, 5 + 1):
#     y_index = y - 1

#     likelihood_user = compute_likelihood_userbased(ratings, 0, 0, y)
    
#     # likelihood_item = compute_likelihood_itembased(ratings, 1, 1, y)

In [None]:
def predict_rating(ratings, u, i, prior_userbased, prior_itembased, alpha=0.01, R=5, mode='hybrid'):
    scores = []
    all_likelihood_user = []  
    all_likelihood_item = []
    all_combined = []  

    for y in range(1, R + 1):
        y_index = y - 1
        prior_user = prior_userbased[y_index][i]
        prior_item = prior_itembased[y_index][u]

        likelihood_user = compute_likelihood_userbased(ratings, u, i, y, alpha, R)
        likelihood_item = compute_likelihood_itembased(ratings, u, i, y, alpha, R)

        all_likelihood_user.append(likelihood_user)
        all_likelihood_item.append(likelihood_item)

        if mode == 'user':
            score = prior_user * likelihood_user
        elif mode == 'item':
            score = prior_item * likelihood_item
        else:  # hybrid
            len_Iu = sum(1 for j in range(len(ratings[0])) if ratings[u][j] != 0 and j != i)
            len_Ui = sum(1 for v in range(len(ratings)) if v != u and ratings[v][i] != 0)

            score_user = (prior_user * likelihood_user) ** (1 / (1 + len_Iu)) if len_Iu > 0 else 0
            score_item = (prior_item * likelihood_item) ** (1 / (1 + len_Ui)) if len_Ui > 0 else 0
            score = score_user * score_item

        scores.append(score)
        all_combined.append(score)

    predicted_rating = scores.index(max(scores)) + 1

    return predicted_rating, {
        'scores': scores,
        'likelihood_user': all_likelihood_user,
        'likelihood_item': all_likelihood_item,
        'combined_score': all_combined
    }


In [11]:
predicted_rating, details = predict_rating(
    ratings, u=0, i=0,
    prior_userbased=prior_userbased,
    prior_itembased=prior_itembased,
    mode='hybrid'
)

0.49268292682926834
0.49268292682926834
0.19999999999999998
0.004878048780487806
0.9804878048780488
0.49268292682926834
0.19999999999999998
2.287972840400663e-05

0.19999999999999998
0.19999999999999998
0.19999999999999998
0.19999999999999998
0.19999999999999998
0.19999999999999998
0.19999999999999998
1.2799999999999996e-05

0.009523809523809523
0.9619047619047618
0.009523809523809523
0.19999999999999998
0.009523809523809523
0.009523809523809523
0.19999999999999998
3.165445712532573e-10

0.19999999999999998
0.19999999999999998
0.19999999999999998
0.19999999999999998
0.19999999999999998
0.19999999999999998
0.19999999999999998
1.2799999999999996e-05

0.9619047619047618
0.009523809523809523
0.009523809523809523
0.009523809523809523
0.009523809523809523
0.009523809523809523
0.19999999999999998
1.5073551012059868e-11



In [13]:
details['likelihood_user']

[2.287972840400663e-05,
 1.2799999999999996e-05,
 3.165445712532573e-10,
 1.2799999999999996e-05,
 1.5073551012059868e-11]

In [15]:
details['scores']

[0.00993203575284235,
 0.012072493453803068,
 0.0008942061751064822,
 0.011980444453581008,
 0.0012893744155991365]

---


# Test On Dataset

In [1]:
def compute_priors(ratings, alpha=0.01, R=5):
    num_users = len(ratings)
    num_items = len(ratings[0])
    rating_values = list(range(1, R + 1))

    prior_userbased = [[0 for _ in range(num_items)] for _ in rating_values]
    prior_itembased = [[0 for _ in range(num_users)] for _ in rating_values]

    for y in rating_values:
        y_index = y - 1

        # Prior user-based (per item j)
        for j in range(num_items):
            count_y = 0
            count_nonzero = 0
            for u in range(num_users):
                r = ratings[u][j]
                if r != 0:
                    count_nonzero += 1
                    if r == y:
                        count_y += 1
            prior_userbased[y_index][j] = (count_y + alpha) / (count_nonzero + alpha * R)

        # Prior item-based (per user u)
        for u in range(num_users):
            count_y = 0
            count_nonzero = 0
            for j in range(num_items):
                r = ratings[u][j]
                if r != 0:
                    count_nonzero += 1
                    if r == y:
                        count_y += 1
            prior_itembased[y_index][u] = (count_y + alpha) / (count_nonzero + alpha * R)

    return prior_userbased, prior_itembased

In [4]:
def compute_likelihood_userbased(ratings, u, i, y, alpha=0.01, R=5):
    num_users = len(ratings)
    num_items = len(ratings[0])
    Iu = [j for j in range(num_items) if j != i and ratings[u][j] != 0]
    product = 1.0

    for j in Iu:
        k = ratings[u][j]
        count_joint = 0
        count_cond = 0
        for v in range(num_users):
            if ratings[v][i] == y:
                if ratings[v][j] != 0:
                    count_cond += 1
                    if ratings[v][j] == k:
                        count_joint += 1
        prob = (count_joint + alpha) / (count_cond + alpha * R)
        # print(prob)
        product *= prob
    # print("======")
    # print(product, end="\n\n")

    return product


In [5]:
def compute_likelihood_itembased(ratings, u, i, y, alpha=0.01, R=5):
    num_users = len(ratings)
    num_items = len(ratings[0])
    Ui = [v for v in range(num_users) if v != u and ratings[v][i] != 0]
    product = 1.0

    for v in Ui:
        k = ratings[v][i]
        count_joint = 0
        count_cond = 0
        for j in range(num_items):
            if ratings[u][j] == y:
                if ratings[v][j] != 0:
                    count_cond += 1
                    if ratings[v][j] == k:
                        count_joint += 1
        prob = (count_joint + alpha) / (count_cond + alpha * R)
        product *= prob

    return product


In [6]:
def predict_rating(ratings, u, i, prior_userbased, prior_itembased, alpha=0.01, R=5, mode='hybrid'):
    scores = []
    all_likelihood_user = []  
    all_likelihood_item = []
    all_combined = []  

    for y in range(1, R + 1):
        y_index = y - 1
        prior_user = prior_userbased[y_index][i]
        prior_item = prior_itembased[y_index][u]

        likelihood_user = compute_likelihood_userbased(ratings, u, i, y, alpha, R)
        likelihood_item = compute_likelihood_itembased(ratings, u, i, y, alpha, R)

        all_likelihood_user.append(likelihood_user)
        all_likelihood_item.append(likelihood_item)

        if mode == 'user':
            score = prior_user * likelihood_user
        elif mode == 'item':
            score = prior_item * likelihood_item
        else:  # hybrid
            len_Iu = sum(1 for j in range(len(ratings[0])) if ratings[u][j] != 0 and j != i)
            len_Ui = sum(1 for v in range(len(ratings)) if v != u and ratings[v][i] != 0)

            score_user = (prior_user * likelihood_user) ** (1 / (1 + len_Iu)) if len_Iu > 0 else 0
            score_item = (prior_item * likelihood_item) ** (1 / (1 + len_Ui)) if len_Ui > 0 else 0
            score = score_user * score_item

        scores.append(score)
        all_combined.append(score)

    predicted_rating = scores.index(max(scores)) + 1

    return predicted_rating, {
        'scores': scores,
        'likelihood_user': all_likelihood_user,
        'likelihood_item': all_likelihood_item,
        'combined_score': all_combined
    }


# Train Test Split


In [7]:
import pandas as pd
import numpy as np

In [9]:


def load_movielens_1m(path="./ml-1m/ratings.dat"):
    df = pd.read_csv(path, sep='::', engine='python', names=['user', 'item', 'rating', 'timestamp'])

    num_users = df['user'].nunique()
    num_items = df['item'].nunique()

    user_map = {uid: idx for idx, uid in enumerate(df['user'].unique())}
    item_map = {iid: idx for idx, iid in enumerate(df['item'].unique())}

    ratings = np.zeros((num_users, num_items), dtype=int)
    for _, row in df.iterrows():
        u = user_map[row['user']]
        i = item_map[row['item']]
        ratings[u][i] = int(row['rating'])

    return ratings, user_map, item_map


In [10]:
def train_test_split_matrix(ratings, test_ratio=0.1, seed=42):
    np.random.seed(seed)
    train = ratings.copy()
    test = []

    for u in range(ratings.shape[0]):
        items_rated = np.where(ratings[u] > 0)[0]
        if len(items_rated) == 0:
            continue
        test_size = max(1, int(len(items_rated) * test_ratio))
        test_items = np.random.choice(items_rated, size=test_size, replace=False)
        for i in test_items:
            test.append((u, i, ratings[u][i]))  # simpan ground truth
            train[u][i] = 0  # kosongkan di train

    return train, test


In [11]:
ratings_full, user_map, item_map = load_movielens_1m("./ml-1m/ratings.dat")
ratings_train, test_set = train_test_split_matrix(ratings_full, test_ratio=0.1)



In [12]:
ratings_full

array([[5, 3, 3, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0]], shape=(6040, 3706))

In [12]:
ratings_train

array([[5, 3, 3, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0]], shape=(6040, 3706))

In [13]:
test_set

[(0, np.int64(19), np.int64(4)),
 (0, np.int64(41), np.int64(5)),
 (0, np.int64(47), np.int64(4)),
 (0, np.int64(12), np.int64(4)),
 (0, np.int64(43), np.int64(4)),
 (1, np.int64(162), np.int64(3)),
 (1, np.int64(88), np.int64(4)),
 (1, np.int64(136), np.int64(4)),
 (1, np.int64(106), np.int64(5)),
 (1, np.int64(160), np.int64(3)),
 (1, np.int64(66), np.int64(3)),
 (1, np.int64(87), np.int64(5)),
 (1, np.int64(61), np.int64(3)),
 (1, np.int64(159), np.int64(4)),
 (1, np.int64(154), np.int64(3)),
 (1, np.int64(170), np.int64(3)),
 (1, np.int64(128), np.int64(5)),
 (2, np.int64(104), np.int64(4)),
 (2, np.int64(200), np.int64(4)),
 (2, np.int64(58), np.int64(3)),
 (2, np.int64(196), np.int64(5)),
 (2, np.int64(124), np.int64(4)),
 (3, np.int64(208), np.int64(4)),
 (3, np.int64(63), np.int64(5)),
 (4, np.int64(256), np.int64(3)),
 (4, np.int64(195), np.int64(1)),
 (4, np.int64(226), np.int64(3)),
 (4, np.int64(9), np.int64(4)),
 (4, np.int64(310), np.int64(3)),
 (4, np.int64(59), np.int64

In [14]:

# Komputasi prior
prior_userbased, prior_itembased = compute_priors(ratings_train)

In [15]:
prior_userbased

[[0.005154274315498215,
  0.04843700663088096,
  0.012427976243240848,
  0.018620193731229646,
  0.014768460575719651,
  0.009026376391823555,
  0.007889142587197858,
  0.012528692458578523,
  0.020571176859261436,
  0.012934294302058759,
  0.02822214095397315,
  0.04146702159124519,
  0.008904008282798402,
  0.009139682660477152,
  0.04495349853317521,
  0.024989590339835367,
  0.0373336649670439,
  0.024406716067911078,
  0.006605716168355161,
  0.014018657129161905,
  0.017916215069977856,
  0.012368843352672508,
  0.007653896007309662,
  0.008621143582010962,
  0.07311077894037536,
  0.11240455265811841,
  0.015045729118653112,
  0.0658130968134187,
  0.016277195809830784,
  0.005058398453911436,
  0.02366354193058019,
  0.037708830548926014,
  0.0702728017796511,
  0.024013906709159392,
  0.036054498367300986,
  0.07889029714124772,
  0.06562884063908234,
  0.025210735393857184,
  0.006347839876758569,
  0.006247493427209126,
  0.008497651336217193,
  0.003389543975318034,
  0.010

In [None]:


# Prediksi dan evaluasi
from tqdm import tqdm

y_true = []
y_pred = []

for u, i, actual in tqdm(test_set):
    pred, _ = predict_rating(ratings_train, u, i, prior_userbased, prior_itembased, mode='hybrid')
    y_true.append(actual)
    y_pred.append(pred)

  0%|          | 7/97383 [00:30<103:32:27,  3.83s/it]

In [None]:
# from sklearn.metrics import mean_squared_error, mean_absolute_error
# import math

# rmse = math.sqrt(mean_squared_error(y_true, y_pred))
# mae = mean_absolute_error(y_true, y_pred)

# print("RMSE:", rmse)
# print("MAE :", mae)
