In [1]:
import numpy as np
import math
from math import prod
from decimal import Decimal, getcontext
getcontext().prec = 5

---


# Functions


## Compute User & item prior


In [None]:
def compute_priors(ratings,plausible_rating, alpha=1):
    num_users = len(ratings) # banyak user
    num_items = len(ratings[0]) # banyak item
    rating_values = plausible_rating # biar dinamis sesuai dataset. plausible rating di ambil di parameter
    R = len(rating_values)


    # buat tempat kosong biar diisi prior nanti 
    #prior userbased itu y plausible x num item
    prior_userbased = [[0 for _ in range(num_items)] for _ in rating_values] 
    
    # prior itembased itu y plausible x num user 
    prior_itembased = [[0 for _ in range(num_users)] for _ in rating_values]



    y_index = 0
    
    # loop sebanyak plausible
    for y in rating_values:
        # Prior user-based (per item j)
        for j in range(num_items):
            count_y = 0
            count_nonzero = 0
            for u in range(num_users):
                r = ratings[u][j]   
                if r != 0:
                    count_nonzero += 1
                    if r == y:
                        count_y += 1
            prior_userbased[y_index][j] = (count_y + alpha) / (count_nonzero + alpha * R)

        # Prior item-based (per user u)
        for u in range(num_users):
            count_y = 0
            count_nonzero = 0
            for j in range(num_items):
                r = ratings[u][j]
                if r != 0:
                    count_nonzero += 1
                    if r == y:
                        count_y += 1
            prior_itembased[y_index][u] = (count_y + alpha) / (count_nonzero + alpha * R)
        y_index = y_index + 1
        
        
    # dapet matrix hasil userbased dan itembased
    return prior_userbased, prior_itembased

## Compute likelihood User


In [None]:
def compute_likelihood_userbased(ratings, u, i, y, alpha=0.01, R=8):
    num_users = len(ratings)
    num_items = len(ratings[0])
    
    # loop sebanyak item yang di rating saja oleh user u, karena kalau ratingnya 0 sudah pasti tidak masuk. jadi untuk apa di loop di semua
    Iu = [j for j in range(num_items) if j != i and ratings[u][j] != 0]
    
    # Precompute user yang Rui = y saja, karna untuk apa di loop semua 
    users_with_y = [v for v in range(num_users) if ratings[v][i] == y] # agar komputasi tidak selalu loop setiap user item, dicari saja dulu yang cocok, nanti loopnya berdasarkan  data disini
    
    product = Decimal(1.0)
    
    for j in Iu:
        k = ratings[u][j]
        count_joint = 0
        count_cond = 0
        
        # Only iterate through users who rated item i as y
        for v in users_with_y:
            if ratings[v][j] != 0:
                count_cond += 1
                if ratings[v][j] == k:
                    count_joint += 1
        
        prob = (count_joint + alpha) / (count_cond + alpha * R)
        product *= Decimal(prob)
    
    return product

## Compute likelihood Item


In [None]:
def compute_likelihood_itembased(ratings, u, i, y, alpha=0.01, R=8):
    num_users = len(ratings)
    num_items = len(ratings[0])
    
    
     # loop sebanyak user yang merating item i saja, karena kalau ratingnya 0 sudah pasti tidak masuk. jadi untuk apa di loop di semua
    Ui = [v for v in range(num_users) if v != u and ratings[v][i] != 0]
    
    # Precompute item yang Rvi = y saja, karna untuk apa di loop semua 
    items_with_y = [j for j in range(num_items) if ratings[u][j] == y] # agar komputasi tidak selalu loop setiap user item, dicari saja dulu yang cocok, nanti loopnya berdasarkan  data disini
    
    product = Decimal(1.0)
    
    for v in Ui:
        k = ratings[v][i]
        count_joint = 0
        count_cond = 0
        
        # Only iterate through items rated as y by user u
        for j in items_with_y:
            if ratings[v][j] != 0:
                count_cond += 1
                if ratings[v][j] == k:
                    count_joint += 1
        
        prob = (count_joint + alpha) / (count_cond + alpha * R)
        product *= Decimal(prob)
    
    return product

# Predict Functions


In [None]:
def predict_rating(ratings, u, i, prior_userbased, prior_itembased,plausible_rating, alpha=1):
    scores = []
    all_likelihood_user = []  
    all_likelihood_item = []
    R = len(plausible_rating)  
    
    
    ## Len UI dan IU dihitung disini karena kemaren redundance, dihitungnya tiap loop y rating, makanya dibuat disini
    len_Iu = sum(1 for j in range(len(ratings[0])) if ratings[u][j] != 0) 
    len_Ui = sum(1 for v in range(len(ratings)) if ratings[v][i] != 0)


    # loop untuk sebanyak y plausible rating, karena mau mencari semua score kemungkinan rating nya
    y_index = 0
    for y in plausible_rating:
        prior_user = prior_userbased[y_index][i]
        prior_item = prior_itembased[y_index][u]
    

        # live hitung likellihood
        likelihood_user = compute_likelihood_userbased(ratings, u, i, y, alpha, R)
        likelihood_item = compute_likelihood_itembased(ratings, u, i, y, alpha, R)
        
        # simpan sebagai justifikasi
        all_likelihood_user.append(likelihood_user)
        all_likelihood_item.append(likelihood_item)
        
        
        
        
        score_item = (Decimal(prior_item) * likelihood_item) ** Decimal(1 / (1 + len_Ui)) if len_Ui > 0 else 0
        score_user = (Decimal(prior_user) * likelihood_user) ** Decimal(1 / (1 + len_Iu)) if len_Iu > 0 else 0


        # rumus hybrid disederhanakan (score user dan item sudah dipangkat ui dan iu)
        score = score_user * score_item

        # simpan semua score di list, untuk nanti diambil yang score paling tinggi
        scores.append(score)
        y_index += 1


    # index dengan score paling tinggi itu lah prediksi ratingnya , hasil data ratingnya berapa yaitu plausible rating index yang tertingi dari score 
    predicted_rating = plausible_rating[scores.index(max(scores))]

    
    return predicted_rating, {
        'scores': scores,
        'likelihood_user': all_likelihood_user,
        'likelihood_item': all_likelihood_item
    }


## Load Data Full


Untuk justifikasi tain test split


In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [None]:

# mau justifikasi nanti benar tidak pemetaan hasil matrixnya untuk training matrix
def load_filmtrust_full(path):
    df = pd.read_csv(path, sep=' ', engine='python', names=['user', 'item', 'rating'])

    num_users = df['user'].nunique()
    num_items = df['item'].nunique()

    user_map = {uid: idx for idx, uid in enumerate(df['user'].unique())}
    item_map = {iid: idx for idx, iid in enumerate(df['item'].unique())}

    ratings = np.zeros((num_users, num_items))
    for _, row in df.iterrows():
        u = user_map[row['user']]
        i = item_map[row['item']]
        ratings[u][i] = row['rating']

    return ratings

ratings_full = load_filmtrust_full("./film-trust/ratings.txt")


In [8]:
ratings_full

array([[2. , 4. , 3.5, ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 4. , 0. , ..., 0. , 0. , 0. ],
       [1.5, 3. , 2. , ..., 0. , 0. , 0. ]], shape=(1508, 2071))

# Train Test Split


In [None]:
full_df = pd.read_csv('./film-trust/ratings.txt', sep=' ', names=['user', 'item', 'rating'])
user_ids = full_df['user'].unique()
item_ids = full_df['item'].unique()

#mapping ulang, user 1 itu index 0, atau bisa dibilang user 0,untuk setiap user dan item dan seterusnya
user_map = {uid: idx for idx, uid in enumerate(user_ids)}
item_map = {iid: idx for idx, iid in enumerate(item_ids)}

In [None]:
user_map

In [None]:
item_map

jadi karena loop proses dimulai dari item 0 bukan satu, dibuat mappig agar konsisten antara train test dan data asli

jadi sebenarnya misalkan user 1 di mapping jadi index 0 / user 0 , agar terbaca di list, coba lihat di mapping, user 1 menjadi user 0

jadi pembagian test trainnya sebenarnya saat ada data
470 5 3.5

sebenarnya itu di data asli ratings.txt itu 471 6 3.5 , karena mapingnya dimulai dari nol maka menjadi 470 5 3.5


In [None]:

# Split dataset , pake pemetaan sebelumnya, yang user 1 jadi user 0
full_df['user_idx'] = full_df['user'].map(user_map)
full_df['item_idx'] = full_df['item'].map(item_map)

#pake library
train_df, test_df = train_test_split(full_df[['user_idx', 'item_idx', 'rating']], test_size=0.2, random_state=42)

#simpan
print(f"\nTrain set: {len(train_df)} ratings ({len(train_df)/len(full_df)*100:.1f}%)")
print(f"Test set: {len(test_df)} ratings ({len(test_df)/len(full_df)*100:.1f}%)")
train_df.to_csv('./film-trust/train.txt', sep=' ', header=False, index=False)
test_df.to_csv('./film-trust/test.txt', sep=' ', header=False, index=False)


Train set: 28397 ratings (80.0%)
Test set: 7100 ratings (20.0%)


coba justifikasi di test.txt dan train.txt
seharusnya data disana itu data asli yang user dan itemnya dari 0

jadi kalau mau cek
di data test.txt 470 5 3.5 itu sebenarnya 469 4 3.5 di data ratings.txt

## Get Plausible Ratings


In [None]:

# mau cari banyak y plausible ratingnya apa saja di dataset
temp_df = pd.read_csv("./film-trust/ratings.txt", sep=' ', engine='python', names=['rating'])
plausible_rating = temp_df['rating'].unique()
plausible_rating.sort()
plausible_rating

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. ])

In [None]:
# data plausible nya
pR = plausible_rating
num_r = len(pR)

#R
num_r

## Load train and test data


In [None]:
# buat martix dari data train, dengan pemetaan sebelumnya, sebanyak user x item asli
def load_filmtrust_train_make_matrix(path, user_map, item_map):
    df = pd.read_csv(path, sep=' ', engine='python', names=['user_idx', 'item_idx', 'rating'])

    num_users = len(user_map)
    num_items = len(item_map)

    ratings = np.zeros((num_users, num_items))
    for _, row in df.iterrows():
        u = int(row['user_idx']) 
        i = int(row['item_idx']) 
        ratings[u][i] = row['rating']

    return ratings


In [13]:
ratings_train = load_filmtrust_train_make_matrix('./film-trust/train.txt', user_map, item_map)

In [14]:
ratings_train

array([[2. , 4. , 3.5, ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 4. , 0. , ..., 0. , 0. , 0. ],
       [1.5, 3. , 2. , ..., 0. , 0. , 0. ]], shape=(1508, 2071))

In [None]:
# mau ambil data test, untuk di prediksi semua yang ada disini, dan jadi groundtrut
def load_test_ratings(path):
    data = np.loadtxt(path, dtype={'names': ('u', 'i', 'r'), 'formats': (int, int, float)})
    test = np.array(
        [(int(row[0]), int(row[1]), float(row[2])) for row in data],
        dtype=[('u', int), ('i', int), ('r', float)]
    )
    return test


In [16]:
test_set = load_test_ratings("./film-trust/test.txt")

In [17]:
test_set

array([( 470,   5, 3.5), (1033, 902, 3. ), (1248, 256, 3. ), ...,
       (1082, 256, 2. ), ( 570,  11, 4. ), ( 525,   3, 3. )],
      shape=(7100,), dtype=[('u', '<i8'), ('i', '<i8'), ('r', '<f8')])

## Precompute User & Item Prior


In [19]:

prior_userbased, prior_itembased = compute_priors(ratings_train,pR)

In [None]:
# harusnya sebanyak #R x jumpah item
prior_userbased

[[0.020437956204379562,
  0.014925373134328358,
  0.021775544388609715,
  0.027829313543599257,
  0.014625228519195612,
  0.036011080332409975,
  0.01907032181168057,
  0.01415929203539823,
  0.019762845849802372,
  0.026533996683250415,
  0.018252933507170794,
  0.030303030303030304,
  0.012558869701726845,
  0.125,
  0.07692307692307693,
  0.08333333333333333,
  0.027417027417027416,
  0.125,
  0.1,
  0.06666666666666667,
  0.09090909090909091,
  0.1111111111111111,
  0.1,
  0.09090909090909091,
  0.1111111111111111,
  0.05263157894736842,
  0.1,
  0.14285714285714285,
  0.08333333333333333,
  0.1,
  0.05263157894736842,
  0.1111111111111111,
  0.1,
  0.1,
  0.1111111111111111,
  0.2,
  0.1111111111111111,
  0.09090909090909091,
  0.1,
  0.1111111111111111,
  0.1111111111111111,
  0.18181818181818182,
  0.08333333333333333,
  0.1,
  0.1,
  0.1,
  0.2,
  0.1,
  0.1,
  0.1,
  0.15384615384615385,
  0.08333333333333333,
  0.1,
  0.1,
  0.08333333333333333,
  0.09090909090909091,
  0.181

In [None]:
# harusnya sebanyak #R x jumpah user
prior_itembased

[[0.058823529411764705,
  0.1111111111111111,
  0.06976744186046512,
  0.15384615384615385,
  0.1111111111111111,
  0.09090909090909091,
  0.10526315789473684,
  0.1,
  0.1111111111111111,
  0.15384615384615385,
  0.09090909090909091,
  0.07692307692307693,
  0.2,
  0.2,
  0.125,
  0.058823529411764705,
  0.1111111111111111,
  0.125,
  0.058823529411764705,
  0.1,
  0.021739130434782608,
  0.058823529411764705,
  0.06666666666666667,
  0.1,
  0.04081632653061224,
  0.18181818181818182,
  0.07407407407407407,
  0.022222222222222223,
  0.17777777777777778,
  0.24,
  0.07692307692307693,
  0.058823529411764705,
  0.06666666666666667,
  0.043478260869565216,
  0.07692307692307693,
  0.03125,
  0.0196078431372549,
  0.20833333333333334,
  0.05,
  0.1111111111111111,
  0.07692307692307693,
  0.15384615384615385,
  0.07692307692307693,
  0.07692307692307693,
  0.07692307692307693,
  0.045454545454545456,
  0.07692307692307693,
  0.09090909090909091,
  0.09090909090909091,
  0.0263157894736842

Test prediksi tipis tipis

In [23]:
pred, _ = predict_rating(ratings_train, 470, 5, prior_userbased, prior_itembased,plausible_rating = pR)
_


{'scores': [Decimal('0.012455'),
  Decimal('0.015782'),
  Decimal('0.017744'),
  Decimal('0.014440'),
  Decimal('0.021397'),
  Decimal('0.021848'),
  Decimal('0.027206'),
  Decimal('0.023695')],
 'likelihood_user': [Decimal('2.7388E-12'),
  Decimal('5.9473E-11'),
  Decimal('1.2763E-10'),
  Decimal('5.1555E-12'),
  Decimal('2.4223E-10'),
  Decimal('2.1086E-10'),
  Decimal('4.5197E-9'),
  Decimal('3.6582E-10')],
 'likelihood_item': [Decimal('1.6185E-319'),
  Decimal('1.6185E-319'),
  Decimal('5.2122E-314'),
  Decimal('8.1038E-312'),
  Decimal('5.6482E-306'),
  Decimal('1.0411E-307'),
  Decimal('4.9001E-308'),
  Decimal('1.2198E-300')]}

In [24]:
pred

np.float64(3.5)

## Loop prediction for each data test


In [None]:


# Prediksi dan evaluasi
from tqdm import tqdm

# simpen data groundtruth di data test disini
y_true = []
#simpen data prediksi disini
y_pred = []

#loop untuk stiap data test
for u, i, actual in tqdm(test_set):
    pred, _ = predict_rating(ratings_train, u, i, prior_userbased, prior_itembased,plausible_rating= pR)
    y_true.append(actual)
    y_pred.append(pred)

100%|██████████| 7100/7100 [03:49<00:00, 30.98it/s]


Batch, belum justifikasi

In [None]:
# from math import ceil
# import numpy as np
# from tqdm import tqdm

# def predict_in_batches(test_set, ratings, prior_userbased, prior_itembased, pR, batch_size=1000):
#     y_true = []
#     y_pred = []
    
#     n_batches = ceil(len(test_set) / batch_size)
    
#     for batch_idx in tqdm(range(n_batches)):
#         start = batch_idx * batch_size
#         end = start + batch_size
#         batch = test_set[start:end]
        
#         for u, i, actual in batch:
#             pred, _ = predict_rating(ratings, u, i, prior_userbased, prior_itembased, pR)
#             y_true.append(actual)
#             y_pred.append(pred)
    
#     return y_true, y_pred

# # Gunakan dengan:
# y_true, y_pred = predict_in_batches(test_set, ratings_train, prior_userbased, prior_itembased, pR, batch_size=500)

## Export to evaluation


simpan untuk lebih mudah saja evaluasinya

In [26]:
import pandas as pd

df_results = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
df_results.to_csv('./film-trust/predictions.csv', index=False)

In [27]:
df_test = pd.DataFrame(test_set)
df_test.to_csv('test_set.csv', index=False)

In [28]:

df_full = pd.DataFrame(ratings_full)
df_full.to_csv('ratings_full.csv', index=False)

df_train = pd.DataFrame(ratings_train)
df_train.to_csv('ratings_train.csv', index=False)

df_test = pd.DataFrame(test_set)
df_test.to_csv('test_set.csv', index=False)



In [29]:
df_prior_userbased = pd.DataFrame(prior_userbased)
df_prior_userbased.to_csv('prior_userbased.csv', index=False)

df_prior_itembased = pd.DataFrame(prior_itembased)
df_prior_itembased.to_csv('prior_itembased.csv', index=False)



---


In [30]:

y_pred = pd.read_csv('./film-trust/predictions.csv')['y_pred'].tolist()
y_true = pd.read_csv('./film-trust/predictions.csv')['y_true'].tolist()

## Using Library


In [31]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_true, y_pred)
print("MAE :", mae)


MAE : 0.7136619718309859


## Using Manual


### Mae Dari rumus PPT


In [32]:

total_error = 0
n = len(y_true)

for actual, pred in zip(y_true, y_pred):
    total_error += abs(actual - pred)

mae = total_error / n
print(f"MAE : {mae}")

MAE : 0.7136619718309859


### Mae Dari rumus Paper


In [33]:
user_data = {}

# proses penyusunan ulang rate predict & rate asli agar mudah diakses
for i in range(len(test_set)): # loop sebanyak panjang test set min (1508)
    user_id, _, actual = test_set[i] # var penampung user & rate asli
    # print(user_id,actual)
    predicted = y_pred[i] # var penampung predict rating
    if user_id not in user_data: # cek apakah user id U ada di user data agar menghindari error oengisian kosong
        user_data[user_id] = []
    
    user_data[user_id].append((actual, predicted)) # tambah rating asli $ predict ke user data


#penghitungan MAE
mae_total = 0 # MAE
for user_id in user_data: # akses setiap user pada user data
    total_abs_error = 0 # penampung nilai error mutlak/user
    ratings = user_data[user_id] # ambil nilai rating pred & actual
    
    for actual, predicted in ratings: # indexing nilai rating pred & actual setiap item pada user U
        total_abs_error += abs(actual - predicted) # r u,i (pred) - r u,i (actual)
        print(f"pred {predicted}, actual {actual}")
    
    #MAEu
    mae_user = total_abs_error / len(ratings) #hitung MAEu

    mae_total += mae_user # tamping MAEu ke MAE untuk proses perhitungan MAE nantinya
    print(f"User {user_id}: MAE = {mae_user} (dari {len(ratings)} item)")



pred 3.5, actual 3.5
pred 2.5, actual 2.5
pred 4.0, actual 2.0
pred 3.0, actual 2.0
User 470: MAE = 0.75 (dari 4 item)
pred 3.5, actual 3.0
pred 4.0, actual 4.0
pred 4.0, actual 4.0
pred 0.5, actual 3.0
pred 4.0, actual 2.5
pred 4.0, actual 3.0
pred 4.0, actual 3.5
pred 4.0, actual 4.0
pred 4.0, actual 2.5
pred 4.0, actual 3.0
pred 4.0, actual 4.0
pred 4.0, actual 4.0
pred 4.0, actual 4.0
pred 4.0, actual 3.5
pred 4.0, actual 3.0
User 1033: MAE = 0.6666666666666666 (dari 15 item)
pred 3.5, actual 3.0
pred 3.5, actual 3.0
pred 3.0, actual 3.0
pred 3.0, actual 3.5
pred 0.5, actual 1.5
pred 3.0, actual 3.0
pred 3.0, actual 3.5
pred 3.5, actual 3.0
pred 3.5, actual 3.5
pred 3.0, actual 1.0
pred 3.0, actual 3.5
pred 3.0, actual 4.0
pred 3.5, actual 0.5
pred 3.0, actual 3.5
pred 3.0, actual 2.5
pred 3.0, actual 4.0
pred 3.5, actual 3.0
pred 1.0, actual 2.0
pred 3.0, actual 1.5
pred 3.5, actual 3.0
pred 3.5, actual 4.0
pred 3.0, actual 1.0
pred 3.0, actual 1.5
pred 3.0, actual 4.0
pred 0.5, a

In [34]:
# Langkah 3: hitung MAE keseluruhan dari rata-rata MAE user
overall_mae = mae_total / len(user_data)
# print(y_pred)
print(f"MAE Tot : {overall_mae}")

MAE Tot : 0.718353240767965
