In [54]:
import numpy as np
import pandas as pd
from numpy.linalg import pinv


In [55]:
COLS = ["item_id","user_id","rating","timestamp"]
DATA_PATH = "../data/Movies_and_TV.csv"   

df = pd.read_csv(DATA_PATH, header=None, names=COLS)

print("df shape:", df.shape)
df.head()


df shape: (8765568, 4)


Unnamed: 0,item_id,user_id,rating,timestamp
0,1527665,A3478QRKQDOPQ2,5.0,1362960000
1,1527665,A2VHSG6TZHU1OB,5.0,1361145600
2,1527665,A23EJWOW1TLENE,5.0,1358380800
3,1527665,A1KM9FNEJ8Q171,5.0,1357776000
4,1527665,A38LY2SSHVHRYB,4.0,1356480000


In [57]:
TOP_K_USERS = 10000   

user_counts = df.groupby("user_id")["item_id"].count().sort_values(ascending=False)
top_users = set(user_counts.head(TOP_K_USERS).index)

print("Top users selected:", len(top_users))


Top users selected: 10000


In [58]:
df_users = df[df["user_id"].isin(top_users)].copy()

print("After user filtering:", df_users.shape)


After user filtering: (863176, 4)


In [59]:
I1 = "B00PCSVODW"
I2 = "B005GISDXW"

if I1 is None or I2 is None:
    counts_all = df_users.groupby("item_id")["user_id"].count().sort_values(ascending=False)
    I1 = counts_all.index[0]
    I2 = counts_all.index[1]

print("Target items:", I1, I2)


Target items: B00PCSVODW B005GISDXW


In [60]:
TOP_N_ITEMS = 1000   

item_counts = df_users.groupby("item_id")["user_id"].count().sort_values(ascending=False)
top_items = set(item_counts.head(TOP_N_ITEMS).index)

top_items.add(I1)
top_items.add(I2)

print("Final item count:", len(top_items))
print("I1 in sample?", I1 in top_items)
print("I2 in sample?", I2 in top_items)

df_sub = df_users[df_users["item_id"].isin(top_items)].copy()

print("Subset shape:", df_sub.shape)


Final item count: 1002
I1 in sample? True
I2 in sample? True
Subset shape: (199756, 4)


In [61]:
R = df_sub.pivot_table(
    index="user_id",
    columns="item_id",
    values="rating",
    aggfunc="mean"
)

print("R shape:", R.shape)
print("Users:", R.shape[0], "Items:", R.shape[1])
print("I1 exists?", I1 in R.columns)
print("I2 exists?", I2 in R.columns)


R shape: (9372, 1002)
Users: 9372 Items: 1002
I1 exists? True
I2 exists? True


## Cell 5 — Covariance Matrix (Pairwise MLE on overlaps only)

In [62]:
items = R.columns.to_list()
mu = R.mean(axis=0)

def pairwise_cov_mle(i, j):
    xi = R[i]
    xj = R[j]
    mask = xi.notna() & xj.notna()
    n = int(mask.sum())
    if n == 0:
        return 0.0
    a = xi[mask].values
    b = xj[mask].values
    return float(np.mean((a - a.mean()) * (b - b.mean())))

cov = pd.DataFrame(np.zeros((len(items), len(items))), index=items, columns=items)

for idx_i, i in enumerate(items):
    x = R[i].dropna()
    cov.loc[i, i] = float(np.mean((x - x.mean())**2)) if len(x) > 0 else 0.0
    for idx_j in range(idx_i + 1, len(items)):
        j = items[idx_j]
        c = pairwise_cov_mle(i, j)
        cov.loc[i, j] = c
        cov.loc[j, i] = c

print("Covariance matrix shape:", cov.shape)
cov.iloc[:5, :5]


Covariance matrix shape: (1002, 1002)


Unnamed: 0,076780192X,0767803434,0767805712,0767824571,0767827759
076780192X,0.986628,0.222222,-0.062222,-0.088435,-0.047337
0767803434,0.222222,0.928945,0.320988,-0.140625,0.416667
0767805712,-0.062222,0.320988,1.037721,0.905325,0.07438
0767824571,-0.088435,-0.140625,0.905325,0.7225,0.0
0767827759,-0.047337,0.416667,0.07438,0.0,1.562486


## Cell 6 — Select target items I1, I2

In [63]:
I1 = "B00PCSVODW"
I2 = "B005GISDXW"

if I1 is None or I2 is None:
    counts = R.notna().sum(axis=0).sort_values(ascending=False)
    I1 = counts.index[0]
    I2 = counts.index[1]

print("Target items:", I1, I2)
print("Counts:", int(R[I1].notna().sum()), int(R[I2].notna().sum()))


Target items: B00PCSVODW B005GISDXW
Counts: 6 5


## Cell 7 — Top-k peers

In [64]:
def top_k_peers(target_item, k=5):
    s = cov[target_item].drop(index=target_item).sort_values(ascending=False)
    return s.head(k)

top5_I1  = top_k_peers(I1, k=5)
top10_I1 = top_k_peers(I1, k=10)

top5_I2  = top_k_peers(I2, k=5)
top10_I2 = top_k_peers(I2, k=10)

print("Top5 peers for I1:\n", top5_I1)
print("\nTop10 peers for I1:\n", top10_I1)

print("\nTop5 peers for I2:\n", top5_I2)
print("\nTop10 peers for I2:\n", top10_I2)


Top5 peers for I1:
 B005ZCSP0K    0.5
076780192X    0.0
B004HO6HWK    0.0
B004BLJQOK    0.0
B004C03TK2    0.0
Name: B00PCSVODW, dtype: float64

Top10 peers for I1:
 B005ZCSP0K    0.5
076780192X    0.0
B004HO6HWK    0.0
B004BLJQOK    0.0
B004C03TK2    0.0
B004EPYZOY    0.0
B004EPYZP8    0.0
B004EPYZQ2    0.0
B004EPYZQC    0.0
B004EPYZTE    0.0
Name: B00PCSVODW, dtype: float64

Top5 peers for I2:
 076780192X    0.0
B004HO6HWK    0.0
B004BLJQOK    0.0
B004C03TK2    0.0
B004EPYZOY    0.0
Name: B005GISDXW, dtype: float64

Top10 peers for I2:
 076780192X    0.0
B004HO6HWK    0.0
B004BLJQOK    0.0
B004C03TK2    0.0
B004EPYZOY    0.0
B004EPYZP8    0.0
B004EPYZQ2    0.0
B004EPYZQC    0.0
B004EPYZTE    0.0
B004EPYZU8    0.0
Name: B005GISDXW, dtype: float64


## Cell 8 — Reduced space (Top-5)

In [65]:
def user_reduced_space(user_id, peer_items):
    v = R.loc[user_id, peer_items]
    return v - mu[peer_items]

sample_user = R.index[0]
peer5 = top5_I1.index.tolist()
print("Sample user:", sample_user)
print(user_reduced_space(sample_user, peer5))


Sample user: A100JCBNALJFAW
item_id
B005ZCSP0K   NaN
076780192X   NaN
B004HO6HWK   NaN
B004BLJQOK   NaN
B004C03TK2   NaN
dtype: float64


## Cell 9 — Predict missing ratings (Top-5)

In [66]:
def predict_rating(user_id, target_item, peer_items):
    r_peers = R.loc[user_id, peer_items]
    avail = r_peers.notna()
    used_peers = r_peers.index[avail].tolist()

    if len(used_peers) == 0:
        return float(mu[target_item])

    rP = r_peers[used_peers].values.astype(float)
    muP = mu[used_peers].values.astype(float)

    Sigma_iP = cov.loc[target_item, used_peers].values.reshape(1, -1)
    Sigma_PP = cov.loc[used_peers, used_peers].values

    pred = mu[target_item] + (Sigma_iP @ pinv(Sigma_PP) @ (rP - muP).reshape(-1, 1)).item()
    return float(pred)

def batch_predict(target_item, peer_items):
    missing_users = R.index[R[target_item].isna()].tolist()
    preds = [(u, predict_rating(u, target_item, peer_items)) for u in missing_users]
    return pd.DataFrame(preds, columns=["user_id", f"pred_{target_item}"])

pred_I1_top5 = batch_predict(I1, top5_I1.index.tolist())
pred_I2_top5 = batch_predict(I2, top5_I2.index.tolist())

pred_I1_top5.head(), pred_I2_top5.head()


(          user_id  pred_B00PCSVODW
 0  A100JCBNALJFAW              1.5
 1  A100RW34WSLTUW              1.5
 2  A100WFKYVRPVX7              1.5
 3  A100WO06OQR8BQ              1.5
 4  A10175AMUHOQC4              1.5,
           user_id  pred_B005GISDXW
 0  A100JCBNALJFAW              2.4
 1  A100RW34WSLTUW              2.4
 2  A100WFKYVRPVX7              2.4
 3  A100WO06OQR8BQ              2.4
 4  A10175AMUHOQC4              2.4)

## Cell 10 — Predict missing ratings (Top-10)

In [67]:
pred_I1_top10 = batch_predict(I1, top10_I1.index.tolist())
pred_I2_top10 = batch_predict(I2, top10_I2.index.tolist())

pred_I1_top10.head(), pred_I2_top10.head()


(          user_id  pred_B00PCSVODW
 0  A100JCBNALJFAW              1.5
 1  A100RW34WSLTUW              1.5
 2  A100WFKYVRPVX7              1.5
 3  A100WO06OQR8BQ              1.5
 4  A10175AMUHOQC4              1.5,
           user_id  pred_B005GISDXW
 0  A100JCBNALJFAW              2.4
 1  A100RW34WSLTUW              2.4
 2  A100WFKYVRPVX7              2.4
 3  A100WO06OQR8BQ              2.4
 4  A10175AMUHOQC4              2.4)

## Cell 11 — Compare Top-5 vs Top-10

In [68]:
def compare_top5_top10(target_item, pred5, pred10):
    m = pred5.merge(pred10, on="user_id", how="inner", suffixes=("_top5","_top10"))
    m["abs_diff"] = (m[f"pred_{target_item}_top5"] - m[f"pred_{target_item}_top10"]).abs()
    return m.sort_values("abs_diff", ascending=False)

cmp_I1 = compare_top5_top10(I1, pred_I1_top5, pred_I1_top10)
cmp_I2 = compare_top5_top10(I2, pred_I2_top5, pred_I2_top10)

print("I1 biggest diffs:")
display(cmp_I1.head(10))

print("I2 biggest diffs:")
display(cmp_I2.head(10))

print("Mean abs diff I1:", float(cmp_I1["abs_diff"].mean()) if len(cmp_I1) else 0.0)
print("Mean abs diff I2:", float(cmp_I2["abs_diff"].mean()) if len(cmp_I2) else 0.0)


I1 biggest diffs:


Unnamed: 0,user_id,pred_B00PCSVODW_top5,pred_B00PCSVODW_top10,abs_diff
2572,A22RY8N8CNDF3A,0.935622,3.080595,2.144974
4407,A2TXH9QKLD4ZVX,1.343536,0.733441,0.610094
2258,A1XT8AJB7S9JJG,1.692656,1.232204,0.460452
9048,AV6QDP8Q0ONK4,1.655819,2.064797,0.408978
872,A1CLHLW9PFKG9Q,1.659186,1.325656,0.333529
5174,A34D06JL7LC6MU,1.932321,1.665848,0.266473
1921,A1SHLQKJSPCCNZ,1.343536,1.116838,0.226697
7680,AAZRWLML88IZK,1.659186,1.439648,0.219538
6737,A3QH6BEY6RYQR0,1.619662,1.838219,0.218557
9138,AWG2O9C42XW5G,0.70357,0.494172,0.209398


I2 biggest diffs:


Unnamed: 0,user_id,pred_B005GISDXW_top5,pred_B005GISDXW_top10,abs_diff
0,A100JCBNALJFAW,2.4,2.4,0.0
6257,A3JYJ907WWREJH,2.4,2.4,0.0
6241,A3JPFWKS83R49V,2.4,2.4,0.0
6242,A3JPPR6JT75N0E,2.4,2.4,0.0
6243,A3JSDTPWSYFW23,2.4,2.4,0.0
6244,A3JSO0N085OQXU,2.4,2.4,0.0
6245,A3JSROIZ1SFTS,2.4,2.4,0.0
6246,A3JTA7SAV9NSDE,2.4,2.4,0.0
6247,A3JTBJC5WSEZ7Q,2.4,2.4,0.0
6248,A3JU9CWXUVHUPU,2.4,2.4,0.0


Mean abs diff I1: 0.0007480444017118762
Mean abs diff I2: 0.0


## Cell 12 — Save results (optional)

In [None]:
pred_I1_top5.to_csv("pred_I1_top5.csv", index=False)
pred_I1_top10.to_csv("pred_I1_top10.csv", index=False)
pred_I2_top5.to_csv("pred_I2_top5.csv", index=False)
pred_I2_top10.to_csv("pred_I2_top10.csv", index=False)
cmp_I1.to_csv("compare_I1_top5_vs_top10.csv", index=False)
cmp_I2.to_csv("compare_I2_top5_vs_top10.csv", index=False)