# PCA with Mean Filling

**Mohamed desoky**
**222101362**

**Target Items**:
- `B00PCSVODW`
- `B005GISDXW`


In [1]:
import numpy as np
import pandas as pd

# --- Input data --------------------------------------------------------------
# Location of the ratings CSV and the column names applied when it is read.
DATA_PATH = "../data/Movies_and_TV.csv"
COLS = ["item_id", "user_id", "rating", "timestamp"]

# --- Target items ------------------------------------------------------------
I1, I2 = "B00PCSVODW", "B005GISDXW"
TARGETS = [I1, I2]

# --- Peer search -------------------------------------------------------------
MIN_COMMON_USERS = 3      # passed as `min_common` when ranking peers
MAX_CANDIDATES = 5_000    # passed as `max_candidates` when collecting candidates

# --- PCA ---------------------------------------------------------------------
PCA_K = 2                 # number of principal components kept
PCA_USER_SAMPLE = 5_000   # number of unique user ids placed in the PCA matrix


## 1) Load Data + Basic Cleaning




In [2]:
# Read the raw ratings file; it has no header row, so column names come from COLS.
df = pd.read_csv(DATA_PATH, header=None, names=COLS)
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

# Drop incomplete rows BEFORE casting the ids to str: astype(str) turns a missing
# value into the literal text "nan", which dropna() can no longer detect.
df = df.dropna(subset=["item_id","user_id","rating"]).copy()
df["item_id"] = df["item_id"].astype(str).str.strip()
df["user_id"] = df["user_id"].astype(str).str.strip()
df["rating"] = df["rating"].astype(np.float32)

# Collapse repeated (user, item) pairs into one row holding their mean rating.
# This also discards the timestamp column.
df = df.groupby(["user_id","item_id"], as_index=False)["rating"].mean()

print("df shape:", df.shape)
print(df.head())


df shape: (8506849, 3)
                user_id     item_id  rating
0  A00013803RVZPCZKTT9U  B003ZTNT2Y     1.0
1  A0001392IVCRENBEIEYS  6302409365     5.0
2  A0001598OL7FAN6XNMK9  B00BMRTPEM     5.0
3  A0001598OL7FAN6XNMK9  B00IV3FLO8     4.0
4  A0001598OL7FAN6XNMK9  B00OGL6S64     5.0


## 2) Validate Target Items Exist



In [3]:
# For every target id, record whether it occurs at least once in the item_id column.
present = dict((t, df["item_id"].eq(t).any()) for t in TARGETS)
print("Targets present:", present)

# Stop early if any target is absent, printing a few real ids first so that a
# formatting mismatch is easy to spot.
if any(not found for found in present.values()):
    print("\nExample item_id values:", df["item_id"].dropna().astype(str).unique()[:20])
    raise ValueError("One or both target items were not found in item_id column. Check item_id format.")


Targets present: {'B00PCSVODW': True, 'B005GISDXW': True}


## 3) (1) Average Rating for Target Items + (3) Average Rating for Each Item



In [4]:
# Mean rating per item, stored as float32 and indexed by item_id.
item_mean = df["rating"].groupby(df["item_id"]).mean().astype(np.float32)

# (1) mean rating of each target item
target_means = item_mean.loc[TARGETS]
print("Target means (Step 1):")
print(target_means)

# (3) mean rating of every item: report how many there are, then preview a few
print("\nTotal items with mean:", len(item_mean))
item_mean.head()


Target means (Step 1):
item_id
B00PCSVODW    1.447853
B005GISDXW    1.453237
Name: rating, dtype: float32

Total items with mean: 182032


item_id
0000143502    5.0
0000143529    5.0
0000143561    3.5
0000143588    4.7
0000695009    4.0
Name: rating, dtype: float32

## 4) (2) Mean Filling for Target Items (Concept)




In [5]:
# Lookup table {item_id: {user_id: rating}} shared by the similarity helpers.
# The groups are iterated directly instead of going through
# groupby(...).apply(lambda ...): pandas deprecates apply() operating on the
# grouping columns, and the per-group callback is slower for the same result.
# .tolist() yields plain Python str/float values, like Series.to_dict() does.
_ratings_by_item = {
    item_id: dict(zip(group["user_id"].tolist(), group["rating"].tolist()))
    for item_id, group in df.groupby("item_id")
}

def pair_stats_long(df, item_mean, item_a, item_b):
    """Covariance and cosine similarity of two items over their common raters.

    Ratings are read from the module-level ``_ratings_by_item`` lookup; ``df``
    is accepted but not read by this function.

    Parameters
    ----------
    df : unused here.
    item_mean : mapping indexable by item id, giving each item's mean rating.
    item_a, item_b : ids of the two items to compare.

    Returns
    -------
    (cov, cos, n) where ``n`` is the number of users who rated both items.
    ``cov`` and ``cos`` are ``np.nan`` when ``n < 2``.
    """
    by_user_a = _ratings_by_item.get(item_a, {})
    by_user_b = _ratings_by_item.get(item_b, {})

    shared = set(by_user_a.keys()) & set(by_user_b.keys())
    n = len(shared)
    if n < 2:
        return np.nan, np.nan, n

    # Both vectors are filled by walking the same set, so position k in each
    # one refers to the same user.
    vals_a = np.fromiter((by_user_a[u] for u in shared), dtype=np.float32, count=n)
    vals_b = np.fromiter((by_user_b[u] for u in shared), dtype=np.float32, count=n)

    # Deviations are taken from each item's overall mean (item_mean), not from
    # the mean over the shared users only.
    dev_a = vals_a - float(item_mean[item_a])
    dev_b = vals_b - float(item_mean[item_b])

    dot_ab = np.dot(dev_a, dev_b)
    cov = float(dot_ab / (n - 1))

    norm_a = float(np.linalg.norm(dev_a))
    norm_b = float(np.linalg.norm(dev_b))
    # The small constant keeps the division defined when a deviation vector is all zeros.
    cos = float(dot_ab / (norm_a * norm_b + 1e-12))

    return cov, cos, n

def candidate_items(df, target_item, max_candidates=None):
    """List the items that share at least one rater with ``target_item``.

    Ratings are read from the module-level ``_ratings_by_item`` lookup; ``df``
    is accepted but not read by this function.

    Parameters
    ----------
    df : unused here.
    target_item : id of the item whose co-rated items are wanted.
    max_candidates : optional cap. When more candidates exist, only the
        ``max_candidates`` with the largest number of shared raters are kept.

    Returns
    -------
    list of item ids. Without truncation the order follows
    ``_ratings_by_item``; with truncation it is by descending shared-rater
    count, ties keeping the ``_ratings_by_item`` order. Either way the order is
    the same on every run for the same data, unlike the order of an
    intermediate ``set`` of strings.
    """
    target_users = set(_ratings_by_item.get(target_item, {}))
    if not target_users:
        return []

    # One pass: count shared raters per item. set.intersection() walks the
    # item's dict keys directly, so no per-item set copy is built.
    overlap = {}
    for item, users_dict in _ratings_by_item.items():
        if item == target_item:
            continue
        n_shared = len(target_users.intersection(users_dict))
        if n_shared:
            overlap[item] = n_shared

    cand = list(overlap)
    if max_candidates is not None and len(cand) > max_candidates:
        # sorted() is stable, so equal counts keep their deterministic order.
        cand = sorted(cand, key=overlap.__getitem__, reverse=True)[:max_candidates]

    return cand

  _ratings_by_item = df.groupby("item_id").apply(


## 6) (7) Top-5 and Top-10 Peers (by Covariance) + Cosine Column



In [6]:
def top_k_peers(df, item_mean, target_item, k=10, min_common=5, max_candidates=None):
    """Rank the candidate peers of ``target_item`` by covariance.

    Candidates come from ``candidate_items`` and are scored with
    ``pair_stats_long``. A candidate is kept only when it shares at least
    ``min_common`` raters with the target and its covariance is finite.

    Returns
    -------
    (top, ranked) : ``ranked`` is a DataFrame with columns ``peer_item``,
    ``covariance``, ``cosine`` and ``n_common`` sorted by descending
    covariance; ``top`` is its first ``k`` rows.
    """
    cands = candidate_items(df, target_item, max_candidates=max_candidates)
    total = len(cands)
    print(f"Processing {total} candidates for {target_item}...")

    records = []
    for done, peer in enumerate(cands, start=1):
        # Progress line after every 1000 scored candidates.
        if done % 1000 == 0:
            print(f"  Processed {done}/{total} candidates...")
        cov, cos, n = pair_stats_long(df, item_mean, target_item, peer)
        if n < min_common or not np.isfinite(cov):
            continue
        records.append((peer, cov, cos, n))

    ranked = (
        pd.DataFrame(records, columns=["peer_item","covariance","cosine","n_common"])
        .sort_values("covariance", ascending=False)
        .reset_index(drop=True)
    )
    print(f"Found {len(ranked)} valid peers for {target_item}")
    return ranked.head(k), ranked

# Rank peers for both targets with the same settings.
peer_search_kwargs = dict(k=10, min_common=MIN_COMMON_USERS, max_candidates=MAX_CANDIDATES)
top10_I1, all_I1 = top_k_peers(df, item_mean, I1, **peer_search_kwargs)
top10_I2, all_I2 = top_k_peers(df, item_mean, I2, **peer_search_kwargs)

# The top-5 tables are the first five rows of the top-10 tables.
top5_I1 = top10_I1.head(5)
top5_I2 = top10_I2.head(5)

# Show each table under its caption, in the order I1 top-5, I1 top-10, I2 top-5, I2 top-10.
for caption, table in (
    ("Top-5 peers for I1 (Step 7):", top5_I1),
    ("Top-10 peers for I1:", top10_I1),
    ("Top-5 peers for I2 (Step 7):", top5_I2),
    ("Top-10 peers for I2:", top10_I2),
):
    print(caption)
    display(table)

Processing 1255 candidates for B00PCSVODW...
  Processed 1000/1255 candidates...
Found 16 valid peers for B00PCSVODW
Processing 909 candidates for B005GISDXW...
Found 10 valid peers for B005GISDXW
Top-5 peers for I1 (Step 7):


Unnamed: 0,peer_item,covariance,cosine,n_common
0,B00TKIJGDA,1.394654,0.761724,3
1,B000006B4Y,1.174917,0.822492,3
2,B00HSJ2CVQ,0.696592,0.639306,3
3,B00NCDVVLY,0.507928,0.518404,3
4,B0090JBOC0,0.390683,0.275748,5


Top-10 peers for I1:


Unnamed: 0,peer_item,covariance,cosine,n_common
0,B00TKIJGDA,1.394654,0.761724,3
1,B000006B4Y,1.174917,0.822492,3
2,B00HSJ2CVQ,0.696592,0.639306,3
3,B00NCDVVLY,0.507928,0.518404,3
4,B0090JBOC0,0.390683,0.275748,5
5,B00005RFHF,0.248986,0.639605,5
6,6303162290,0.150472,0.430142,3
7,B00PH1H6TK,0.086408,0.178911,3
8,B00J22YU62,0.014812,0.017595,3
9,B00SFRHKAI,-0.068101,-0.103109,3


Top-5 peers for I2 (Step 7):


Unnamed: 0,peer_item,covariance,cosine,n_common
0,6305480869,1.104002,0.555135,3
1,B00ZL4Q7NE,0.862015,0.433506,7
2,B00HW3EI3I,0.48593,0.360607,3
3,B00BTFK07I,0.354236,1.0,3
4,B00FL31UF0,0.261219,0.373409,5


Top-10 peers for I2:


Unnamed: 0,peer_item,covariance,cosine,n_common
0,6305480869,1.104002,0.555135,3
1,B00ZL4Q7NE,0.862015,0.433506,7
2,B00HW3EI3I,0.48593,0.360607,3
3,B00BTFK07I,0.354236,1.0,3
4,B00FL31UF0,0.261219,0.373409,5
5,B00IKM5LXG,0.18417,0.276155,3
6,B00HUTPK4U,-0.108623,-0.246506,3
7,B004K6FS5W,-0.291161,-0.480681,4
8,B000E372ZY,-0.644562,-0.895422,3
9,B00FRE6OFO,-1.189883,-0.882963,4


## 7) (9)(11) Predict Missing Ratings (Item-Based, Weighted by Covariance)

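Prediction formula (item-based CF):

$$
\hat{r}(u, i) = \frac{\sum_{j \in \mathrm{Peers}(i)} w_{i,j}\, r(u, j)}{\sum_{j \in \mathrm{Peers}(i)} |w_{i,j}|}
$$

where $w_{i,j}$ is the covariance weight between items $i$ and $j$, and both sums run only over the peers $j$ that user $u$ has actually rated.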

If a user has no ratings for peer items, we return the target item mean (mean-filling behavior).


In [7]:
def predict_target_for_users(df, target_item, peers_df, default_mean):
    """Predict a rating for every user from their ratings of the peer items.

    For each user the prediction is ``sum(w_j * r(u, j)) / sum(|w_j|)`` over
    the peers ``j`` that the user rated, where ``w_j`` is the peer's
    ``covariance`` value. Users with no usable peer rating get
    ``default_mean``. This covers users who rated none of the peers and users
    whose contributing weights sum to zero in absolute value.

    Parameters
    ----------
    df : DataFrame with ``user_id``, ``item_id`` and ``rating`` columns.
    target_item : id of the item being predicted (not read by this function).
    peers_df : DataFrame with ``peer_item`` and ``covariance`` columns.
    default_mean : fallback prediction.

    Returns
    -------
    Series of float64 predictions indexed by ``df["user_id"].unique()``.
    """
    peer_items = peers_df["peer_item"].values
    peer_weights = peers_df["covariance"].values

    # Filter the ratings table once for all peers instead of scanning it once
    # per peer, then split the small result by item.
    peer_rows = df.loc[df["item_id"].isin(peer_items), ["user_id", "item_id", "rating"]]
    ratings_by_peer = {
        item: rows.set_index("user_id")["rating"].astype(np.float64)
        for item, rows in peer_rows.groupby("item_id")
    }

    num = pd.Series(dtype=np.float64)
    den = pd.Series(dtype=np.float64)
    for peer, w in zip(peer_items, peer_weights):
        peer_r = ratings_by_peer.get(peer)
        if peer_r is None:
            # Nobody rated this peer, so it contributes nothing.
            continue
        w = float(w)
        num = num.add(peer_r * w, fill_value=0.0)
        den = den.add(pd.Series(abs(w), index=peer_r.index, dtype=np.float64), fill_value=0.0)

    # A zero denominator means there is no usable weight for that user. Mask it
    # to NaN so the user receives default_mean below, not a prediction of 0.
    pred = (num / den).where(den > 0)

    users_all = df["user_id"].unique()
    pred = pred.reindex(users_all)

    return pred.fillna(float(default_mean))

# Predict each target twice: once from its top-5 peers and once from its top-10
# peers. The fallback is always the target's own mean rating.
pred_I1_top5, pred_I1_top10 = (
    predict_target_for_users(df, I1, peers, item_mean[I1]) for peers in (top5_I1, top10_I1)
)
pred_I2_top5, pred_I2_top10 = (
    predict_target_for_users(df, I2, peers, item_mean[I2]) for peers in (top5_I2, top10_I2)
)

print("Predictions ready. Example:")
# Preview the first ten users of each prediction series side by side.
prediction_preview = {
    "pred_I1_top5": pred_I1_top5,
    "pred_I1_top10": pred_I1_top10,
    "pred_I2_top5": pred_I2_top5,
    "pred_I2_top10": pred_I2_top10,
}
display(pd.DataFrame({name: series.head(10) for name, series in prediction_preview.items()}))


Predictions ready. Example:


Unnamed: 0_level_0,pred_I1_top5,pred_I1_top10,pred_I2_top5,pred_I2_top10
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A00013803RVZPCZKTT9U,1.447853,1.447853,1.453237,1.453237
A0001392IVCRENBEIEYS,1.447853,1.447853,1.453237,1.453237
A0001598OL7FAN6XNMK9,1.447853,1.447853,1.453237,1.453237
A0002090WKEMAO8KOWKM,1.447853,1.447853,1.453237,1.453237
A00049826E18XJLZ3YC0,1.447853,1.447853,1.453237,1.453237
A0005426V58WVW05LDKK,1.447853,1.447853,1.453237,1.453237
A0005916MHK9RK69491E,1.447853,1.447853,1.453237,1.453237
A0007042BQBQLK20MOG7,1.447853,1.447853,1.453237,1.453237
A0007430W3WXY3QNYB2S,1.447853,1.447853,1.453237,1.453237
A00086729ZDSXGG2E481,1.447853,1.447853,1.453237,1.453237


## 8) (8)(10) PCA From Scratch on Union of Peers (Top-5 and Top-10)

We create a **small** user×item matrix using only:
- Union of Top-5 peers (for Step 8)
- Union of Top-10 peers (for Step 10)

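Each missing entry is filled with that item's mean rating, and the item means are then subtracted so every column is centered on its item mean. PCA is computed from the eigen-decomposition of the covariance matrix of this centered data.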


In [8]:
def make_user_item_matrix(df, users, items, item_mean):
    """Build a dense, mean-filled and item-mean-centered user x item matrix.

    Parameters
    ----------
    df : DataFrame with ``user_id``, ``item_id`` and ``rating`` columns.
    users : iterable of user ids; row ``r`` belongs to the r-th user.
    items : iterable of item ids; column ``c`` belongs to the c-th item.
    item_mean : object with ``.get(item, default)`` returning an item's mean
        rating. Items without a mean use 0.0.

    Returns
    -------
    float32 array of shape ``(len(users), len(items))``. An entry is
    ``rating - item mean`` where the user rated the item and 0 otherwise.
    """
    users_list, items_list = list(users), list(items)
    n_users, n_items = len(users_list), len(items_list)

    if n_items == 0:
        return np.zeros((n_users, 0), dtype=np.float32)

    row_of = {user: pos for pos, user in enumerate(users_list)}
    col_of = {item: pos for pos, item in enumerate(items_list)}

    # One row of item means, repeated for every user: this is the mean-filled matrix.
    mu = np.array(
        [float(item_mean.get(item, 0.0)) for item in items_list], dtype=np.float32
    ).reshape(1, -1)
    X = np.repeat(mu, n_users, axis=0)

    # Overwrite the filled value wherever a real rating exists.
    in_scope = df["user_id"].isin(set(users_list)) & df["item_id"].isin(set(items_list))
    observed = df[in_scope]
    if len(observed) > 0:
        row_pos = observed["user_id"].map(row_of)
        col_pos = observed["item_id"].map(col_of)
        keep = row_pos.notna() & col_pos.notna()
        X[
            row_pos[keep].astype(int).values,
            col_pos[keep].astype(int).values,
        ] = observed.loc[keep, "rating"].values

    # Centering: unrated cells become exactly 0.
    return X - mu

def pca_from_scratch(X_centered, k=2):
    """PCA by eigen-decomposition of the covariance of already-centered data.

    Parameters
    ----------
    X_centered : array of shape ``(n, d)``; rows are samples, columns features.
        The caller is responsible for centering.
    k : number of components requested. Reduced to ``d`` when ``d < k``.

    Returns
    -------
    (Z, W, eig_vals)
        ``Z`` : projected data, shape ``(n, min(k, d))``.
        ``W`` : the ``min(k, d)`` leading eigenvectors as columns, shape ``(d, min(k, d))``.
        ``eig_vals`` : all ``d`` eigenvalues in descending order.
        When ``d == 0`` the result is all zeros with shapes ``(n, k)``,
        ``(0, k)`` and ``(0,)``.
    """
    n, d = X_centered.shape
    if d == 0:
        return np.zeros((n, k)), np.zeros((d, k)), np.zeros(min(d, k))

    # The usual n - 1 divisor is zero for a single row, which would fill the
    # covariance with inf/nan. Clamp it to 1 so the decomposition stays finite.
    dof = max(n - 1, 1)
    cov = (X_centered.T @ X_centered) / dof

    k = min(k, d)

    # eigh returns eigenvalues in ascending order; reverse to put the
    # largest-variance directions first.
    eig_vals, eig_vecs = np.linalg.eigh(cov)
    order = np.argsort(eig_vals)[::-1]
    eig_vals = eig_vals[order]
    eig_vecs = eig_vecs[:, order]

    W = eig_vecs[:, :k]
    Z = X_centered @ W
    return Z, W, eig_vals

# Users placed in the PCA matrices: the first PCA_USER_SAMPLE ids returned by unique().
# NOTE(review): the previews recorded with this cell are all zeros, which suggests
# these users rated none of the peer items — confirm the sample is the intended one.
users_sample = df["user_id"].unique()[:PCA_USER_SAMPLE]

# Item sets: peers of either target, de-duplicated and sorted.
top5_union = sorted(set(top5_I1["peer_item"]) | set(top5_I2["peer_item"]))
top10_union = sorted(set(top10_I1["peer_item"]) | set(top10_I2["peer_item"]))

# Top-5 union: run PCA, or fall back to an all-zero projection when there are no peers.
if top5_union:
    print(f"Building PCA matrix for {len(users_sample)} users x {len(top5_union)} items (Top-5)...")
    X5 = make_user_item_matrix(df, users_sample, top5_union, item_mean)
    Z5, W5, eig5 = pca_from_scratch(X5, k=PCA_K)
else:
    print("WARNING: No peers found for Top-5 union. Check MIN_COMMON_USERS.")
    Z5 = np.zeros((len(users_sample), PCA_K))

# Top-10 union: same procedure.
if top10_union:
    print(f"Building PCA matrix for {len(users_sample)} users x {len(top10_union)} items (Top-10)...")
    X10 = make_user_item_matrix(df, users_sample, top10_union, item_mean)
    Z10, W10, eig10 = pca_from_scratch(X10, k=PCA_K)
else:
    print("WARNING: No peers found for Top-10 union.")
    Z10 = np.zeros((len(users_sample), PCA_K))

# Label the projected coordinates PC1, PC2, ... and index them by user id.
Z5_df = pd.DataFrame(
    Z5,
    index=users_sample,
    columns=[f"PC{i+1}" for i in range(Z5.shape[1])],
)
Z10_df = pd.DataFrame(
    Z10,
    index=users_sample,
    columns=[f"PC{i+1}" for i in range(Z10.shape[1])],
)

print("Reduced space (Top-5 union) sample:")
display(Z5_df.head())

print("Reduced space (Top-10 union) sample:")
display(Z10_df.head())

Building PCA matrix for 5000 users x 10 items (Top-5)...
Building PCA matrix for 5000 users x 20 items (Top-10)...
Reduced space (Top-5 union) sample:


Unnamed: 0,PC1,PC2
A00013803RVZPCZKTT9U,0.0,0.0
A0001392IVCRENBEIEYS,0.0,0.0
A0001598OL7FAN6XNMK9,0.0,0.0
A0002090WKEMAO8KOWKM,0.0,0.0
A00049826E18XJLZ3YC0,0.0,0.0


Reduced space (Top-10 union) sample:


Unnamed: 0,PC1,PC2
A00013803RVZPCZKTT9U,0.0,0.0
A0001392IVCRENBEIEYS,0.0,0.0
A0001598OL7FAN6XNMK9,0.0,0.0
A0002090WKEMAO8KOWKM,0.0,0.0
A00049826E18XJLZ3YC0,0.0,0.0


## 9) (12) Compare Top-5 vs Top-10 Predictions

You can compare stability/smoothing between using 5 peers and 10 peers.


In [9]:
# Put the four prediction series side by side, one column each, aligned on user id.
comparison_columns = {
    "pred_I1_top5": pred_I1_top5,
    "pred_I1_top10": pred_I1_top10,
    "pred_I2_top5": pred_I2_top5,
    "pred_I2_top10": pred_I2_top10,
}
comparison = pd.DataFrame(comparison_columns)
display(comparison.head(20))



Unnamed: 0_level_0,pred_I1_top5,pred_I1_top10,pred_I2_top5,pred_I2_top10
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A00013803RVZPCZKTT9U,1.447853,1.447853,1.453237,1.453237
A0001392IVCRENBEIEYS,1.447853,1.447853,1.453237,1.453237
A0001598OL7FAN6XNMK9,1.447853,1.447853,1.453237,1.453237
A0002090WKEMAO8KOWKM,1.447853,1.447853,1.453237,1.453237
A00049826E18XJLZ3YC0,1.447853,1.447853,1.453237,1.453237
A0005426V58WVW05LDKK,1.447853,1.447853,1.453237,1.453237
A0005916MHK9RK69491E,1.447853,1.447853,1.453237,1.453237
A0007042BQBQLK20MOG7,1.447853,1.447853,1.453237,1.453237
A0007430W3WXY3QNYB2S,1.447853,1.447853,1.453237,1.453237
A00086729ZDSXGG2E481,1.447853,1.447853,1.453237,1.453237


In [None]:

import os

# Output locations; the tables sub-folder is created if needed (parents included).
RESULTS_PATH = "../results"
TABLES_PATH = os.path.join(RESULTS_PATH, "tables")
os.makedirs(TABLES_PATH, exist_ok=True)

# Item means (all items) and target means.
item_mean.to_csv(os.path.join(RESULTS_PATH, "pca_item_means.csv"), header=True)
print(f"Saved: {os.path.join(RESULTS_PATH, 'pca_item_means.csv')}")

target_means.to_frame("mean").to_csv(os.path.join(TABLES_PATH, "pca_target_means.csv"))
print(f"Saved: {os.path.join(TABLES_PATH, 'pca_target_means.csv')}")

# Peer tables.
top5_I1.to_csv(os.path.join(TABLES_PATH, "pca_peers_I1_top5.csv"), index=False)
top10_I1.to_csv(os.path.join(TABLES_PATH, "pca_peers_I1_top10.csv"), index=False)
top5_I2.to_csv(os.path.join(TABLES_PATH, "pca_peers_I2_top5.csv"), index=False)
top10_I2.to_csv(os.path.join(TABLES_PATH, "pca_peers_I2_top10.csv"), index=False)
print(f"Saved: peer tables for I1 and I2 (top5, top10)")

# Predictions for every user.
predictions_df = pd.DataFrame({
    "pred_I1_top5": pred_I1_top5,
    "pred_I1_top10": pred_I1_top10,
    "pred_I2_top5": pred_I2_top5,
    "pred_I2_top10": pred_I2_top10,
})
predictions_df.to_csv(os.path.join(RESULTS_PATH, "pca_predictions.csv"))
print(f"Saved: {os.path.join(RESULTS_PATH, 'pca_predictions.csv')}")

# Reduced (projected) coordinates.
Z5_df.to_csv(os.path.join(RESULTS_PATH, "pca_reduced_top5.csv"))
Z10_df.to_csv(os.path.join(RESULTS_PATH, "pca_reduced_top10.csv"))
print(f"Saved: PCA reduced matrices (top5, top10)")

# The eigenvalues, loadings and centered matrices exist only for a peer union
# that was non-empty. Each artefact is therefore checked and written on its
# own: one missing union no longer blocks the other's files, and a missing
# X5/X10 no longer raises NameError.
_notebook_vars = globals()
_pca_runs = (
    ("top5", "eig5", "W5", "X5", "pca_X5_centered.npy", top5_union),
    ("top10", "eig10", "W10", "X10", "pca_X10_centered.npy", top10_union),
)
_saved_eig = _saved_loadings = _saved_centered = False

for _tag, _eig_name, _w_name, _x_name, _x_file, _union in _pca_runs:
    if _eig_name in _notebook_vars:
        _eig = _notebook_vars[_eig_name]
        pd.DataFrame({
            "PC": [f"PC{i+1}" for i in range(len(_eig))],
            f"eigenvalue_{_tag}": _eig,
        }).to_csv(os.path.join(TABLES_PATH, f"pca_eigenvalues_{_tag}.csv"), index=False)
        _saved_eig = True

    if _w_name in _notebook_vars:
        _W = _notebook_vars[_w_name]
        pd.DataFrame(_W, index=_union, columns=[f"PC{i+1}" for i in range(_W.shape[1])]).to_csv(
            os.path.join(RESULTS_PATH, f"pca_loadings_{_tag}.csv"))
        _saved_loadings = True

    if _x_name in _notebook_vars:
        np.save(os.path.join(RESULTS_PATH, _x_file), _notebook_vars[_x_name])
        _saved_centered = True
    else:
        print(f"Skipped: centered matrix for {_tag} (no peers were available)")

if _saved_eig:
    print(f"Saved: eigenvalue tables")
if _saved_loadings:
    print(f"Saved: PCA loadings (W matrices)")
if _saved_centered:
    print(f"Saved: centered matrices as .npy files")


# The user ids that index the rows of the PCA matrices.
pd.Series(users_sample, name="user_id").to_csv(
    os.path.join(RESULTS_PATH, "pca_users_sample.csv"), index=False)
print(f"Saved: user sample list")

Saved: ../results\pca_item_means.csv
Saved: ../results\tables\pca_target_means.csv
Saved: peer tables for I1 and I2 (top5, top10)
Saved: ../results\pca_predictions.csv
Saved: PCA reduced matrices (top5, top10)
Saved: eigenvalue tables
Saved: PCA loadings (W matrices)
Saved: centered matrices as .npy files
Saved: user sample list

✅ All PCA Mean-Filling results saved to: ../results
