In [269]:
import numpy as np
import pandas as pd

In [270]:
column_names = ['user_id', 'item_id', 'rating','timestamp']
ratings= pd.read_csv('C:/Users/saads/RecSys/u.data', sep="\t", names = column_names)

In [271]:
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [272]:
moviePivot = ratings.pivot_table(index='user_id',columns='item_id',values='rating',aggfunc='mean')
moviePivot.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [280]:
# Keep mask of missing ratings
R_original = moviePivot.values

# If 0 means missing in R:
missing_mask = (R_original == 0)   # True = missing, False = has rating
print(missing_mask)


[[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]


In [281]:
# True = unrated
R = np.nan_to_num(R_original)       # Fill NaN with 0 for matrix factorization
missingmark = (R == 0)
print(missingmark)

[[ True  True  True ...  True  True  True]
 [False False False ...  True  True  True]
 [False  True  True ...  True  True  True]
 ...
 [False  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True False  True ...  True  True  True]]


In [282]:

num_users, num_items = R.shape
print("Users:", num_users, "Items:", num_items)

Users: 944 Items: 1682


In [283]:
print(num_users, num_items)

944 1682


In [284]:

K = 10  # latent features
np.random.seed(42)
P = np.random.rand(num_users, K) * 0.01
Q = np.random.rand(num_items, K) * 0.01

In [285]:
def matrix_factorization(R, P, Q, K, steps=20, alpha=0.002, beta=0.02):
    Q = Q.T
    for step in range(steps):
        for i in range(R.shape[0]):
            for j in range(R.shape[1]):
                if R[i, j] > 0:
                    eij = R[i, j] - np.dot(P[i, :], Q[:, j])
                    for k in range(K):
                        P[i, k] += alpha * (2 * eij * Q[k, j] - beta * P[i, k])
                        Q[k, j] += alpha * (2 * eij * P[i, k] - beta * Q[k, j])
        if step % 10 == 0:
            eR = np.dot(P, Q)
            error = 0
            for i in range(R.shape[0]):
                for j in range(R.shape[1]):
                    if R[i, j] > 0:
                        error += (R[i, j] - np.dot(P[i, :], Q[:, j])) ** 2
                        error += (beta/2) * (np.sum(P[i, :]**2) + np.sum(Q[:, j]**2))
            print(f"Step {step}, error: {error:.4f}")
    return P, Q.T

In [286]:
nP, nQ = matrix_factorization(R_scaled, P, Q, K, steps=50)

Step 0, error: 54814.5683
Step 10, error: 13209.0925
Step 20, error: 6456.0049
Step 30, error: 5483.4041
Step 40, error: 5180.5300


In [290]:
predicted_ratings = np.dot(nP, nQ.T) * R_max

In [300]:
def recommend_movies(user_id, R, predicted_ratings, movie_ids, missing_mask, top_n=5):
    user_idx = user_id
    unrated_indices = np.where(missing_mask[user_idx])[0]

    if len(unrated_indices) == 0:
        print("User has rated all movies!")
        return

    recommended_indices = unrated_indices[np.argsort(predicted_ratings[user_idx][unrated_indices])[::-1]]

    print(f"Top {top_n} recommendations for User {user_id}:")
    for idx in recommended_indices[:top_n]:
        print(f"Movie ID {movie_ids[idx]} - Predicted rating: {predicted_ratings[user_idx][idx]:.2f}")


In [302]:
movie_ids = list(moviePivot.columns)
recommend_movies(
    user_id=user_idx,
    R=R,
    predicted_ratings=predicted_ratings,
    movie_ids=movie_ids,
    missing_mask=missing_mask,
    top_n=5
)


User has rated all movies!


In [303]:
import pandas as pd
import numpy as np

# Load data
column_names = ['user_id', 'item_id', 'rating','timestamp']
ratings = pd.read_csv('C:/Users/saads/RecSys/u.data', sep="\t", names=column_names)

# Create pivot table
moviePivot = ratings.pivot_table(index='user_id', columns='item_id', values='rating', aggfunc='mean')
print("MoviePivot shape:", moviePivot.shape)
moviePivot.head()

# Handle missing values properly
R_original = moviePivot.values
print("Original R shape:", R_original.shape)

# Create missing mask (True = missing/NaN, False = has rating)
missing_mask = np.isnan(R_original)
print("Missing values:", np.sum(missing_mask))

# Fill NaN with 0 for matrix factorization
R = np.nan_to_num(R_original, nan=0.0)
print("R after filling NaN:", R.shape)

# Get matrix dimensions
num_users, num_items = R.shape
print("Users:", num_users, "Items:", num_items)

# Scale ratings for better convergence (optional but recommended)
R_nonzero = R[R > 0]
R_min = np.min(R_nonzero) if len(R_nonzero) > 0 else 1
R_max = np.max(R_nonzero) if len(R_nonzero) > 0 else 5
R_scaled = np.where(R > 0, (R - R_min) / (R_max - R_min), 0)

print(f"Rating range: {R_min} to {R_max}")

# Initialize matrices
K = 10  # latent features
np.random.seed(42)
P = np.random.rand(num_users, K) * 0.01
Q = np.random.rand(num_items, K) * 0.01

def matrix_factorization(R, P, Q, K, steps=20, alpha=0.002, beta=0.02):
    """
    Fixed matrix factorization with proper error calculation
    """
    Q = Q.T  # Transpose Q for easier computation
    
    for step in range(steps):
        # Update factors
        for i in range(R.shape[0]):
            for j in range(R.shape[1]):
                if R[i, j] > 0:  # Only update for observed ratings
                    eij = R[i, j] - np.dot(P[i, :], Q[:, j])
                    for k in range(K):
                        P[i, k] += alpha * (2 * eij * Q[k, j] - beta * P[i, k])
                        Q[k, j] += alpha * (2 * eij * P[i, k] - beta * Q[k, j])
        
        # Calculate and print error every 10 steps
        if step % 10 == 0:
            error = 0
            # Calculate reconstruction error only for observed ratings
            for i in range(R.shape[0]):
                for j in range(R.shape[1]):
                    if R[i, j] > 0:
                        error += (R[i, j] - np.dot(P[i, :], Q[:, j])) ** 2
            
            # Add regularization terms (once per step, not per rating)
            error += (beta/2) * (np.sum(P**2) + np.sum(Q**2))
            print(f"Step {step}, error: {error:.4f}")
    
    return P, Q.T

# Run matrix factorization
print("Running matrix factorization...")
nP, nQ = matrix_factorization(R_scaled, P, Q, K, steps=50)

# Generate predictions (scale back to original range)
predicted_ratings = np.dot(nP, nQ.T) * (R_max - R_min) + R_min

def recommend_movies(user_id, R, predicted_ratings, movie_ids, missing_mask, top_n=5):
    """
    Recommend movies for a user
    """
    # Convert user_id to index (user_id might be 1-indexed while array is 0-indexed)
    try:
        # If user_id is in the moviePivot index, get its position
        if user_id in moviePivot.index:
            user_idx = list(moviePivot.index).index(user_id)
        else:
            user_idx = user_id - 1  # Assume 1-indexed user_id
            
        if user_idx < 0 or user_idx >= len(missing_mask):
            print(f"User ID {user_id} not found!")
            return
            
    except:
        user_idx = user_id  # Use as is if it's already an index
    
    # Find unrated movies (where missing_mask is True)
    unrated_indices = np.where(missing_mask[user_idx])[0]
    
    if len(unrated_indices) == 0:
        print(f"User {user_id} has rated all movies!")
        return
    
    # Get predicted ratings for unrated movies and sort by highest predicted rating
    unrated_predictions = predicted_ratings[user_idx][unrated_indices]
    recommended_indices = unrated_indices[np.argsort(unrated_predictions)[::-1]]
    
    print(f"Top {top_n} recommendations for User {user_id}:")
    for idx in recommended_indices[:top_n]:
        movie_id = movie_ids[idx]
        pred_rating = predicted_ratings[user_idx][idx]
        print(f"Movie ID {movie_id} - Predicted rating: {pred_rating:.2f}")

# Get movie IDs from the pivot table columns
movie_ids = list(moviePivot.columns)

# Test recommendation for user 1
test_user_id = 1
print(f"\nTesting recommendations for user {test_user_id}:")
recommend_movies(
    user_id=test_user_id,
    R=R,
    predicted_ratings=predicted_ratings,
    movie_ids=movie_ids,
    missing_mask=missing_mask,
    top_n=5
)

MoviePivot shape: (944, 1682)
Original R shape: (944, 1682)
Missing values: 1487805
R after filling NaN: (944, 1682)
Users: 944 Items: 1682
Rating range: 1.0 to 5.0
Running matrix factorization...
Step 0, error: 47842.6142
Step 10, error: 13094.3295
Step 20, error: 5638.5403
Step 30, error: 4550.9546
Step 40, error: 4206.1260

Testing recommendations for user 1:
Top 5 recommendations for User 1:
Movie ID 408 - Predicted rating: 4.74
Movie ID 318 - Predicted rating: 4.68
Movie ID 483 - Predicted rating: 4.65
Movie ID 603 - Predicted rating: 4.55
Movie ID 513 - Predicted rating: 4.55
