In [42]:
import pandas as pd
import numpy as np

In [43]:
column_names=['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('C:/Users/saads/RecSys/u.data', sep="\t", names=column_names)

In [44]:
movie_db = pd.read_csv("C:/Users/saads/RecSys/Movie_Id_Titles (1).txt")

In [45]:
df = pd.merge(df,movie_db, on="item_id")
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [46]:
moviePivot = df.pivot_table(index="user_id", columns="item_id" , values="rating" , aggfunc="mean")
moviePivot.shape

(944, 1682)

In [47]:
# Handle missing values properly
R_original = moviePivot.values

# Create missing mask (True = missing/NaN, False = has rating)
missing_mark = (np.isnan(R_original))

# Fill NaN with 0 for matrix factorization
R = np.nan_to_num(R_original)
# Get matrix dimensions
print(f"R matrix dimensions are: {R.shape}")
# Scale ratings for better convergence (optional but recommended)

R matrix dimensions are: (944, 1682)


In [48]:
num_user,num_items = R.shape

# Initialize matrices
K=10 # latent features
np.random.seed(42)

P = np.random.rand(num_user,K) * 0.01
Q = np.random.rand(num_items,K) * 0.01



In [56]:
def matrix_factorization(R, P, Q, K, steps=20, alpha=0.002, beta=0.02):
    """
    Fixed matrix factorization with proper error calculation
    """
    
    Q = Q.T
    for step in range(steps):
       
        for i in range(R.shape[0]):
            for j in range(R.shape[1]):
                if R[i,j] > 0:
                    eij = R[i,j] - np.dot(P[i, :], Q[:, j])
                    for k in range(K):
                        P[i,k] += alpha * (2 * eij * Q[k,j] - beta * P[i,k])
                        Q[k,j] += alpha * (2 * eij * P[i,k] - beta * Q[k,j])
        
      
        if step % 10 == 0:
            error = 0
            for i in range(R.shape[0]):
                for j in range(R.shape[1]):
                    if R[i,j] > 0:
                        error += (R[i,j] - np.dot(P[i, :], Q[:, j])) ** 2
            
            # Add regularization terms (once per step)
            error += (beta/2) * (np.sum(P**2) + np.sum(Q**2))
            print(f"Step {step}, error: {error:.4f}")
                
    return P, Q.T

In [58]:
# Run matrix factorization
print("Running matrix factorization....")
nP, nQ = matrix_factorization(R,P,Q,K,steps=50)

# Generate predictions (scale back to original range)
predicted_ratings = np.dot(nP,nQ.T)

Running matrix factorization....
Step 0, error: 59216.1000
Step 10, error: 56663.2108
Step 20, error: 54952.9278
Step 30, error: 53732.7606
Step 40, error: 52826.6602


In [74]:
movie_mapping = dict(zip(df['item_id'], df['title']))
def recommend_movies(user_id, predicted_ratings, movie_names, missing_mark, top_n=5):
    # Get user index - handle both 0-indexed and 1-indexed user_ids
    user_idx = user_id - 1 if user_id > 0 and user_id <= len(missing_mark) else user_id
    
    # Find unrated movies and get top recommendations
    unrated_indices = np.where(missing_mark[user_idx])[0]
    if len(unrated_indices) == 0:
        print(f"User {user_id} has rated all movies!")
        return
    
    # Sort unrated movies by predicted rating (highest first)
    top_indices = unrated_indices[np.argsort(predicted_ratings[user_idx][unrated_indices])[::-1]]
    
    print(f"Top {top_n} recommendations for User {user_id}:")
    for idx in top_indices[:top_n]:
        movie_id = moviePivot.columns[idx]  # Get movie ID from pivot columns
        movie_name = movie_mapping.get(movie_id, f"Movie ID {movie_id}")
        print(f"{movie_name} - Predicted rating: {predicted_ratings[user_idx][idx]:.2f}")

# Usage
movie_ids = list(moviePivot.columns)
recommend_movies(1, predicted_ratings, movie_names, missing_mark, top_n=5)

Top 5 recommendations for User 1:
Return of the Jedi (1983) - Predicted rating: 4.22
Raiders of the Lost Ark (1981) - Predicted rating: 4.16
Princess Bride, The (1987) - Predicted rating: 4.15
Usual Suspects, The (1995) - Predicted rating: 4.06
Blues Brothers, The (1980) - Predicted rating: 3.96


In [76]:
!git remote set-url origin "https://github.com/SyedSaad42/Matrix_Factorisation_fromScratch.git"

In [77]:
!git add .

In [78]:
!git commit -m "Matrix Factorisation (SVD)"

[main 1e08036] Matrix Factorisation (SVD)
 5 files changed, 1244 insertions(+), 31 deletions(-)
 create mode 100644 Untitled Folder/.ipynb_checkpoints/Untitled-checkpoint.ipynb
 create mode 100644 Untitled Folder/.ipynb_checkpoints/Untitled1-checkpoint.ipynb
 create mode 100644 Untitled Folder/Untitled.ipynb
 create mode 100644 Untitled Folder/Untitled1.ipynb


In [79]:
!git branch -M main

In [80]:
!git push origin main

To https://github.com/SyedSaad42/Matrix_Factorisation_fromScratch.git
 * [new branch]      main -> main
