In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds
from math import sqrt


## Import data

In [2]:
# Read the user ratings table and preview its first rows
ratings = pd.read_csv('../data/ratings.csv')
print("Ratings Data:", ratings.head(), sep="\n")

# Read the movie catalogue and preview its first rows
movies = pd.read_csv('../data/movies.csv')
print("\nMovies Data:", movies.head(), sep="\n")


Ratings Data:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Movies Data:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


## Explore the Data Structure

In [4]:
# Check the shape of the datasets
print(f"Shape of ratings dataset: {ratings.shape}")
print(f"Shape of movies dataset: {movies.shape}")

# Basic statistics of ratings
print("\nRatings Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
ratings.info()
print("\nRatings Data Description:")
print(ratings.describe())

# Check for missing values in both datasets
print("\nMissing values in ratings dataset:")
print(ratings.isnull().sum())

print("\nMissing values in movies dataset:")
print(movies.isnull().sum())


Shape of ratings dataset: (100836, 4)
Shape of movies dataset: (9742, 3)

Ratings Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None

Ratings Data Description:
              userId        movieId         rating     timestamp
count  100836.000000  100836.000000  100836.000000  1.008360e+05
mean      326.127564   19435.295718       3.501557  1.205946e+09
std       182.618491   35530.987199       1.042529  2.162610e+08
min         1.000000       1.000000       0.500000  8.281246e+08
25%       177.000000    1199.000000       3.000000  1.019124e+09
50%       325.000000    2991.000000       3.500000  1.186087e+09
75%       477.

## Create the User-Item Matrix

In [5]:
# Reshape the long ratings table into a grid: one row per userId,
# one column per movieId, NaN where no rating exists
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
print("User-Item Matrix:", user_item_matrix.head(), sep="\n")

# Fraction of grid cells that hold no rating
# (number of rating rows compared with the total number of cells)
sparsity = 1.0 - len(ratings) / float(user_item_matrix.size)
print(f"\nSparsity of the User-Item Matrix: {sparsity:.4f}")


User-Item Matrix:
movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     NaN     4.0     NaN     NaN     4.0     NaN     NaN   
2           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5           4.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
2           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN  ...     NaN     NaN     NaN    

Computing the **sparsity** of the user-item matrix is an important step in understanding the structure of your data and determining which recommendation algorithms are most appropriate.

#### What is Sparsity?

**Sparsity** measures how many values in a matrix are missing or zero compared to the total number of possible values. In the context of a user-item matrix for a recommendation system, sparsity can be calculated as:

$$
\text{Sparsity} = 1 - \frac{\text{Number of non-zero (or filled) entries}}{\text{Total number of possible entries}}
$$

A high sparsity value means that most of the entries in the matrix are missing (or zero), while a low sparsity value indicates that most entries are filled.

#### Why is Sparsity Important in Recommendation Systems?

1. **Understanding Data Characteristics**: 
   - Real-world user-item matrices are often very sparse because users typically interact with only a small subset of available items (e.g., a user may only watch a few of the thousands of movies on a streaming service). Computing sparsity helps quantify this characteristic of your data.

2. **Choosing the Right Algorithm**:
   - Different recommendation algorithms perform differently depending on the sparsity of the data:
     - **Collaborative Filtering (User-Based or Item-Based)**: Can work well with sparse data but may struggle if the matrix is extremely sparse, leading to insufficient data to compute meaningful similarities.
     - **Matrix Factorization Methods (e.g., SVD)**: Often more effective with sparse matrices as they learn latent factors that explain observed interactions.
     - **Content-Based Filtering**: Does not directly depend on the sparsity of the user-item matrix but may suffer if there is a lack of metadata for items.

3. **Impact on Model Performance**:
   - High sparsity means less data for training, which can negatively affect model performance. Knowing the sparsity helps set realistic expectations for the performance of different recommendation models.

4. **Data Imputation or Densification Strategies**:
   - If the matrix is too sparse, you might need to "densify" it using techniques such as filling missing values with the average rating, user/item bias, or more sophisticated imputation methods.

5. **Memory and Computation Efficiency**:
   - Sparse data structures are more memory-efficient and faster to process, especially in matrix factorization techniques. Knowing the sparsity allows you to choose appropriate data storage formats (such as sparse matrices) that save memory and computation time.

#### Conclusion

By computing sparsity, you gain insight into how the data is distributed in your matrix, which helps guide decisions on algorithm selection, data preprocessing, and model evaluation. It is a key metric that influences the strategy and approach you take when developing and optimizing your recommendation system.


## Compute User Similarity

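**Cosine similarity** measures the cosine of the angle between two vectors in an n-dimensional space. In general it ranges from -1 (vectors pointing in opposite directions) to 1 (vectors pointing in the same direction); because ratings are non-negative, the values computed here fall between 0 (no rated items in common) and 1. In the context of a user-item matrix, each user is represented as a vector of their ratings for all items.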

The cosine similarity between two users \(A\) and \(B\) can be computed as:

$$
\text{cosine\_similarity}(A, B) = \frac{\sum_{i=1}^n A_i \times B_i}{\sqrt{\sum_{i=1}^n A_i^2} \times \sqrt{\sum_{i=1}^n B_i^2}}
$$

where:
- \(A_i\) and \(B_i\) are the ratings of users \(A\) and \(B\) for item \(i\).
- \(n\) is the total number of items.

In [14]:
# Function to manually compute cosine similarity between two vectors
def compute_cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors.

    Args:
    - vec1 (array-like): One-dimensional numeric sequence (NumPy array, list,
      tuple or pandas Series).
    - vec2 (array-like): One-dimensional numeric sequence of the same length.

    Returns:
    - float: dot(vec1, vec2) / (|vec1| * |vec2|), or 0.0 when either vector
      has zero magnitude.
    """
    # Coerce up front so plain lists and object-dtype arrays support the
    # element-wise arithmetic below.
    vec1 = np.asarray(vec1, dtype=float)
    vec2 = np.asarray(vec2, dtype=float)

    # Calculate the dot product of the vectors
    dot_product = np.dot(vec1, vec2)

    # Calculate the magnitudes of the vectors
    magnitude_vec1 = np.sqrt(np.sum(vec1 ** 2))
    magnitude_vec2 = np.sqrt(np.sum(vec2 ** 2))

    # A zero vector has no direction; report 0.0 instead of dividing by zero
    if magnitude_vec1 == 0 or magnitude_vec2 == 0:
        return 0.0

    # Always hand back a built-in float so both return paths share one type
    return float(dot_product / (magnitude_vec1 * magnitude_vec2))

# Fill NaN values with 0 in the user-item matrix
user_item_matrix_filled = user_item_matrix.fillna(0)

# Pull every user's rating vector out once as a float array; a label-based
# row lookup inside the double loop would be repeated for every pair.
rating_vectors = user_item_matrix_filled.to_numpy(dtype=float)
n_users = rating_vectors.shape[0]

# Cosine similarity is symmetric, so each pair is computed once and mirrored.
# Collecting the values in a float array keeps the resulting DataFrame numeric
# (a DataFrame created empty and filled cell by cell ends up with object dtype).
similarity_values = np.zeros((n_users, n_users), dtype=float)
for i in range(n_users):
    for j in range(i, n_users):
        sim = compute_cosine_similarity(rating_vectors[i], rating_vectors[j])
        similarity_values[i, j] = sim
        similarity_values[j, i] = sim

# Label rows and columns with the user ids of the user-item matrix
user_similarity_manual = pd.DataFrame(
    similarity_values,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

print("User Similarity Matrix (Cosine Similarity Computed by Hand):")
display(user_similarity_manual.head())

User Similarity Matrix (Cosine Similarity Computed by Hand):


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.027283,0.05972,0.194395,0.12908,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.0,0.0,0.003726,0.016614,0.025333,0.027585,0.027257,0.0,0.067445,...,0.202671,0.016866,0.011997,0.0,0.0,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.05972,0.0,1.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0,...,0.005048,0.004892,0.024992,0.0,0.010694,0.012993,0.019247,0.021128,0.0,0.032119
4,0.194395,0.003726,0.002251,1.0,0.128659,0.088491,0.11512,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.12908,0.016614,0.00502,0.128659,1.0,0.300349,0.108342,0.429075,0.0,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792


## Predict Ratings Using Collaborative Filtering

#### **1. What is Collaborative Filtering?**

**Collaborative Filtering (CF)** is a technique used in recommendation systems to predict a user's interests by collecting preferences or taste information from many users. The basic assumption is that if two users have agreed on a certain set of items in the past, they are likely to agree on other items in the future.

#### **2. Types of Collaborative Filtering**

Collaborative Filtering can be broadly divided into two categories:

- **User-Based Collaborative Filtering**: Recommends items to a user based on the preferences of similar users. The similarity between users is typically measured using metrics such as cosine similarity, Pearson correlation, or Jaccard similarity. If User A is similar to User B, then the items preferred by User B are recommended to User A.

- **Item-Based Collaborative Filtering**: Recommends items based on the similarity between items. The similarity is computed using the ratings given to the items by all users. If Item X is similar to Item Y, and a user liked Item X, then Item Y is recommended to that user.

#### **3. Key Concepts in Collaborative Filtering**

- **User-Item Matrix**: A matrix where rows represent users, columns represent items, and each cell represents the rating that a user has given to an item. This matrix is usually very sparse, as most users have rated only a small fraction of all available items.

- **Similarity Measures**: Quantifies how similar two users or items are. Common similarity measures include:
  - **Cosine Similarity**: Measures the cosine of the angle between two non-zero vectors.
  - **Pearson Correlation Coefficient**: Measures the linear correlation between two sets of data.
  
  $$
  \text{Pearson Correlation}(A, B) = \frac{\sum_{i=1}^{n} (A_i - \bar{A})(B_i - \bar{B})}{\sqrt{\sum_{i=1}^{n} (A_i - \bar{A})^2} \times \sqrt{\sum_{i=1}^{n} (B_i - \bar{B})^2}}
  $$

  - **Jaccard Similarity**: Measures the similarity between two sets by dividing the size of their intersection by the size of their union.
  
  $$
  \text{Jaccard Similarity}(A, B) = \frac{|A \cap B|}{|A \cup B|}
  $$

- **Neighborhood-Based Methods**: These methods find a set of users or items similar to the target user or item. Recommendations are made based on a weighted combination of these neighbors' preferences.

- **Latent Factor Models**: Uses techniques such as **Singular Value Decomposition (SVD)** to reduce the dimensionality of the user-item matrix, capturing latent features that explain observed ratings.

#### **4. Steps in Collaborative Filtering**

1. **Data Collection**: Gather data on user-item interactions (e.g., ratings, clicks, purchases).

2. **Preprocessing**: Clean the data, handle missing values, normalize ratings, and transform the data into a user-item matrix.

3. **Compute Similarity**:
   - For **user-based CF**, compute similarity between users.
   - For **item-based CF**, compute similarity between items.

4. **Generate Predictions**:
   - **User-Based CF**: Predict the rating a user would give to an item based on the ratings of similar users.
   - **Item-Based CF**: Predict the rating for an item based on the ratings given to similar items by the same user.

5. **Make Recommendations**: Sort the predicted ratings or scores and recommend the top items to the user.

#### **5. Challenges in Collaborative Filtering**

- **Data Sparsity**: The user-item matrix is usually sparse, meaning there are many missing values due to users interacting with only a few items. This sparsity can reduce the accuracy of similarity calculations and predictions.

- **Scalability**: As the number of users and items grows, the computational cost of calculating similarities and generating recommendations increases significantly.

- **Cold Start Problem**: CF struggles to make recommendations for new users or items that have very few or no interactions, as there is no historical data to base predictions on.

#### **6. Advantages of Collaborative Filtering**

- **Domain Independence**: CF does not rely on any domain-specific knowledge, such as item content or user demographics; it only requires user-item interactions.

- **Serendipity**: CF can introduce users to items they might not have found otherwise, leading to unexpected discoveries.

#### **7. Disadvantages of Collaborative Filtering**

- **Cold Start**: CF requires a sufficient amount of data on user-item interactions to make accurate recommendations, making it difficult to handle new users or items.

- **Scalability Issues**: Computing similarities between large numbers of users or items can be computationally expensive, particularly in large-scale systems.

#### **8. Popular Algorithms for Collaborative Filtering**

- **k-Nearest Neighbors (k-NN)**: Finds the k most similar users or items to make predictions.
- **Matrix Factorization Techniques**: Uses techniques like SVD or Alternating Least Squares (ALS) to learn latent factors.
- **Deep Learning Models**: Neural networks can be used to capture complex patterns in user-item interactions, though they often require more data and computational resources.

#### **Conclusion**

Collaborative Filtering is a powerful and widely used approach in recommendation systems due to its simplicity and effectiveness. It leverages the collective wisdom of the crowd to make personalized recommendations. However, challenges like data sparsity, scalability, and the cold start problem need to be addressed to enhance its performance in real-world applications.

In [8]:
# Function to predict ratings based on user similarity
def predict_ratings(user_item_matrix, user_similarity):
    """Predict ratings based on user similarity.

    Each prediction is the user's own mean rating plus the similarity-weighted
    average of every user's deviation from their own mean rating. Missing
    ratings contribute a deviation of 0.

    Args:
    - user_item_matrix (DataFrame): Users as rows, items as columns, NaN where
      a user has not rated an item.
    - user_similarity (DataFrame or array-like): Square user-by-user similarity
      matrix. A DataFrame is aligned to ``user_item_matrix.index`` by label;
      any other array-like is used positionally, so its rows and columns must
      already follow the row order of ``user_item_matrix``.

    Returns:
    - pred_df (DataFrame): Predicted ratings with the same index and columns
      as ``user_item_matrix``.
    """
    users = user_item_matrix.index
    mean_user_rating = user_item_matrix.mean(axis=1)

    # Deviation of each rating from its user's mean; unrated cells become 0
    ratings_diff = user_item_matrix.sub(mean_user_rating, axis=0).fillna(0)

    # Align by label when labels are available, then force a float array so
    # object-dtype similarity frames work in the matrix product.
    if isinstance(user_similarity, pd.DataFrame):
        user_similarity = user_similarity.loc[users, users]
    similarity = np.asarray(user_similarity, dtype=float)

    weighted_diff = similarity @ ratings_diff.to_numpy(dtype=float)
    total_weight = np.abs(similarity).sum(axis=1, keepdims=True)

    # A user whose similarity row sums to zero has no neighbours to learn
    # from: leave the adjustment at 0 (prediction = own mean) rather than
    # producing NaN from 0/0.
    adjustment = np.divide(
        weighted_diff,
        total_weight,
        out=np.zeros_like(weighted_diff),
        where=total_weight != 0,
    )

    pred = mean_user_rating.to_numpy()[:, np.newaxis] + adjustment
    pred_df = pd.DataFrame(pred, index=users, columns=user_item_matrix.columns)
    return pred_df

# Predict ratings using collaborative filtering.
# `user_similarity_manual` is the user-similarity matrix built earlier in this
# notebook; it is cast to float so the matrix arithmetic inside
# predict_ratings runs on numeric data regardless of how the frame was filled.
predicted_ratings_cf = predict_ratings(user_item_matrix, user_similarity_manual.astype(float))
print("Predicted Ratings (Collaborative Filtering):")
display(predicted_ratings_cf.head())


Predicted Ratings (Collaborative Filtering):


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.512156,4.337305,4.333152,4.353222,4.310382,4.458542,4.318785,4.354511,4.351705,4.353549,...,4.366365,4.36633,4.3664,4.3664,4.366365,4.3664,4.366365,4.366365,4.366365,4.366652
2,4.023842,3.902729,3.933397,3.943695,3.914607,4.015118,3.930438,3.944382,3.938951,3.958272,...,3.947521,3.945681,3.94936,3.94936,3.947521,3.94936,3.947521,3.947521,3.947521,3.950582
3,2.581597,2.417122,2.41647,2.427014,2.379532,2.531922,2.402912,2.423996,2.427861,2.449473,...,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897
4,3.686908,3.523907,3.526009,3.539837,3.49383,3.637597,3.495375,3.543491,3.542651,3.526435,...,3.555521,3.555435,3.555606,3.555606,3.555521,3.555606,3.555521,3.555521,3.555521,3.556002
5,3.806627,3.657814,3.619213,3.6086,3.582013,3.721539,3.613708,3.626196,3.620876,3.567195,...,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364,3.636698


## Item Similarity Using TF-IDF

### Understanding TF-IDF

**TF-IDF** is a technique commonly used in natural language processing to evaluate how important a word is to a document within a collection of documents. It helps convert text data (like movie genres) into numerical values, allowing us to compute similarities between items based on their textual features.

#### Components of TF-IDF:

1. **Term Frequency (TF)**: Measures how frequently a term appears in a document. For example, if a term appears 3 times in a document with 100 words, its term frequency is `3/100 = 0.03`.

2. **Inverse Document Frequency (IDF)**: Measures how important a term is across all documents. If a term appears in many documents, its IDF is low, indicating that it is not very important. Conversely, if it appears in very few documents, its IDF is high.

   The formula for IDF is:
   $$
   \text{IDF} = \log\left(\frac{\text{Total number of documents}}{\text{Number of documents containing the term}}\right)
   $$

3. **TF-IDF**: The product of TF and IDF, giving a score that reflects how important a word is to a particular document in a collection. 

   $$
   \text{TF-IDF} = \text{TF} \times \text{IDF}
   $$

### Implementing a Naive Content-Based Filtering

We'll now compute item similarity based on genres more naively, without using the `TfidfVectorizer`. We will:

1. Calculate the **Term Frequency (TF)** for each genre.
2. Compute the **Inverse Document Frequency (IDF)** for each genre.
3. Combine TF and IDF to get **TF-IDF** values for each movie.
4. Use these values to compute cosine similarity.

In [9]:
# Compute TF-IDF values for movie genres
def compute_tf_idf(movies):
    """Compute a TF-IDF matrix over the genres of each movie.

    The input frame is left untouched, so the function can be called
    repeatedly on the same ``movies`` frame.

    Args:
    - movies (DataFrame): Must contain a 'movieId' column and a 'genres'
      column holding '|'-separated genre strings. Cells that already hold a
      list of genres are accepted as well; missing values are treated as ''.

    Returns:
    - tf_idf (DataFrame): One row per movieId and one column per genre, each
      cell being term frequency times log(total movies / genre count).
    """
    # Derive the genre lists into a separate Series instead of assigning back
    # into `movies`, which would make a second call fail on list.split.
    genre_lists = movies['genres'].fillna('').apply(
        lambda x: x.split('|') if isinstance(x, str) else list(x)
    )

    # Term frequency: share of a movie's genre slots taken by each genre
    tf_dict = {}
    for movie_id, genres in zip(movies['movieId'], genre_lists):
        genre_count = len(genres)
        tf_dict[movie_id] = {genre: genres.count(genre) / genre_count for genre in genres}

    # Rows = movies, columns = genres; genres a movie lacks get a TF of 0
    tf = pd.DataFrame(tf_dict).T.fillna(0)

    # Inverse document frequency from how often each genre occurs overall
    all_genres = [genre for genres in genre_lists for genre in genres]
    genre_counts = pd.Series(all_genres).value_counts()
    total_movies = len(movies)
    idf = np.log(total_movies / genre_counts)

    # Scale every genre column by its IDF (aligned on genre name)
    tf_idf = tf.mul(idf, axis=1)
    return tf_idf

# Build the genre TF-IDF table for the movie catalogue and preview it
tf_idf = compute_tf_idf(movies)
print("TF-IDF Matrix for Movies:")
display(tf_idf.head(5))


TF-IDF Matrix for Movies:


Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,0.0,0.0,0.408591,0.553821,0.537184,0.190618,0.0,0.0,0.0,0.505238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.680986,0.0,0.895307,0.0,0.0,0.0,0.0,0.842064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.476546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.904473,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.317697,0.0,0.0,0.267915,0.0,0.0,0.0,0.0,0.0,0.0,0.602982,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.953092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Compute cosine similarity between movies
item_similarity = cosine_similarity(tf_idf)

# Label rows and columns from tf_idf itself: its rows are what was passed to
# cosine_similarity, so the labels cannot drift out of step with the values.
movie_ids = tf_idf.index.rename('movieId')
item_similarity_df = pd.DataFrame(item_similarity, index=movie_ids, columns=movie_ids)

print("Item Similarity Matrix (Content-Based Filtering):")
display(item_similarity_df.head())

Item Similarity Matrix (Content-Based Filtering):


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.821262,0.086555,0.080554,0.185686,0.0,0.086555,0.657453,0.0,0.261682,...,0.409478,0.518107,0.141949,0.53949,0.0,0.691551,0.753495,0.0,0.461742,0.185686
2,0.821262,1.0,0.0,0.0,0.0,0.0,0.0,0.80054,0.0,0.318634,...,0.0,0.0,0.0,0.0,0.0,0.359255,0.391434,0.0,0.0,0.0
3,0.086555,0.0,1.0,0.930677,0.466135,0.0,1.0,0.0,0.0,0.0,...,0.108876,0.0,0.356342,0.0,0.0,0.105428,0.114871,0.0,0.0,0.466135
4,0.080554,0.0,0.930677,1.0,0.433821,0.0,0.930677,0.0,0.0,0.0,...,0.101328,0.101979,0.567487,0.0,0.0,0.098119,0.106908,0.365843,0.0,0.433821
5,0.185686,0.0,0.466135,0.433821,1.0,0.0,0.466135,0.0,0.0,0.0,...,0.233571,0.0,0.76446,0.0,0.0,0.226174,0.246433,0.0,0.0,1.0


## Recommend Items for a Specific User

In [11]:
# Function to recommend items
def recommend_items(user_id, user_item_matrix, item_similarity, num_recommendations=5):
    """Recommend items the user has not rated yet.

    Every item is scored as the similarity-weighted sum of the user's ratings
    (unrated items count as 0) divided by that item's total similarity to all
    items. Items the user already rated are dropped before ranking.

    Args:
    - user_id: Row label of the user in user_item_matrix.
    - user_item_matrix (DataFrame): Users as rows, items as columns.
    - item_similarity (DataFrame): Item-by-item similarity matrix.
    - num_recommendations (int): Maximum number of items to return.

    Returns:
    - list: Labels of the highest-scoring unrated items, best first.
    """
    # Line the user's ratings up with the similarity matrix's item order
    aligned_ratings = user_item_matrix.loc[user_id].reindex(item_similarity.index)
    not_yet_rated = aligned_ratings.isna()

    weighted_sum = item_similarity.dot(aligned_ratings.fillna(0))
    similarity_total = item_similarity.sum(axis=1)

    ranked = weighted_sum.div(similarity_total)[not_yet_rated].nlargest(num_recommendations)
    return ranked.index.tolist()

# Pick one user and list the top content-based suggestions for them
user_id = 1  # Example user ID
recommendations = recommend_items(
    user_id,
    user_item_matrix,
    item_similarity_df,
    num_recommendations=5,
)
print(f"Recommended items for User {user_id}: {recommendations}")


Recommended items for User 1: [941, 2477, 2537, 4988, 5357]


## Predict Ratings Using SVD

### Predicting Ratings Using SVD (Matrix Factorization)

#### **1. What is Matrix Factorization?**

**Matrix Factorization** is a class of collaborative filtering algorithms used to uncover the latent features that explain observed user-item interactions (like ratings). It works by decomposing the user-item matrix into lower-dimensional matrices that capture the underlying structure of the data.

Matrix factorization is particularly effective because it can handle large-scale data and is capable of discovering hidden relationships between users and items. One of the most popular matrix factorization techniques in recommendation systems is **Singular Value Decomposition (SVD)**.

#### **2. What is Singular Value Decomposition (SVD)?**

**Singular Value Decomposition (SVD)** is a mathematical technique used to factor a matrix into the product of three matrices:

$$
R = U \Sigma V^T
$$

where:
- \(R\) is the original user-item matrix (with users as rows and items as columns).
- \(U\) is an orthogonal matrix representing users' latent factors.
- \(\Sigma\) is a diagonal matrix containing singular values that indicate the importance of each latent feature.
- \(V^T\) (transpose of \(V\)) is an orthogonal matrix representing items' latent factors.

The decomposition expresses the original matrix \(R\) as a product of the three matrices, capturing the underlying patterns in the data.

#### **3. How SVD is Used for Recommendations?**

In recommendation systems, SVD is used to predict missing values (unknown ratings) in the user-item matrix by leveraging the latent factors discovered during decomposition. The steps are as follows:

1. **Decompose the User-Item Matrix**:
   - Apply SVD to decompose the original matrix \(R\) into three matrices: \(U\), \(\Sigma\), and \(V^T\).

2. **Reduce Dimensionality**:
   - Truncate the matrices \(U\), \(\Sigma\), and \(V^T\) to keep only the top \(k\) singular values in \(\Sigma\), together with the corresponding columns of \(U\) and rows of \(V^T\). This step reduces the noise and keeps only the most significant latent factors:
   $$
   R \approx U_k \Sigma_k V_k^T
   $$
   where \(k\) is the number of latent factors.

3. **Reconstruct the Matrix**:
   - Use the truncated matrices to reconstruct an approximation of the original matrix:
   $$
   \hat{R} = U_k \Sigma_k V_k^T
   $$
   where \(\hat{R}\) represents the predicted user-item matrix with filled-in ratings.

4. **Predict Missing Ratings**:
   - Use the reconstructed matrix \(\hat{R}\) to predict unknown ratings. For user \(u\) and item \(i\), the predicted rating \(\hat{r}_{ui}\) is obtained from the corresponding entry in the reconstructed matrix.

#### **4. Advantages of Using SVD for Recommendations**

- **Dimensionality Reduction**: SVD reduces the number of features while retaining the most significant information, making computations more efficient and less prone to overfitting.
- **Captures Latent Features**: SVD captures hidden patterns and relationships between users and items, such as user preferences and item characteristics.
- **Improves Prediction Accuracy**: By leveraging the latent factors, SVD can provide more accurate predictions for missing ratings.

#### **5. Challenges of Using SVD**

- **Computational Complexity**: SVD can be computationally expensive for very large matrices, especially when the number of users or items is large.
- **Data Sparsity**: SVD requires a reasonably dense user-item matrix to produce reliable latent factors. In cases of extreme sparsity, SVD may not perform well.
- **Cold Start Problem**: SVD, like other collaborative filtering methods, suffers from the cold start problem, where it cannot generate accurate recommendations for new users or items with no historical data.

#### **6. Implementation Steps in Python**

To implement SVD for predicting ratings, follow these steps:

1. **Prepare the User-Item Matrix**:
   - Fill missing values with zeros (or another suitable value) to perform SVD.

2. **Apply SVD**:
   - Use the `scipy.sparse.linalg.svds` function to compute the matrices \(U_k\), \(\Sigma_k\), and \(V_k^T\).

3. **Reconstruct the Matrix**:
   - Multiply the truncated matrices to get the predicted ratings matrix.

4. **Predict Ratings**:
   - Use the reconstructed matrix to predict the ratings for unseen items.

#### **Conclusion**

Using SVD for matrix factorization is a powerful approach for predicting ratings in recommendation systems. It leverages the hidden patterns in the data to provide personalized recommendations. However, it requires careful handling of data sparsity and computational resources to perform efficiently in large-scale applications.
This section provides a comprehensive understanding of the theory behind using SVD for predicting ratings in recommendation systems.

In [15]:
# Function to perform SVD for recommendation
def svd_recommendation(user_item_matrix, k=50):
    """
    Perform Matrix Factorization using Singular Value Decomposition (SVD).

    Missing ratings are replaced by 0 before the decomposition, and the
    rank-k reconstruction is returned as the predicted ratings.

    Args:
    - user_item_matrix (DataFrame): The user-item matrix.
    - k (int): Number of singular values and vectors to compute. Values above
      min(n_rows, n_columns) - 1 are reduced to that limit, so the default
      also works on small matrices.

    Returns:
    - predicted_ratings (DataFrame): Predicted ratings after matrix
      factorization, with the same index and columns as user_item_matrix.

    Raises:
    - ValueError: If k is smaller than 1, or the matrix has fewer than two
      rows or two columns.
    """
    # Fill NaN values with 0 for SVD
    user_item_matrix_filled = user_item_matrix.fillna(0)

    # Convert to a float array; svds does not accept integer input
    user_ratings_matrix = user_item_matrix_filled.to_numpy(dtype=float)

    # svds (default solver) can compute at most min(shape) - 1 singular triplets
    max_k = min(user_ratings_matrix.shape) - 1
    if max_k < 1:
        raise ValueError(
            "user_item_matrix needs at least 2 rows and 2 columns for truncated SVD"
        )
    if k < 1:
        raise ValueError("k must be at least 1")
    k = min(int(k), max_k)

    # Perform SVD
    u, sigma, vt = svds(user_ratings_matrix, k=k)

    # Scaling the columns of u by sigma equals multiplying by diag(sigma)
    predicted_ratings_matrix = (u * sigma) @ vt

    # Convert to DataFrame
    predicted_ratings = pd.DataFrame(
        predicted_ratings_matrix,
        index=user_item_matrix.index,
        columns=user_item_matrix.columns,
    )

    return predicted_ratings

# Factorise the user-item matrix with 50 latent factors and preview the result
predicted_ratings_svd = svd_recommendation(user_item_matrix, 50)
print("Predicted Ratings (SVD):")
display(predicted_ratings_svd.head(5))


Predicted Ratings (SVD):


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.181872,0.393674,0.838186,-0.082365,-0.546279,2.521662,-0.887231,-0.025221,0.196969,1.606758,...,-0.024984,-0.021415,-0.028553,-0.028553,-0.024984,-0.028553,-0.024984,-0.024984,-0.024984,-0.058988
2,0.209809,0.004821,0.030742,0.017252,0.183764,-0.06066,0.083306,0.023797,0.0481,-0.151968,...,0.018895,0.016196,0.021594,0.021594,0.018895,0.021594,0.018895,0.018895,0.018895,0.031966
3,0.013394,0.034726,0.050525,0.0002,-0.005577,0.114673,-0.007461,0.000738,0.004747,-0.061284,...,-0.001612,-0.001382,-0.001843,-0.001843,-0.001612,-0.001843,-0.001612,-0.001612,-0.001612,-0.00053
4,2.012072,-0.394882,-0.290386,0.093864,0.123312,0.259765,0.472667,0.035965,0.011293,-0.021983,...,0.001966,0.001685,0.002247,0.002247,0.001966,0.002247,0.001966,0.001966,0.001966,-0.021462
5,1.336714,0.772954,0.064577,0.11388,0.274994,0.58448,0.251048,0.131534,-0.08631,1.035361,...,-0.004407,-0.003778,-0.005037,-0.005037,-0.004407,-0.005037,-0.004407,-0.004407,-0.004407,-0.006099
