In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
ratings_df = pd.read_csv('./data/Dataset.csv')
movies_df = pd.read_csv('./data/Movie_Id_Titles.csv')

In [3]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100003 entries, 0 to 100002
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100003 non-null  int64
 1   item_id    100003 non-null  int64
 2   rating     100003 non-null  int64
 3   timestamp  100003 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [4]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   item_id  1682 non-null   int64 
 1   title    1682 non-null   object
dtypes: int64(1), object(1)
memory usage: 26.4+ KB


In [5]:
# Check the structure of the datasets
ratings_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [6]:
movies_df.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
# Merge the ratings and movies dataframes to get movie titles along with user ratings
movie_ratings = pd.merge(ratings_df, movies_df, left_on='item_id', right_on='item_id')
movie_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [9]:
# Create a user-movie rating matrix using pivot_table
user_movie_ratings = movie_ratings.pivot_table(index='user_id', columns='title', values='rating')

In [11]:
# Fill NaN values (unrated movies) with 0
user_movie_ratings = user_movie_ratings.fillna(0)

In [12]:
user_movie_ratings

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Print the user-movie rating matrix
user_movie_ratings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds

In [16]:
# Split the user-movie rating matrix into training and testing sets
train, test = train_test_split(user_movie_ratings, test_size=0.2, random_state=42)

In [17]:
# Function to calculate user-based collaborative filtering recommendations
def user_based_collaborative_filtering(user_movie_ratings, user_id, num_recommendations=5):
    user_ratings = user_movie_ratings.loc[user_id].dropna()
    similar_users = user_movie_ratings.corrwith(user_ratings, axis=1).sort_values(ascending=False)
    similar_users = similar_users.dropna()
    
    # Reset the index of user_movie_ratings
    user_movie_ratings_reset = user_movie_ratings.reset_index()
    
    # Join with the mean ratings of users based on index (user IDs)
    recommendations = pd.DataFrame(similar_users, columns=['correlation'])
    recommendations = recommendations.join(user_movie_ratings_reset.set_index('user_id').mean(axis=1).rename('mean_rating'))
    recommendations = recommendations.sort_values(by=['correlation', 'mean_rating'], ascending=False)
    return recommendations.head(num_recommendations)

In [18]:
user_movie_ratings

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
import pandas as pd

# Function for item-based collaborative filtering
def item_based_collaborative_filtering(user_movie_ratings, user_id, num_recommendations=5):
    # Get the target user's ratings
    user_ratings = user_movie_ratings.loc[user_id]
    
    # Check if the user has not rated any movies
    if user_ratings.dropna().empty:
        return "This user has not rated any movies, so item-based recommendations are not possible."
    
    # Calculate the correlation between the target user's ratings and all other movies
    similar_movies = user_movie_ratings.corrwith(user_ratings, axis=0).sort_values(ascending=False)
    similar_movies = similar_movies.dropna()
    
    # Create a DataFrame for recommendations
    recommendations = pd.DataFrame(similar_movies, columns=['correlation'])
    
    # Calculate the mean ratings of movies based on 'title'
    mean_ratings = user_movie_ratings.mean(axis=0)
    
    # Convert mean_ratings to a DataFrame and reset the index
    mean_ratings_df = mean_ratings.reset_index()
    
    # Join recommendations with the mean ratings
    recommendations = recommendations.join(mean_ratings_df, on='title', rsuffix='_mean')
    recommendations.columns = ['title', 'correlation', 'mean_rating']

    # Sort the recommendations by correlation and mean rating
    recommendations = recommendations.sort_values(by=['correlation', 'mean_rating'], ascending=False)
    
    return recommendations.head(num_recommendations)

In [20]:
# Perform Singular Value Decomposition (SVD) on the dense matrix 'train'
U, sigma, Vt = np.linalg.svd(train)

In [21]:
k = 20
U = U[:, :k]
sigma = np.diag(sigma[:k])
Vt = Vt[:k, :]

In [22]:
# Predict user ratings using SVD
predicted_ratings = np.dot(np.dot(U, sigma), Vt)

In [23]:
# Create a DataFrame for predicted ratings
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=train.columns)

In [24]:
# Function to make SVD-based recommendations
def svd_recommender(user_id, num_recommendations=5):
    user_ratings = predicted_ratings_df.loc[user_id]
    sorted_user_ratings = user_ratings.sort_values(ascending=False)
    user_rated_movies = train.loc[user_id]
    recommendations = sorted_user_ratings[~sorted_user_ratings.index.isin(user_rated_movies.index)]
    return recommendations.head(num_recommendations)

In [25]:
# Function to calculate MAE for SVD-based predictions
def calculate_mae(test_data, predicted_data):
    return mean_absolute_error(test_data, predicted_data)

In [26]:
# Example: Get user-based collaborative filtering recommendations for user with userId=0
user_id = 4
user_based_recommendations = user_based_collaborative_filtering(user_movie_ratings, user_id)
print(f"\nUser-Based Collaborative Filtering Recommendations for User {user_id}:")
print(user_based_recommendations)

# Example: Get item-based collaborative filtering recommendations for user with userId=0
item_based_recommendations = item_based_collaborative_filtering(user_movie_ratings, user_id)
print(f"\nItem-Based Collaborative Filtering Recommendations for User {user_id}:")
print(item_based_recommendations)

# Example: Get SVD-based recommendations for user with userId=0
svd_recommendations = svd_recommender(user_id)
print(f"\nSVD Recommendations for User {user_id}:")
print(svd_recommendations)


User-Based Collaborative Filtering Recommendations for User 4:
         correlation  mean_rating
user_id                          
4           1.000000     0.062500
750         0.387308     0.060096
570         0.367479     0.035457
451         0.364961     0.157752
509         0.351105     0.049279

Item-Based Collaborative Filtering Recommendations for User 4:
Empty DataFrame
Columns: [title, correlation, mean_rating]
Index: []

SVD Recommendations for User 4:
Series([], Name: 4, dtype: float64)
