In [1]:
# The first step is to load the dataset and get an Idea of what it is about  

In [2]:
import pandas as pd

# Loading movies and ratings data
movies = pd.read_csv('movies.csv')  # Replace with the correct path
ratings = pd.read_csv('ratings.csv')

# Exploring the datasets
print(movies.head())
print(ratings.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [3]:
# Now lets handle the missing values

In [4]:
print(movies.isnull().sum())
print(ratings.isnull().sum())

movieId    0
title      0
genres     0
dtype: int64
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [5]:
# There are none

In [6]:
# Splitting genres into lists
movies['genres'] = movies['genres'].str.split('|')

In [7]:
# Merging some data to reduce redundancy
data = pd.merge(ratings, movies, on='movieId')
print(data.head())


   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                              genres  
0  [Adventure, Animation, Children, Comedy, Fantasy]  
1                                  [Comedy, Romance]  
2                          [Action, Crime, Thriller]  
3                                [Mystery, Thriller]  
4                         [Crime, Mystery, Thriller]  


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Converting genres list back to strings for vectorization
movies['genres_str'] = movies['genres'].apply(lambda x: ' '.join(x))

# Vectorizing genres
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres_str'])

In [9]:
# You might be wondering what I did there ,I am too let me wrap my head around it

In [10]:
# This part will calculate similarity
from sklearn.metrics.pairwise import linear_kernel

# Computing cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [11]:
# This is the actual recommendation system

# Creating a movie-title to index mapping
movie_indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

def recommend_content_based(title, cosine_sim=cosine_sim):
    idx = movie_indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Top 10 recommendations
    movie_indices_rec = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices_rec]

# Example: Recommend similar movies
print(recommend_content_based("Toy Story (1995)"))


1706                                          Antz (1998)
2355                                   Toy Story 2 (1999)
2809       Adventures of Rocky and Bullwinkle, The (2000)
3000                     Emperor's New Groove, The (2000)
3568                                Monsters, Inc. (2001)
6194                                     Wild, The (2006)
6486                               Shrek the Third (2007)
6948                       Tale of Despereaux, The (2008)
7760    Asterix and the Vikings (Astérix et les Viking...
8219                                         Turbo (2013)
Name: title, dtype: object


In [12]:
# That was cool ,let's do that again

In [13]:
# Blue Sky (1994)
print(recommend_content_based("Blue Sky (1994)"))

24                            Leaving Las Vegas (1995)
27                                   Persuasion (1995)
42                How to Make an American Quilt (1995)
45                        When Night Is Falling (1995)
66                                 Bed of Roses (1996)
75     Once Upon a Time... When We Were Colored (1995)
76                           Angels and Insects (1995)
93               Bridges of Madison County, The (1995)
115                       Up Close and Personal (1996)
151                                    Mad Love (1995)
Name: title, dtype: object


In [14]:
# Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
print(recommend_content_based("Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)"))

53                     Indian in the Cupboard, The (1995)
109                     NeverEnding Story III, The (1994)
767                       Escape to Witch Mountain (1975)
1514            Darby O'Gill and the Little People (1959)
1556                                  Return to Oz (1985)
1617                        NeverEnding Story, The (1984)
1618    NeverEnding Story II: The Next Chapter, The (1...
1799                        Santa Claus: The Movie (1985)
3574    Harry Potter and the Sorcerer's Stone (a.k.a. ...
6075    Chronicles of Narnia: The Lion, the Witch and ...
Name: title, dtype: object


In [15]:
# This creates a user matrix
user_movie_matrix = data.pivot(index='userId', columns='movieId', values='rating')

In [16]:
# Filling NaN with 0;albeit there probably isn't one
user_movie_matrix = user_movie_matrix.fillna(0)

In [17]:
# This performes singular value decomposition
from scipy.sparse.linalg import svds

# Performing SVD
U, sigma, Vt = svds(user_movie_matrix, k=50)

# Converting sigma to diagonal matrix
sigma = np.diag(sigma)


TypeError: type not understood

In [18]:
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import numpy as np

# Converting the user-item matrix to a sparse matrix
sparse_matrix = csr_matrix(user_movie_matrix)

# Performing SVD
U, sigma, Vt = svds(sparse_matrix, k=50)

# Converting sigma to a diagonal matrix
sigma = np.diag(sigma)


In [19]:
# Making Predictions

# Reconstructing the matrix
predicted_ratings = np.dot(np.dot(U, sigma), Vt)

# Converting to a DataFrame
predicted_df = pd.DataFrame(predicted_ratings, index=user_movie_matrix.index, columns=user_movie_matrix.columns)


In [20]:
# Recommendation system based on ratings

def recommend_collaborative(user_id, num_recommendations=10):
    # Getting the predicted ratings for the user
    user_predictions = predicted_df.loc[user_id]

    # Getting movies already rated by the user
    already_rated = user_movie_matrix.loc[user_id][user_movie_matrix.loc[user_id] > 0].index

    # Excluding already rated movies and sorting by predicted rating
    recommendations = user_predictions.drop(already_rated).sort_values(ascending=False).head(num_recommendations)

    # Getting movie titles for the recommended movies
    recommended_movies = movies[movies['movieId'].isin(recommendations.index)]['title']
    return recommended_movies

In [21]:
# Example: Recommend movies for user ID 234
user_id = 234
recommended_movies = recommend_collaborative(user_id=user_id)
print(f"Recommended movies for User ID {user_id}:")
print(recommended_movies)


Recommended movies for User ID 234:
325                           Mask, The (1994)
483     Nightmare Before Christmas, The (1993)
512                Beauty and the Beast (1991)
513                           Pinocchio (1940)
592                           Rock, The (1996)
615       Independence Day (a.k.a. ID4) (1996)
981                            Fantasia (1940)
1543                   Jungle Book, The (1967)
1628                        Beetlejuice (1988)
2078                   Sixth Sense, The (1999)
Name: title, dtype: object


In [22]:
# Model Evaluation

from sklearn.metrics import mean_squared_error

# Flatten actual and predicted matrices for comparison
actual = user_movie_matrix.values.flatten()
predicted = predicted_df.values.flatten()

# Calculate RMSE
rmse = mean_squared_error(actual, predicted, squared=False)
print(f"RMSE: {rmse}")

RMSE: 0.30609639616977785




In [23]:
import warnings

# Suppress all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [24]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Replace NaN values in the original matrix with zeros
actual = user_movie_matrix.fillna(0).values.flatten()
predicted = np.nan_to_num(predicted_df.values.flatten())  # Replace NaNs in predictions with zeros

# Calculate RMSE
rmse = mean_squared_error(actual, predicted, squared=False)
print(f"RMSE: {rmse}")


RMSE: 0.30609639616977785


In [25]:
print("Actual shape:", actual.shape)
print("Predicted shape:", predicted.shape)
print("Any NaN in actual?", np.isnan(actual).any())
print("Any NaN in predicted?", np.isnan(predicted).any())

Actual shape: (5931640,)
Predicted shape: (5931640,)
Any NaN in actual? False
Any NaN in predicted? False
