In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error

In [5]:
# Sample dataset
movies = pd.DataFrame({
    'movie_id': [1, 2, 3, 4],
    'title': ['Movie A', 'Movie B', 'Movie C', 'Movie D'],
    'genre': ['Action|Adventure', 'Action|Comedy', 'Comedy|Drama', 'Drama|Thriller']
})

ratings = pd.DataFrame({
    'user_id': [1, 1, 2, 2, 3, 3],
    'movie_id': [1, 2, 2, 3, 3, 4],
    'rating': [5, 4, 4, 3, 5, 4]
})

In [6]:
movies

Unnamed: 0,movie_id,title,genre
0,1,Movie A,Action|Adventure
1,2,Movie B,Action|Comedy
2,3,Movie C,Comedy|Drama
3,4,Movie D,Drama|Thriller


In [7]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,1,5
1,1,2,4
2,2,2,4
3,2,3,3
4,3,3,5
5,3,4,4


# Content Filtering

In [8]:
user_id = 1
movie_id = 1

In [9]:

user_ratings = ratings[ratings['user_id'] == user_id]
print(user_ratings)

   user_id  movie_id  rating
0        1         1       5
1        1         2       4


In [10]:
user_movies = movies[movies['movie_id'].isin(user_ratings['movie_id'])]
print(user_movies)

   movie_id    title             genre
0         1  Movie A  Action|Adventure
1         2  Movie B     Action|Comedy


In [11]:
# TF-IDF Vectorizer of genres
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movies['genre'])
print(tfidf_matrix)
print(tfidf.get_feature_names_out())

  (0, 1)	0.7852882757103967
  (0, 0)	0.6191302964899972
  (1, 2)	0.7071067811865475
  (1, 0)	0.7071067811865475
  (2, 3)	0.7071067811865475
  (2, 2)	0.7071067811865475
  (3, 4)	0.7852882757103967
  (3, 3)	0.6191302964899972
['action' 'adventure' 'comedy' 'drama' 'thriller']


In [12]:
cosine_similarity_matrix = cosine_similarity(tfidf_matrix) #each column is a movie genre
print(cosine_similarity_matrix)

[[1.         0.43779123 0.         0.        ]
 [0.43779123 1.         0.5        0.        ]
 [0.         0.5        1.         0.43779123]
 [0.         0.         0.43779123 1.        ]]


In [13]:
sim_scores = cosine_similarity_matrix[user_movies.index].mean(axis=0) #row-wise mean
print(sim_scores)

[0.71889562 0.71889562 0.25       0.        ]


In [14]:
# recomendation based on similarity textual by genre
movies.iloc[np.argsort(-sim_scores)]

Unnamed: 0,movie_id,title,genre
1,2,Movie B,Action|Comedy
0,1,Movie A,Action|Adventure
2,3,Movie C,Comedy|Drama
3,4,Movie D,Drama|Thriller


# Collaborative Filtering

In [15]:
# Create user-item matrix
user_item_matrix = ratings.pivot_table(index='user_id', columns='movie_id', values='rating')
print(user_item_matrix)

movie_id    1    2    3    4
user_id                     
1         5.0  4.0  NaN  NaN
2         NaN  4.0  3.0  NaN
3         NaN  NaN  5.0  4.0


In [24]:
user_item_matrix = user_item_matrix.fillna(0) #fill NaN with 0 to make cos sim works

In [16]:
# Latent Features
user_similarity = cosine_similarity(user_item_matrix.fillna(0))
print(user_similarity)

[[1.         0.49975604 0.        ]
 [0.49975604 1.         0.46852129]
 [0.         0.46852129 1.        ]]


In [18]:
similarity_vector = user_similarity[user_id - 1] # for the user_id = 1
print(similarity_vector)

[1.         0.49975604 0.        ]


In [25]:
user_ratings = user_item_matrix.values
print(user_ratings)

[[5. 4. 0. 0.]
 [0. 4. 3. 0.]
 [0. 0. 5. 4.]]


In [39]:
# Prediction: using users cosine similarity between their ratings, and then make a dot product with the user ratings
yhat_ratings = np.dot(similarity_vector, user_ratings) / np.sum(np.abs(similarity_vector)) #divide to normalize ratings
print(yhat_ratings)

[3.33387556 4.         0.99967466 0.        ]


In [43]:
np.argsort(-yhat_ratings) # order of the movies

array([1, 0, 2, 3])

In [45]:
#order the movies based on the predicted ratings for the user_id = 1
movies.iloc[np.argsort(-yhat_ratings)]

Unnamed: 0,movie_id,title,genre
1,2,Movie B,Action|Comedy
0,1,Movie A,Action|Adventure
2,3,Movie C,Comedy|Drama
3,4,Movie D,Drama|Thriller
