In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the merged MovieLens table; the shape/head previews further down show one
# row per rating with movie and user columns attached.
movies = pd.read_csv('/content/merged_movielens.csv')

In [3]:
# Size check: (rows, columns) of the table as loaded.
movies.shape

(1000209, 10)

In [4]:
# Preview the first rows to confirm the column layout.
movies.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Gender,Age,Occupation,Zip-code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067


In [5]:
# Count rows that repeat an earlier row in every column.
duplicate_mask = movies.duplicated()
num_duplicates = duplicate_mask.sum()
print("Number of duplicate rows:", num_duplicates)

Number of duplicate rows: 0


In [6]:
# Show every row for MovieID 3159: a movie appears once per rating it received.
print(movies.loc[movies["MovieID"].eq(3159)])

        UserID  MovieID  Rating   Timestamp                 Title  \
4247        32     3159       2   978122583  Fantasia 2000 (1999)   
5399        37     3159       5   978056151  Fantasia 2000 (1999)   
7271        51     3159       5   977947337  Fantasia 2000 (1999)   
10474       75     3159       5   977851099  Fantasia 2000 (1999)   
10755       78     3159       4   978570374  Fantasia 2000 (1999)   
...        ...      ...     ...         ...                   ...   
988166    5964     3159       1   956997144  Fantasia 2000 (1999)   
989130    5972     3159       3   966265800  Fantasia 2000 (1999)   
991788    5991     3159       3  1000092704  Fantasia 2000 (1999)   
995847    6015     3159       4   956778826  Fantasia 2000 (1999)   
998949    6036     3159       5   956709739  Fantasia 2000 (1999)   

                              Genres Gender  Age  Occupation Zip-code  
4247    Animation|Children's|Musical      F   25           0    19355  
5399    Animation|Children'

In [7]:
# Collapse the per-rating table to one row per movie (first occurrence wins),
# renumber the rows from 0, and remember the MovieID order.
deduplicated = movies.drop_duplicates(subset=["MovieID"])
movies = deduplicated.reset_index(drop=True)
movie_ids = movies["MovieID"].tolist()


In [8]:
# Replace the '|' genre separator with spaces and build the text used for
# TF-IDF: "<Title> <Genres>".
# The vectorized replace is literal (regex=False, since '|' is a regex
# metacharacter) and passes missing values through instead of raising the way
# a Python-level x.replace would. Missing titles/genres are then filled with ''
# so a single gap cannot turn the whole Content string into NaN.
movies["Genres"] = movies["Genres"].str.replace('|', ' ', regex=False).fillna('')
movies["Content"] = movies["Title"].fillna('') + ' ' + movies["Genres"]

In [9]:
# Fit a TF-IDF vectorizer (English stop words removed) on the combined
# title + genre text; content_matrix gets one row per row of `movies`.
tfidf = TfidfVectorizer(stop_words='english')
content_matrix = tfidf.fit_transform(movies['Content'])

In [10]:
# Guard the positional pairing between movie_ids and content_matrix rows, which
# later cells rely on when indexing one with positions taken from the other.
# Only the lengths are compared here.
assert len(movie_ids) == content_matrix.shape[0], "Mismatch between movie IDs and TF-IDF matrix"


In [11]:
# Ratings table; later cells read its UserID, MovieID and Rating columns to
# build the user profile and to filter out already-rated movies.
rating_df = pd.read_csv('/content/CF_data.csv')

In [12]:
# Helper function
def get_user_build(rating_df,user_id,movie_ids,content_matrix):
  """Build a rating-weighted TF-IDF profile for one user.

  The profile is the column-wise mean of the TF-IDF rows of every movie the
  user rated 4 or higher, each row scaled by that rating.

  Parameters
  ----------
  rating_df : pandas.DataFrame
      Ratings table; the "UserID", "MovieID" and "Rating" columns are read.
  user_id : scalar
      Value matched against the "UserID" column.
  movie_ids : sequence
      MovieIDs in the same order as the rows of ``content_matrix``.
  content_matrix : sparse matrix
      One row per entry of ``movie_ids``; must support row indexing with a
      list and ``.multiply`` (e.g. the output of ``TfidfVectorizer``).

  Returns
  -------
  numpy.ndarray
      Array of shape (1, n_features). All zeros when the user has no rating
      of 4 or higher on a movie that is present in ``movie_ids``.
  """
  # Map each MovieID to its first row position in a single pass; calling
  # list.index per rating would rescan the whole list every time.
  row_of = {}
  for position, movie_id in enumerate(movie_ids):
    row_of.setdefault(movie_id, position)

  is_user = rating_df["UserID"] == user_id
  is_liked = rating_df["Rating"] >= 4
  liked = rating_df[is_user & is_liked]

  # MovieIDs absent from movie_ids map to NaN and are skipped rather than
  # aborting the whole profile with a ValueError.
  rows = liked["MovieID"].map(row_of)
  known = rows.notna().to_numpy()
  indices = rows.to_numpy()[known].astype(int).tolist()

  n_features = content_matrix.shape[1]
  if not indices:
    # Nothing to average: return a neutral profile instead of dividing by
    # zero rows.
    return np.zeros((1, n_features))

  tfidf_rows = content_matrix[indices]
  # Column vector so each selected row is scaled by its own rating.
  ratings = liked["Rating"].to_numpy()[known].reshape(-1,1)
  user_profile = tfidf_rows.multiply(ratings).mean(axis=0)
  # The mean may come back as numpy.matrix; normalise to a plain (1, n) array.
  return np.asarray(user_profile).reshape(1, -1)


In [13]:
# Build the profile for user 1, then score every movie by the cosine similarity
# between that profile and the movie's TF-IDF row.
user_profile = get_user_build(
    rating_df,
    user_id=1,
    movie_ids=movies["MovieID"].tolist(),
    content_matrix=content_matrix,
)
scores = cosine_similarity(
    user_profile.reshape(1, -1),
    content_matrix,
).flatten()

In [14]:


# Spot-check the score vector: its distinct values and its length.
print("Unique similarity scores:", np.unique(scores))
print("Shape of scores:", scores.shape)


Unique similarity scores: [0.         0.00626179 0.00632714 ... 0.50728184 0.51288385 0.51854419]
Shape of scores: (3706,)


In [15]:
# Pick the five best-scoring movies that user 1 has not rated yet.
user_id = 1

# Every MovieID this user has rated, whatever the rating value.
user_rated_movies = rating_df.loc[rating_df["UserID"] == user_id, "MovieID"].tolist()
seen = set(user_rated_movies)

# Positions into movie_ids, ordered from highest to lowest score.
top_n = scores.argsort()[::-1]

# Keep the ranking order, drop already-rated movies, then cut to five.
unseen_ranked = [movie_ids[i] for i in top_n if movie_ids[i] not in seen]
top_5_recommendations = unseen_ranked[:5]

print(top_5_recommendations)


[1688, 3159, 2081, 2559, 631]


In [16]:
# Print the title next to each recommended MovieID.
for movie_id in top_5_recommendations:
    is_match = movies['MovieID'] == movie_id
    title = movies.loc[is_match, 'Title'].iloc[0]
    print(f"{title} (ID: {movie_id})")

Anastasia (1997) (ID: 1688)
Fantasia 2000 (1999) (ID: 3159)
Little Mermaid, The (1989) (ID: 2081)
King and I, The (1999) (ID: 2559)
All Dogs Go to Heaven 2 (1996) (ID: 631)


In [18]:
# Pair each MovieID with the content-based score computed for user 1; both
# sequences follow the same positional order.
cbf_score = scores
cbf_df = pd.DataFrame({'MovieID': movie_ids, 'cbf_score': cbf_score})

cbf_df.head()

Unnamed: 0,MovieID,cbf_score
0,1193,0.13094
1,661,0.336557
2,914,0.15126
3,3408,0.119153
4,2355,0.430692


In [19]:
# Persist the scores for user 1.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if downstream readers do not expect it — TODO
# confirm against whatever consumes this file.
cbf_df.to_csv('/content/cbf_scores_user_1.csv')