In [3]:
!pip install pandas scikit-learn surprise


Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 23.5 MB/s eta 0:00:01
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=3366956 sha256=95164681ea9ae084b23c33c92c5dad6219e1d08758a73b547a58ad9b6e327c8d
  Stored in directory: /home/hduser/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [7]:
import pandas as pd

movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")


In [8]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
1,1,306,3.5,1.147869e+09
2,1,307,5.0,1.147869e+09
3,1,665,5.0,1.147879e+09
4,1,899,3.5,1.147869e+09
...,...,...,...,...
1144865,7721,1544,3.0,9.442212e+08
1144866,7721,1552,5.0,9.441464e+08
1144867,7721,1573,5.0,9.441464e+08
1144868,7721,1580,3.0,9.441459e+08


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
movies['clean_genres'] = movies['genres'].str.replace('\|', ' ', regex=True)

In [11]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['clean_genres'])

In [13]:
# Create a matrix of zeros of shape (num_users, num_genres_features)
user_profiles = pd.DataFrame(index=ratings['userId'].unique(), columns=tfidf_vectorizer.get_feature_names_out(), data=0.0)

for index, row in ratings.iterrows():
    # Get the index of the movie in the original movies DataFrame
    movie_idx = movies.index[movies['movieId'] == row['movieId']].tolist()[0]
    
    # Add the weighted genres to the user's profile
    user_profiles.loc[row['userId']] += tfidf_matrix[movie_idx].toarray().flatten() * row['rating']

# Normalize the user profiles
user_profiles = user_profiles.div(user_profiles.sum(axis=1), axis=0)


In [14]:
def recommend_movies(user_id, user_profiles, tfidf_matrix, movies, top_n=10):
    # Compute cosine similarity between user profile and all movie genre vectors
    user_vector = user_profiles.loc[user_id].values.reshape(1, -1)
    cosine_sim = cosine_similarity(user_vector, tfidf_matrix)

    # Get indices of the top_n most similar movies
    top_movie_indices = cosine_sim.argsort().flatten()[-top_n:][::-1]

    # Fetch the movie titles based on the indices
    recommended_movies = movies.iloc[top_movie_indices]
    return recommended_movies[['title', 'genres']]


In [17]:
user_id = 1  # Assuming this is a valid user ID in your dataset
recommended_movies = recommend_movies(user_id, user_profiles, tfidf_matrix, movies, top_n=10)
print("Recommended movies for user", user_id, ":\n", recommended_movies)


Recommended movies for user 1 :
                                    title genres
16722  Brink of Life (Nära livet) (1958)  Drama
17575                      Aurora (2010)  Drama
50513           Closed For Winter (2009)  Drama
50510       An Ordinary Execution (2010)  Drama
17555              Chak De India! (2007)  Drama
17558             Cow, The (Gaav) (1969)  Drama
17560                 Local Color (1977)  Drama
17570              Iron Lady, The (2011)  Drama
50498              Untold Scandal (2003)  Drama
50496                    U-Carmen (2005)  Drama
