<a href="https://colab.research.google.com/github/ParhamPishro/Solutions-of-FUM-Academic-Exercises/blob/main/Deep%20Learning/9_Recommender_System_for_Movie_using_Node2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install node2vec

Collecting node2vec
  Downloading node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Downloading node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Installing collected packages: node2vec
Successfully installed node2vec-0.5.0


In [None]:
import os
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO
from collections import defaultdict
import pandas as pd
import networkx as nx
from node2vec import Node2Vec

In [None]:
# Step 1: Download and Extract Data
url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
var_directory = 'var'

if not os.path.exists(var_directory):
    os.makedirs(var_directory)

with urlopen(url) as zurl:
    with ZipFile(BytesIO(zurl.read())) as zfile:
        zfile.extractall(var_directory)

print(f'Data has been downloaded and extracted to the {var_directory} directory.')

# Step 2: Load Data
ratings = pd.read_csv('var/ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'unix_timestamp'])
movies = pd.read_csv('var/ml-100k/u.item', sep='|', usecols=range(2), names=['movie_id', 'title'], encoding='latin-1')

# Step 3: Filter Ratings
ratings = ratings[ratings.rating >= 4]

# Step 4: Build Co-occurrence Pairs Without groupby
pairs = defaultdict(int)
unique_users = ratings['user_id'].unique()

for user_id in unique_users:
    user_movies = ratings[ratings['user_id'] == user_id]['movie_id'].values
    for i in range(len(user_movies)):
        for j in range(i + 1, len(user_movies)):
            pairs[(user_movies[i], user_movies[j])] += 1




Data has been downloaded and extracted to the var directory.


In [None]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [None]:
# Get all movie IDs
all_movie_ids = set(movies['movie_id'])

# Get movie IDs included in the graph
graph_movie_ids = set(G.nodes)

# Find excluded movies
excluded_movies = all_movie_ids - graph_movie_ids
print(f"Number of excluded movies: {len(excluded_movies)}")


Number of excluded movies: 235


In [None]:
# Step 5: Create Graph
G = nx.Graph()

for pair, count in pairs.items():
    movie1, movie2 = pair
    if count >= 20:  # Threshold for co-occurrence
        G.add_edge(movie1, movie2, weight=count)

print("Total number of graph nodes:", G.number_of_nodes())
print("Total number of graph edges:", G.number_of_edges())

# Step 6: Train Node2Vec Model
node2vec = Node2Vec(G, dimensions=64, walk_length=20, num_walks=200, p=2, q=1, workers=1)
model = node2vec.fit(window=10, min_count=1, batch_words=4)


Total number of graph nodes: 410
Total number of graph edges: 14936


Computing transition probabilities:   0%|          | 0/410 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 200/200 [00:45<00:00,  4.39it/s]


In [None]:
sorted_array = sorted(unique_users)
print(sorted_array)


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 22

In [None]:
# step7
# Filter out movies the user has already rated
def recommend_for_user(user_id, top_n=10):
    """
    Recommend movies for a user based on their preferences using Node2Vec.

    Parameters:
        user_id (int): The ID of the user to recommend movies for.
        top_n (int): Number of recommendations to return.
    """
    # Get the user's rated movies
    user_ratings = ratings[ratings.user_id == user_id]
    highly_rated_movies = user_ratings['movie_id'].values  # All rated movies

    if len(highly_rated_movies) == 0:
        print(f"User {user_id} has no rated movies.")
        return

    # Aggregate embeddings for the user's highly rated movies
    user_vector = None
    for movie_id in highly_rated_movies:
        str_movie_id = str(movie_id)
        if str_movie_id in model.wv:
            if user_vector is None:
                user_vector = model.wv[str_movie_id].copy()
            else:
                user_vector += model.wv[str_movie_id]

    if user_vector is None:
        print(f"No embeddings found for User {user_id}'s rated movies.")
        return

    # Find similar movies using aggregated embedding
    similar_movies = model.wv.similar_by_vector(user_vector, topn=top_n + len(highly_rated_movies))

    # Filter out movies the user has already rated
    recommended_movies = []
    for movie_id, similarity in similar_movies:
        int_movie_id = int(movie_id)
        if int_movie_id not in highly_rated_movies:
            title = movies[movies.movie_id == int_movie_id].title.values[0]
            recommended_movies.append((title, similarity))
        if len(recommended_movies) == top_n:
            break

    # Display recommendations
    print(f"Recommendations for User {user_id}:")
    for title, similarity in recommended_movies:
        print(f"{title}: {similarity:.2f}")


In [None]:
# Step 8: Test the Recommendation System
recommend_for_user(user_id=403, top_n=5)

Recommendations for User 403:
Ransom (1996): 0.68
Men in Black (1997): 0.66
Lost World: Jurassic Park, The (1997): 0.65
Sleepers (1996): 0.64
Sabrina (1995): 0.64


In [None]:
for i in sorted_array:
  recommend_for_user(user_id = i)

Recommendations for User 1:
Fish Called Wanda, A (1988): 0.66
Recommendations for User 2:
Amistad (1997): 0.68
Recommendations for User 3:
Amistad (1997): 0.83
Recommendations for User 4:
Fly Away Home (1996): 0.69
Recommendations for User 5:
Terminator, The (1984): 0.69
Recommendations for User 6:
Birds, The (1963): 0.71
Recommendations for User 7:
Groundhog Day (1993): 0.65
Recommendations for User 8:
Terminator 2: Judgment Day (1991): 0.74
Recommendations for User 9:
To Kill a Mockingbird (1962): 0.62
Recommendations for User 10:
Godfather: Part II, The (1974): 0.71
Recommendations for User 11:
When Harry Met Sally... (1989): 0.73
Recommendations for User 12:
Shawshank Redemption, The (1994): 0.70
Recommendations for User 13:
When Harry Met Sally... (1989): 0.65
Recommendations for User 14:
Shawshank Redemption, The (1994): 0.69
Recommendations for User 15:
Amistad (1997): 0.65
Recommendations for User 16:
Dances with Wolves (1990): 0.72
Recommendations for User 17:
Lone Star (1996)

Using movies as graph nodes is suitable for recommending movies to users because it models the general relationships between movies based on the users shared ratings. By aggregating the embeddings of movies that a user has rated, a "user preference vector" can be created and similar movies can be recommended. This approach is simple and effective for general recommendations, especially on large datasets. However, it may not provide strong personalization and may be biased towards popular movies. If personalization is of high importance, using a bipartite graph that includes both users and movies as nodes better reflects each user's specific preferences, but increases the complexity of the system. For most cases, using movies as nodes is a good option for general and scalable recommendations.

## second method

In [None]:
import networkx as nx

# Create a bipartite graph
G = nx.Graph()

# Add edges between users and movies based on ratings
for _, row in ratings.iterrows():
    user_node = f"user_{row['user_id']}"  # Prefix user IDs
    movie_node = row['movie_id']
    if row['rating'] >= 4:  # Include only high ratings
        G.add_edge(user_node, movie_node, weight=row['rating'])  # Weight is the rating

# Print graph stats
print(f"Total number of nodes: {G.number_of_nodes()}")
print(f"Total number of edges: {G.number_of_edges()}")


Total number of nodes: 2389
Total number of edges: 55375


In [None]:
# Train Node2Vec
node2vec = Node2Vec(G, dimensions=64, walk_length=20, num_walks=200, p=2, q=1, workers=1)
model = node2vec.fit(window=10, min_count=1, batch_words=4)


Computing transition probabilities:   0%|          | 0/2389 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 200/200 [03:28<00:00,  1.04s/it]


In [None]:
def recommend_for_user_bipartite(user_id, top_n=30):
    user_node = f"user_{user_id}"
    if user_node not in model.wv:
        print(f"No embedding found for User {user_id}.")
        return

    # Find similar items to the user node
    similar_items = model.wv.most_similar(user_node, topn=top_n)

    print(f"Recommendations for User {user_id}:")
    for item_id, similarity in similar_items:
        if item_id.startswith("user_"):
            continue  # Skip user nodes
        title = movies[movies.movie_id == int(item_id)].title.values[0]
        print(f"{title}: {similarity:.2f}")


In [None]:
recommend_for_user_bipartite(user_id=50)


Recommendations for User 50:
I Shot Andy Warhol (1996): 0.65
Basquiat (1996): 0.63
Pillow Book, The (1995): 0.62
Anne Frank Remembered (1995): 0.58
Lost Highway (1997): 0.58
Young Poisoner's Handbook, The (1995): 0.58
