### This notebook is intended to construct a Neo4j (https://neo4j.com/) graph using the MovieLens dataset.
### Note: The process may take some time to complete.

In [2]:
!pip install -q py2neo==2021.2.4

[0m

In [3]:
import os
import pickle

import pandas as pd
from py2neo import Graph, Node, Relationship

In [4]:
# Neo4j connection details.
# Each value is read from an environment variable when it is set, so real
# credentials never have to be typed into (and saved with) the notebook.
# The fallbacks are the placeholders to replace when no variable is set.
uri = os.environ.get("NEO4J_URI", "YOUR_URI")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "YOUR_PASSWORD")

# Shared connection handle used by every cell below.
graph = Graph(uri, auth=(username, password))

In [5]:
# Read the preprocessed dataset; the cells below use its "umap" and "smap"
# entries to translate raw user / movie ids into the ids stored in the graph.
# NOTE: unpickling executes code embedded in the file -- only load trusted files.
with open("./LlamaRec/data/preprocessed/ml-100k_min_rating0-min_uc5-min_sc5/dataset.pkl", "rb") as fin:
    preprocessed_data = pickle.loads(fin.read())

# Sanity check: size of the user mapping, then of the movie mapping
for mapping_name in ("umap", "smap"):
    print(len(preprocessed_data[mapping_name]))

610
3650


In [6]:
# Read the ratings file with explicit column names; the first parsed row is
# then dropped (it is the file's own header line, read as data because
# `names` is supplied).
ratings_df = pd.read_csv(
    "./ml-latest-small/ratings.csv",
    sep=',',
    names=['userId', 'movieId', 'rating', 'timestamp'],
).iloc[1:]

# Distinct raw user ids appearing in the ratings
users_id = list(set(ratings_df["userId"].to_list()))
print(len(users_id))

610


In [7]:
# Add users to the graph
def addingUser(users_id):
    """Merge a ``User`` node into the graph for every known user.

    Args:
        users_id: iterable of raw user ids (any value ``int()`` accepts).

    Raw ids missing from ``preprocessed_data["umap"]`` are skipped; the others
    are stored under their mapped id, converted to a string.
    """
    for raw_user_id in users_id:
        if int(raw_user_id) not in preprocessed_data["umap"]:
            continue
        mapped_id = preprocessed_data["umap"][int(raw_user_id)]
        # graph.merge keys on (label, "id"), so re-running does not duplicate nodes
        graph.merge(Node("User", id=str(mapped_id)), "User", "id")
        
addingUser(users_id)

# Verification: count the User nodes now stored in the graph
query = (
    "MATCH (n:User)\n"
    "RETURN COUNT(n) AS node_count"
)
graph.run(query)

node_count
610


In [8]:
# Load movies
movies_df = pd.read_csv("./ml-latest-small/movies.csv",
                        names=['movieId', 'title', 'genres'],
                        sep=',')
# Drop the first parsed row (the file's own header line, read as data because
# `names` is supplied). copy() makes the column assignments below act on an
# independent frame instead of a slice of the parsed one.
movies_df = movies_df.iloc[1:].copy()

# Create separate columns for year and title.
# The raw title is expected to look like "Some Title (1995)". A regex is used
# instead of fixed-offset slicing so that trailing whitespace, a year range
# such as "(2006-2007)" (the first year is kept), or a missing year do not
# yield a garbage year or a truncated title.
title_parts = movies_df["title"].str.extract(
    r"^\s*(?P<title>.*?)\s*\((?P<year>\d{4})(?:\s*[-–]\s*\d{4})?\)\s*$"
)
# Titles without a trailing "(YYYY)" keep their full (stripped) text and get
# the placeholder year "unknown".
movies_df["year"] = title_parts["year"].fillna("unknown")
movies_df["title"] = title_parts["title"].fillna(movies_df["title"].str.strip())

# List of all genres
# NOTE(review): these are the distinct raw values of the `genres` column. If a
# cell holds several genres joined together, each combination becomes its own
# Genre node rather than one node per individual genre -- confirm this is
# intended before changing it, since the movie->genre linking uses the same
# raw value as the Genre id.
all_genres = list(set(movies_df["genres"].tolist()))

# Create mappings for movies and their associated properties
# (keys are the raw movieId values exactly as read from the CSV)
raw_movie_ids = movies_df["movieId"].tolist()
movies_id = dict(zip(raw_movie_ids, movies_df["title"].tolist()))
movies_gen = dict(zip(raw_movie_ids, movies_df["genres"].tolist()))
movies_year = dict(zip(raw_movie_ids, movies_df["year"].tolist()))

print(len(all_genres))
print(len((movies_id)))
print(len((movies_gen)))
print(len((movies_year)))

951
9742
9742
9742


In [9]:
# Persist the raw-movieId -> title mapping so it can be reloaded later in training
with open("./ml-latest-small/movies_ids_to_titles", "wb") as fout:
    fout.write(pickle.dumps(movies_id))

In [10]:
# Add genres to the graph
def addingGenre(all_genres):
    """Merge one ``Genre`` node per entry of ``all_genres`` into the graph.

    Args:
        all_genres: iterable of genre values; each value is used unchanged as
            the node's ``id`` property.
    """
    for genre_id in all_genres:
        # Merging on (label, "id") keeps repeated runs from duplicating nodes
        graph.merge(Node("Genre", id=genre_id), "Genre", "id")
        
addingGenre(all_genres)

# Verification: count the Genre nodes now stored in the graph
query = (
    "MATCH (n:Genre)\n"
    "RETURN COUNT(n) AS node_count"
)
graph.run(query)

node_count
951


In [11]:
# Add years to the graph
def addingYear(movies_year):
    """Merge one ``Year`` node per distinct release year of the kept movies.

    Args:
        movies_year: mapping of raw movie id (any value ``int()`` accepts) to
            that movie's release year.

    Only movies whose raw id appears in ``preprocessed_data["smap"]``
    contribute a year. Years are de-duplicated before touching the database so
    each distinct year costs a single merge instead of one merge per movie.
    """
    kept_movies = preprocessed_data["smap"]
    # dict.fromkeys de-duplicates while preserving first-seen order
    distinct_years = dict.fromkeys(
        str(year)
        for raw_movie_id, year in movies_year.items()
        if int(raw_movie_id) in kept_movies
    )
    for year_id in distinct_years:
        graph.merge(Node("Year", id=year_id), "Year", "id")
        
addingYear(movies_year)

# Verification: count the Year nodes now stored in the graph
query = (
    "MATCH (n:Year)\n"
    "RETURN COUNT(n) AS node_count"
)
graph.run(query)

node_count
97


In [12]:
# Add movies to the graph
def addingMovies(movies_id):
    """Merge a ``Movie`` node into the graph for every kept movie.

    Args:
        movies_id: mapping of raw movie id (any value ``int()`` accepts) to the
            movie title.

    Movies whose raw id is missing from ``preprocessed_data["smap"]`` are
    skipped. Each remaining movie is stored under its mapped id (as a string)
    together with its title.
    """
    for raw_movie_id, title in movies_id.items():
        if int(raw_movie_id) not in preprocessed_data["smap"]:
            continue
        mapped_id = preprocessed_data["smap"][int(raw_movie_id)]
        # Merge on (label, "id") so re-running does not create duplicates
        graph.merge(Node("Movie", id=str(mapped_id), title=str(title)), "Movie", "id")
        
addingMovies(movies_id)

# Verification: count the Movie nodes now stored in the graph
query = (
    "MATCH (n:Movie)\n"
    "RETURN COUNT(n) AS node_count"
)
graph.run(query)

node_count
3650


In [13]:
# Add bidirectional relationships between movies and genres to the graph
def connectMovieGenre(movies_gen):
    """Link every kept movie to its genre node, in both directions.

    Args:
        movies_gen: mapping of raw movie id (any value ``int()`` accepts) to
            the genre value of that movie. The value is used as-is (converted
            to ``str``) to look up the ``Genre`` node by its ``id``.

    Movies whose raw id is missing from ``preprocessed_data["smap"]`` are
    skipped. For the others a ``GENRE_IS`` (movie -> genre) and a
    ``GENRE_INCLUDES`` (genre -> movie) relationship are ensured.
    """
    # MERGE instead of CREATE: re-running the cell must not add a second copy
    # of each relationship (the nodes themselves are already merged, not
    # created). Both directions go in one statement to halve the round trips.
    link_query = (
        'MATCH (m:Movie {id: $movie_id}), (g:Genre {id: $gen_id}) '
        'MERGE (m)-[:GENRE_IS]->(g) '
        'MERGE (g)-[:GENRE_INCLUDES]->(m)'
    )
    kept_movies = preprocessed_data["smap"]

    for raw_movie_id, genre_id in movies_gen.items():
        raw_key = int(raw_movie_id)
        if raw_key not in kept_movies:
            continue
        graph.run(link_query, movie_id=str(kept_movies[raw_key]), gen_id=str(genre_id))
        
connectMovieGenre(movies_gen)

# Verification: count the relationships in each direction
# (movie -> genre first, then genre -> movie)
for query in (
    "MATCH (m:Movie)-[r:GENRE_IS]->(g:Genre)\nRETURN COUNT(r) AS relationship_count;",
    "MATCH (g:Genre)-[r:GENRE_INCLUDES]->(m:Movie)\nRETURN COUNT(r) AS relationship_count;",
):
    print(graph.run(query))

 relationship_count 
--------------------
               3650 

 relationship_count 
--------------------
               3650 



In [14]:
# Add bidirectional relationships between movies and years to the graph
def connectMovieYear(movies_year):
    """Link every kept movie to its release-year node, in both directions.

    Args:
        movies_year: mapping of raw movie id (any value ``int()`` accepts) to
            the release year of that movie. The value is converted to ``str``
            to look up the ``Year`` node by its ``id``.

    Movies whose raw id is missing from ``preprocessed_data["smap"]`` are
    skipped. For the others a ``RELEASED_YEAR_IS`` (movie -> year) and a
    ``YEAR_INCLUDES`` (year -> movie) relationship are ensured.
    """
    # MERGE instead of CREATE: re-running the cell must not add a second copy
    # of each relationship. Both directions go in one statement to halve the
    # number of round trips.
    link_query = (
        'MATCH (m:Movie {id: $movie_id}), (y:Year {id: $year_id}) '
        'MERGE (m)-[:RELEASED_YEAR_IS]->(y) '
        'MERGE (y)-[:YEAR_INCLUDES]->(m)'
    )
    kept_movies = preprocessed_data["smap"]

    for raw_movie_id, year_id in movies_year.items():
        raw_key = int(raw_movie_id)
        if raw_key not in kept_movies:
            continue
        graph.run(link_query, movie_id=str(kept_movies[raw_key]), year_id=str(year_id))
        
connectMovieYear(movies_year)

# Verification: count the relationships in each direction
# (movie -> year first, then year -> movie)
for query in (
    "MATCH (m:Movie)-[r:RELEASED_YEAR_IS]->(y:Year)\nRETURN COUNT(r) AS relationship_count;",
    "MATCH (y:Year)-[r:YEAR_INCLUDES]->(m:Movie)\nRETURN COUNT(r) AS relationship_count;",
):
    print(graph.run(query))

 relationship_count 
--------------------
               3650 

 relationship_count 
--------------------
               3650 



In [15]:
# Add rating relationships between users and movies to the graph
print(ratings_df.shape)

# MERGE instead of CREATE: re-running this (slow) cell, e.g. after an
# interruption, must not add a second RATED edge for ratings already stored.
rated_query = (
    'MATCH (u:User {id: $user_id}), (m:Movie {id: $movie_id}) '
    'MERGE (u)-[:RATED { rating: $rating, timestamp: $timestamp } ]->(m)'
)

for index, row in ratings_df.iterrows():
    raw_user_id = int(row["userId"])
    raw_movie_id = int(row["movieId"])

    # Only ratings whose user AND movie both survived preprocessing are stored
    if raw_user_id in preprocessed_data["umap"] and raw_movie_id in preprocessed_data["smap"]:
        # rating / timestamp are passed through exactly as read from the CSV
        # NOTE(review): these are likely strings, not numbers, because the
        # header line was parsed as a data row -- confirm before relying on
        # numeric comparisons in Cypher.
        graph.run(
            rated_query,
            user_id=str(preprocessed_data["umap"][raw_user_id]),
            movie_id=str(preprocessed_data["smap"][raw_movie_id]),
            rating=row["rating"],
            timestamp=row["timestamp"],
        )

        # Progress marker; printed only for stored rows, so some multiples of
        # 1000 are skipped in the output
        if index % 1000 == 0:
            print(index)

# Verification
query = """MATCH (u:User)-[r:RATED]->(m:Movie)
RETURN COUNT(r) AS relationship_count;"""
print(graph.run(query))

(100836, 4)
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
15000
16000
17000
18000
19000
20000
21000
22000
24000
25000
26000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
53000
54000
55000
56000
57000
58000
59000
61000
63000
64000
65000
66000
67000
71000
72000
73000
74000
76000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
95000
96000
98000
99000
 relationship_count 
--------------------
              90274 



In [16]:
# Read the movie metadata and keep only the id, directors and actors columns
movies_info_df = pd.read_csv("./ml-latest-small/movies_metadata.csv").loc[
    :, ["movieId", "directors", "actors"]
]
print(movies_info_df.shape)

(8269, 3)


In [17]:
# Add bidirectional relationships between movies, actors and directors to the graph

# MERGE instead of CREATE so that re-running the cell does not duplicate
# relationships; both directions are sent in one statement per person to
# halve the number of round trips.
directed_query = (
    'MATCH (m:Movie {id: $movie_id}), (d:Dir {id: $dir_id}) '
    'MERGE (m)-[:DIRECTED_BY]->(d) '
    'MERGE (d)-[:IS_THE_DIRECTOR_OF]->(m)'
)
acted_query = (
    'MATCH (m:Movie {id: $movie_id}), (a:Act {id: $act_id}) '
    'MERGE (m)-[:HAS_ACTOR]->(a) '
    'MERGE (a)-[:ACTED_IN]->(m)'
)

for index, row in movies_info_df.iterrows():
    if int(row["movieId"]) in preprocessed_data["smap"]:
        movie_id = str(preprocessed_data["smap"][int(row["movieId"])])

        # A missing cell is read as NaN (a float), so only string cells are
        # split; names inside a cell are separated by "|".
        if isinstance(row["directors"], str):
            for director_name in row["directors"].split("|"):
                # The person's name doubles as the node id
                graph.merge(Node("Dir", id=director_name), "Dir", "id")
                graph.run(directed_query, movie_id=movie_id, dir_id=director_name)

        if isinstance(row["actors"], str):
            for actor_name in row["actors"].split("|"):
                graph.merge(Node("Act", id=actor_name), "Act", "id")
                graph.run(acted_query, movie_id=movie_id, act_id=actor_name)

        # Progress marker; printed only for movies kept by preprocessing
        if index % 1000 == 0:
            print(index)

# Verification
query = """MATCH (m:Movie)-[r:DIRECTED_BY]->(d:Dir)
RETURN COUNT(r) AS relationship_count;"""
print(graph.run(query))

query = """MATCH (d:Dir)-[r:IS_THE_DIRECTOR_OF]->(m:Movie)
RETURN COUNT(r) AS relationship_count;"""
print(graph.run(query))

query = """MATCH (m:Movie)-[r:HAS_ACTOR]->(a:Act)
RETURN COUNT(r) AS relationship_count;"""
print(graph.run(query))

query = """MATCH (a:Act)-[r:ACTED_IN]->(m:Movie)
RETURN COUNT(r) AS relationship_count;"""
print(graph.run(query))

0
1000
2000
4000
8000
 relationship_count 
--------------------
               3344 

 relationship_count 
--------------------
               3344 

 relationship_count 
--------------------
               9220 

 relationship_count 
--------------------
               9220 

