In [13]:
from py2neo import Graph
import pandas as pd
import numpy as np

import os

# Step 1: Connect to Neo4j
# Connection settings come from the environment so real credentials never
# have to be saved inside the notebook. The fallbacks are the original
# local-development values, so an unconfigured machine behaves as before.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")
graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Step 2: Check if CO_ACTED_WITH relationships exist
print("🚀 Checking some CO_ACTED_WITH relationships...")

# Actually fetch a handful of relationships (the cell previously only printed
# the message). An empty frame here means the projection below has nothing
# to work with. LIMIT keeps the check cheap on a large graph.
graph.run("""
MATCH (a:Movie)-[r:CO_ACTED_WITH]->(b:Movie)
RETURN a.title AS movie_a, b.title AS movie_b, r.weight AS weight
LIMIT 5
""").to_data_frame()

🚀 Checking some CO_ACTED_WITH relationships...


In [3]:
print("🚀 Projecting movie graph into memory...")

# gds.graph.project raises if a graph named 'movieGraph' is already in the
# catalog, which made this cell fail on every re-run. Drop any earlier
# projection first; the second argument (failIfMissing = false) turns the
# drop into a no-op when the graph does not exist yet.
graph.run("CALL gds.graph.drop('movieGraph', false)").data()

# Project all Movie nodes with their CO_ACTED_WITH relationships (treated as
# undirected, carrying the 'weight' property). Kept as the last expression so
# the projection summary row is displayed as the cell output.
graph.run("""
CALL gds.graph.project(
  'movieGraph',
  'Movie',
  {
    CO_ACTED_WITH: {
      orientation: 'UNDIRECTED',
      properties: 'weight'
    }
  }
)
""")

🚀 Projecting movie graph into memory...


nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{Movie: {label: 'Movie', properties: {}}}","{CO_ACTED_WITH: {aggregation: 'DEFAULT', orientation: 'UNDIRECTED', indexInverse: false, properties: {weight: {aggregation: 'DEFAULT', property: 'weight', defaultValue: null}}, type: 'CO_ACTED_WITH'}}",movieGraph,271826,35606904,24809


In [4]:
print("🚀 Running PageRank and saving results chunk-by-chunk...")

# APOC periodic iterate version
# The first statement streams one (movie, score) row per projected node; the
# second statement writes the score onto the node. Writes are committed in
# batches of 10000 rows, sequentially (parallel: false).
# NOTE(review): the projection carries a 'weight' property, but no
# relationshipWeightProperty is passed here, so this looks like an unweighted
# PageRank -- confirm that is intended.
pagerank_query = """
CALL apoc.periodic.iterate(
  "
  CALL gds.pageRank.stream('movieGraph')
  YIELD nodeId, score
  RETURN gds.util.asNode(nodeId) AS movie, score
  ",
  "
  SET movie.pagerank = score
  ",
  {
    batchSize: 10000,
    parallel: false
  }
)
"""

# apoc.periodic.iterate reports failures in its single result row instead of
# raising, so read that row and stop loudly rather than announcing success
# after a partially written run.
pagerank_summary = graph.run(pagerank_query).data()[0]
if pagerank_summary["failedBatches"] or pagerank_summary["errorMessages"]:
    raise RuntimeError(
        "PageRank write-back failed for "
        f"{pagerank_summary['failedBatches']} batch(es): "
        f"{pagerank_summary['errorMessages']}"
    )

print("✅ PageRank calculation and writing finished!")

🚀 Running PageRank and saving results chunk-by-chunk...
✅ PageRank calculation and writing finished!


In [6]:
# Ten movies with the highest raw PageRank.
# Cypher sorts null values first under ORDER BY ... DESC, so any movie without
# a pagerank property would otherwise take the top slots; filter them out.
top_movie_query = """
MATCH (m:Movie)
WHERE m.pagerank IS NOT NULL
RETURN id(m) AS id, m.title AS title, m.pagerank AS pagerank
ORDER BY m.pagerank DESC
LIMIT 10
"""
top_movie = graph.run(top_movie_query).to_data_frame()
top_movie


Unnamed: 0,id,title,pagerank
0,75201,A Hitman in London,6.213969
1,54634,Desert Fiends,6.011403
2,102091,Papa,5.829158
3,94722,Bleach,5.812615
4,86732,Beyond the Game,5.760066
5,261176,Luck of the Draw,5.659724
6,339488,The Electric Man,5.487784
7,315785,The Butcher,5.456422
8,28582,Spreading Darkness,5.331515
9,103199,Hunting Season,5.32099


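The raw PageRank values do not seem accurate on their own, so we adjust them: each movie's PageRank is combined with its average rating and its vote count into a hybrid score, `hybrid_pagerank = pagerank * (1 + rating / 10) * log10(1 + votes)`.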

In [7]:
# Load the internal id, title and raw PageRank of every Movie node into pandas.
movie_pagerank_cypher = """
MATCH (m:Movie)
RETURN id(m) AS movie_id, m.title AS title, m.pagerank AS pagerank
"""
movie_pagerank_cursor = graph.run(movie_pagerank_cypher)
movies_df = movie_pagerank_cursor.to_data_frame()
movies_df.head()

Unnamed: 0,movie_id,title,pagerank
0,0,The Lift,0.152691
1,3,Adit Sopo Jarwo: The Movie,0.30684
2,4,Deadly Illusions,1.064401
3,5,9 fugas,0.245134
4,6,"Hurrah, We Are Still Alive!",0.689161


In [10]:
# Load the rating and vote count of every Movie node, keyed by internal id.
movie_rating_cypher = """
MATCH (m:Movie)
RETURN id(m) AS movie_id, m.rating AS rating, m.votes AS votes
"""
movie_rating_cursor = graph.run(movie_rating_cypher)
ratings_df = movie_rating_cursor.to_data_frame()
ratings_df.head()

Unnamed: 0,movie_id,rating,votes
0,0,8.8,27
1,3,5.7,135
2,4,5.8,274
3,5,5.6,28
4,6,4.7,34


In [11]:
# Attach rating/votes to each PageRank row; a left join on 'movie_id' keeps
# every row of movies_df even when no rating row matches.
full_df = movies_df.merge(ratings_df, how='left', on='movie_id')
full_df

Unnamed: 0,movie_id,title,pagerank,rating,votes
0,0,The Lift,0.152691,8.8,27
1,3,Adit Sopo Jarwo: The Movie,0.306840,5.7,135
2,4,Deadly Illusions,1.064401,5.8,274
3,5,9 fugas,0.245134,5.6,28
4,6,"Hurrah, We Are Still Alive!",0.689161,4.7,34
...,...,...,...,...,...
271821,344213,Lady Terror,1.212322,6.2,30
271822,344215,Dealova,0.501271,8.2,20
271823,344216,Lail Dakhli,0.308123,5.0,12
271824,344217,Tuesday 12,0.970620,6.4,70


In [15]:
# Hybrid score = pagerank * (1 + rating / 10) * log10(1 + votes).
# The two factors are named separately so the formula reads left to right.
rating_factor = 1 + full_df['rating'] / 10
votes_factor = np.log10(1 + full_df['votes'])
full_df['hybrid_pagerank'] = full_df['pagerank'] * rating_factor * votes_factor
full_df

Unnamed: 0,movie_id,title,pagerank,rating,votes,hybrid_pagerank
0,0,The Lift,0.152691,8.8,27,0.415421
1,3,Adit Sopo Jarwo: The Movie,0.306840,5.7,135,1.027807
2,4,Deadly Illusions,1.064401,5.8,274,4.102357
3,5,9 fugas,0.245134,5.6,28,0.559235
4,6,"Hurrah, We Are Still Alive!",0.689161,4.7,34,1.564244
...,...,...,...,...,...,...
271821,344213,Lady Terror,1.212322,6.2,30,2.928978
271822,344215,Dealova,0.501271,8.2,20,1.206279
271823,344216,Lail Dakhli,0.308123,5.0,12,0.514848
271824,344217,Tuesday 12,0.970620,6.4,70,2.946863


In [18]:
# Order all movies by hybrid score, highest first, and keep the first ten rows.
ranked_by_hybrid = full_df.sort_values(by="hybrid_pagerank", ascending=False)
top_10_movies = ranked_by_hybrid.iloc[:10]
top_10_movies

Unnamed: 0,movie_id,title,pagerank,rating,votes,hybrid_pagerank
67290,84772,The Hateful Eight,4.615038,7.8,690264,47.966162
211748,273626,Kill Bill: Vol. 1,3.971502,8.2,1248395,44.065249
39095,49094,The Grand Budapest Hotel,3.859574,8.1,924645,41.677288
230090,293540,Kill Bill: Vol. 2,3.883518,8.0,836470,41.399901
169927,229190,Thelma & Louise,4.426787,7.6,183207,41.004363
269697,341431,The Expendables,4.439504,6.4,372524,40.562396
174601,234166,Heat,3.765166,8.3,755882,40.504047
173635,233134,Pulp Fiction,3.142809,8.9,2329644,37.821117
271773,344156,The Dark Knight Rises,3.229418,8.4,1908873,37.321188
159284,218223,Blade Runner,3.468858,8.1,852816,37.237665


In [19]:
import os
from pathlib import Path

# Destination of the export. The absolute, user-specific path used to be
# hardcoded in the to_csv call; it is now only the default and can be
# overridden with the HYBRID_PAGERANK_EXPORT_PATH environment variable, so the
# notebook runs on other machines without editing code.
HYBRID_EXPORT_PATH = Path(os.environ.get(
    "HYBRID_PAGERANK_EXPORT_PATH",
    'C:/Users/Onur Ege/PycharmProjects/GraphDBProject/dataset/processed/movie_hybrid_pagerank_export.csv',
))

# Save only needed columns
hybrid_df = full_df[['movie_id', 'hybrid_pagerank']]

# Save to CSV (index=False: the frame's row index is not a column of the export)
hybrid_df.to_csv(HYBRID_EXPORT_PATH, index=False)

print("✅ Hybrid pagerank export CSV saved.")

✅ Hybrid pagerank export CSV saved.


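Run the Cypher statement below in Neo4j to load the exported CSV and store `hybrid_pagerank` on each movie. (The `file:///` URL is resolved by Neo4j itself, presumably relative to its import directory, so the exported CSV likely has to be copied there first.)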

// Load the exported hybrid PageRank scores back onto the Movie nodes.
// First statement: streams the rows of movie_hybrid_pagerank_export.csv.
// Second statement: for each row, matches the Movie whose internal id equals
// row.movie_id and sets m.hybrid_pagerank; toInteger/toFloat convert the
// CSV values before they are compared or stored.
// Config: batchSize 10000, parallel false.
CALL apoc.periodic.iterate(
  "
  LOAD CSV WITH HEADERS FROM 'file:///movie_hybrid_pagerank_export.csv' AS row
  RETURN row
  ",
  "
  MATCH (m:Movie)
  WHERE id(m) = toInteger(row.movie_id)
  SET m.hybrid_pagerank = toFloat(row.hybrid_pagerank)
  ",
  {
    batchSize: 10000,
    parallel: false
  }
)


### node2vec

In [1]:
# Cypher that projects Movie nodes with undirected CO_DIRECTED and RELATED_TO
# relationships into a GDS graph named 'node2vecGraph'.
# This cell only evaluates the string (it is echoed as the cell output);
# nothing here sends the statement to Neo4j.
node2vec_projection_cypher = """CALL gds.graph.project(
  'node2vecGraph',
  'Movie',
  {
    CO_DIRECTED: { orientation: 'UNDIRECTED' },
    RELATED_TO:  { orientation: 'UNDIRECTED' }
  }
)"""
node2vec_projection_cypher

"CALL gds.graph.project(\n  'node2vecGraph',\n  'Movie',\n  {\n    CO_DIRECTED: { orientation: 'UNDIRECTED' },\n    RELATED_TO:  { orientation: 'UNDIRECTED' }\n  }\n)"

In [2]:
# Cypher that runs node2vec on 'node2vecGraph' and writes the resulting
# 16-dimensional embedding to the 'node2vec' node property.
# This cell only evaluates the string (it is echoed as the cell output);
# nothing here sends the statement to Neo4j.
node2vec_write_cypher = """CALL gds.beta.node2vec.write('node2vecGraph', {
  embeddingDimension: 16,
  iterations: 5,
  walkLength: 40,
  walksPerNode: 5,
  writeProperty: 'node2vec'
})
YIELD nodeCount;"""
node2vec_write_cypher

"CALL gds.beta.node2vec.write('node2vecGraph', {\n  embeddingDimension: 16,\n  iterations: 5,\n  walkLength: 40,\n  walksPerNode: 5,\n  writeProperty: 'node2vec'\n})\nYIELD nodeCount;"

### community detection

In [None]:
# Fetch the stored node2vec vector of every movie, then discard rows with any
# missing value (movies that have no embedding).
embedding_query = """
MATCH (m:Movie)
RETURN id(m) AS id, m.node2vec AS embedding
"""
df = graph.run(embedding_query).to_data_frame().dropna()

In [None]:
# Turn the 'embedding' column (one list per movie) into a single NumPy array,
# one entry per row of df, for use as the clustering input.
import numpy as np
embedding_lists = df['embedding'].tolist()
X = np.array(embedding_lists)

In [None]:
# KMeans clustering
# KMeans was used here without being imported anywhere in the visible
# notebook, so a fresh kernel failed with NameError. Import it explicitly.
from sklearn.cluster import KMeans

# random_state is fixed so the cluster assignment is reproducible across runs.
kmeans = KMeans(n_clusters=50, random_state=42)  # Adjust k
# One cluster label per row of X, stored next to the movie it belongs to.
df['community'] = kmeans.fit_predict(X)

In [None]:
from tqdm import tqdm

# Write each movie's cluster label back to Neo4j as m.community_node2vec.
# The previous version issued one query per DataFrame row (one network round
# trip per movie). Sending the rows in batches through UNWIND performs the
# same writes with a small number of round trips.
WRITE_BATCH_SIZE = 10000

# Plain Python ints so the values serialise cleanly as query parameters.
community_rows = [
    {"id": int(movie_id), "community": int(community)}
    for movie_id, community in zip(df["id"], df["community"])
]

# The progress bar now counts batches rather than individual rows.
for start in tqdm(range(0, len(community_rows), WRITE_BATCH_SIZE)):
    graph.run(
        """
        UNWIND $rows AS row
        MATCH (m) WHERE id(m) = row.id
        SET m.community_node2vec = row.community
        """,
        rows=community_rows[start:start + WRITE_BATCH_SIZE],
    )

// Count the Movie nodes per m.community_node2vec value, largest group first.
MATCH (m:Movie)

RETURN m.community_node2vec AS community, count(*) AS size

ORDER BY size DESC;

In [None]:
import matplotlib.pyplot as plt
# Bar chart of how many rows of df fall into each community label.
community_sizes = df['community'].value_counts()
ax = community_sizes.plot(kind='bar', figsize=(12,4))
ax.set_title("Movie Count per Community")

In [None]:
# Read back, for every Movie node, the properties used downstream:
# title, hybrid_pagerank, rating, votes and the node2vec community label.
features_query = """
MATCH (m:Movie)
RETURN id(m) AS id,
       m.title AS title,
       m.hybrid_pagerank AS hybrid_pagerank,
       m.rating AS rating,
       m.votes AS votes,
       m.community_node2vec AS community
"""
features_cursor = graph.run(features_query)
df = features_cursor.to_data_frame()

In [None]:
# Keep only rows where every one of these columns has a value.
required_columns = ["hybrid_pagerank", "rating", "votes", "community"]
df = df.dropna(subset=required_columns)