In [82]:
import csv
import numpy as np
from py2neo import Graph
import pandas as pd

## Connect with the graph database

In [83]:
host = "bolt://localhost:7687" # replace this with your Sandbox host
password = "abc123@" # replace this with your Sandbox password
username = "neo4j" #  replace this with your Sandbox username
graph = Graph(host,auth=("neo4j", password))

## Read and Create User Node in database

In [84]:
with open('./data/ratings.csv') as f:
    reader = csv.DictReader(f, delimiter=",")
    userids = range(1, max([int(x['userId']) for x in reader]))

In [85]:
with driver.session() as session:
    numbers = {"nodes": [{'userid': x} for x in userids]}
    create_userid_query = '''
    UNWIND $nodes as node
    CREATE (n:UserID {userid: node.userid})
    '''
    
    result = graph.run(create_userid_query,nodes=numbers)

## Read and Create Movies Node in database

In [86]:
with driver.session() as session:
    with open('./data/movies.csv') as f:
        reader = csv.DictReader(f, delimiter=",")
        movies = {"nodes": [{'movieId': x['movieId'], 'name': x['title']} for x in reader]}
        create_userid_query = '''
        UNWIND $nodes as node
        CREATE (n:MovieId {movieId: node.movieId, name: node.name})
        '''
        result = graph.run(create_userid_query, movies)

## Read and Create Genre Node in database

In [87]:
with driver.session() as session:
    with open('./data/movies.csv') as f:
        reader = csv.DictReader(f, delimiter=",")
        genres = [x['genres'].split('|') for x in reader]
        genres = sum(genres, [])
        genres = list(set(genres))
        print(genres)
        genres = {"nodes": [{'name': x} for x in genres]}
    create_genre_nodes = '''
        UNWIND $nodes as node
        CREATE (n:Genre {name: node.name})
    '''
    result = session.run(create_genre_nodes, genres)

['Musical', 'Thriller', 'Animation', 'Mystery', 'Action', 'Adventure', 'Documentary', 'Crime', 'Children', 'Western', '(no genres listed)', 'Comedy', 'Drama', 'War', 'Horror', 'Film-Noir', 'Romance', 'Sci-Fi', 'IMAX', 'Fantasy']


## Building a relationship between movies and genre nodes

In [88]:
print('Create the movieid-belongsto->genres relationship')
with driver.session() as session:
    with open('./data/movies.csv') as f:
        reader = csv.DictReader(f, delimiter=",")
        for line in reader:
            movieid = line['movieId']
            genres = line['genres'].split('|')
            movies = {"records": [{'movieId': movieid, 'genres': genres}]}
            create_movie_genre_relationship = '''
                UNWIND $records as record
                    MATCH (a:MovieId) where a.movieId=record.movieId
                    MATCH (b:Genre) where b.name in record.genres
                    CREATE (a)-[:BELONGSTO]->(b)
            '''
            result = graph.run(create_movie_genre_relationship, movies)

Create the movieid-belongsto->genres relationship


### Create a edgelist from our knowledge graph for node embeddings

In [89]:
%%bash
git clone git@github.com:snap-stanford/snap.git
cd snap/examples/node2vec
make

Cloning into 'snap'...


make -C ../../snap-core
make[1]: Entering directory '/home/osama/Desktop/MS-DS/Graph-based-ml/snap/snap-core'
g++ -c -std=c++98 -Wall -O3 -DNDEBUG -fopenmp Snap.cpp -I../glib-core


In file included from ../glib-core/base.h:181,
                 from Snap.h:9,
                 from Snap.cpp:4:
../glib-core/dt.h: In static member function ‘static int TCh::GetHex(const char&)’:
 1076 |     else Fail; return 0;}
      |     ^~~~
../glib-core/dt.h:1076:16: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘else’
 1076 |     else Fail; return 0;}
      |                ^~~~~~
../glib-core/dt.h: In static member function ‘static char TCh::GetHexCh(const int&)’:
 1080 |     else Fail; return 0;}
      |     ^~~~
../glib-core/dt.h:1080:16: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘else’
 1080 |     else Fail; return 0;}
      |                ^~~~~~
In file included from ../glib-core/base.h:183,
                 from Snap.h:9,
                 from Snap.cpp:4:
../glib-core/ds.h: In member function ‘bool TTuple<TVal, NVals>::operator==(const TTuple<TVal, NVals>&) const’:
  290 

g++ -std=c++98 -Wall -O3 -DNDEBUG -fopenmp -o testSnap testSnap.cpp Snap.o -I../glib-core  -lrt


In file included from ../glib-core/base.h:181,
                 from Snap.h:9,
                 from testSnap.cpp:3:
../glib-core/dt.h: In static member function ‘static int TCh::GetHex(const char&)’:
 1076 |     else Fail; return 0;}
      |     ^~~~
../glib-core/dt.h:1076:16: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘else’
 1076 |     else Fail; return 0;}
      |                ^~~~~~
../glib-core/dt.h: In static member function ‘static char TCh::GetHexCh(const int&)’:
 1080 |     else Fail; return 0;}
      |     ^~~~
../glib-core/dt.h:1080:16: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘else’
 1080 |     else Fail; return 0;}
      |                ^~~~~~
In file included from ../glib-core/base.h:183,
                 from Snap.h:9,
                 from testSnap.cpp:3:
../glib-core/ds.h: In member function ‘bool TTuple<TVal, NVals>::operator==(const TTuple<TVal, NVals>&) const’

make[1]: Leaving directory '/home/osama/Desktop/MS-DS/Graph-based-ml/snap/snap-core'
g++ -std=c++98 -Wall -O3 -DNDEBUG -fopenmp  -o node2vec node2vec.cpp ../../snap-adv/n2v.cpp ../../snap-adv/word2vec.cpp ../../snap-adv/biasedrandomwalk.cpp ../../snap-core/Snap.o -I../../snap-core -I../../snap-adv -I../../glib-core -I../../snap-exp  -lrt


In file included from ../../glib-core/base.h:181,
                 from ../../snap-core/Snap.h:9,
                 from stdafx.h:5,
                 from node2vec.cpp:1:
../../glib-core/dt.h: In static member function ‘static int TCh::GetHex(const char&)’:
 1076 |     else Fail; return 0;}
      |     ^~~~
../../glib-core/dt.h:1076:16: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘else’
 1076 |     else Fail; return 0;}
      |                ^~~~~~
../../glib-core/dt.h: In static member function ‘static char TCh::GetHexCh(const int&)’:
 1080 |     else Fail; return 0;}
      |     ^~~~
../../glib-core/dt.h:1080:16: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘else’
 1080 |     else Fail; return 0;}
      |                ^~~~~~
In file included from ../../glib-core/base.h:183,
                 from ../../snap-core/Snap.h:9,
                 from stdafx.h:5,
                 from node2vec

cp node2vec ../Release


In [55]:
print('Create the edge list')
with driver.session() as session, open("graph/movies.edgelist", "w") as edges_file:
    result = session.run("""\
    MATCH (m:MovieId)--(other)
    RETURN id(m) AS source, id(other) AS target
    """)

    writer = csv.writer(edges_file, delimiter=" ")

    for row in result:
        writer.writerow([row["source"], row["target"]])

Create the edge list


In [94]:
%%bash
./snap/examples/node2vec/node2vec -i:./graph/movies.edgelist -o:./emb/movies.emb -l:80 -d:100 -p:0.3 -dr -v


An algorithmic framework for representational learning on graphs. [Nov 30 2022]
Input graph path (-i:)=./graph/movies.edgelist
Output graph path (-o:)=./emb/movies.emb
Number of dimensions. Default is 128 (-d:)=100
Length of walk per source. Default is 80 (-l:)=80
Number of walks per source. Default is 10 (-r:)=10
Context size for optimization. Default is 10 (-k:)=10
Number of epochs in SGD. Default is 1 (-e:)=1
Return hyperparameter. Default is 1 (-p:)=0.3
Inout hyperparameter. Default is 1 (-q:)=1
Verbose output. (-v)=YES
Graph is directed. (-dr)=YES
Graph is weighted. (-w)=NO
Output random walks instead of embeddings. (-ow)=NO
Read 22084 lines from ./graph/movies.edgelist
Preprocessing progress: 67.61% 
Walking Progress: 92.19%
Learning Progress: 99.88% 


In [95]:
with open("emb/movies.emb", "r") as movies_file, driver.session() as session:
    next(movies_file)
    reader = csv.reader(movies_file, delimiter=" ")

    params = []
    for row in reader:
        movie_id = row[0]
        params.append({
            "id": int(movie_id),
            "embedding": [float(item) for item in row[1:]]
        })

    session.run("""\
    UNWIND $params AS param
    MATCH (m:MovieId) WHERE id(m) = param.id
    SET m.embedding = param.embedding
    """, {"params": params})

### Build movie simliarity model using node embeddings

In [96]:

movies_genres_query = """\
MATCH (genre:Genre)
WITH genre ORDER BY genre.name
WITH collect(id(genre)) AS genres
MATCH (m:MovieId)-[:BELONGSTO]->(genre)
WITH genres, id(m) AS source, m.embedding AS embedding, collect(id(genre)) AS target
RETURN source, embedding, [g in genres | CASE WHEN g in target THEN 1 ELSE 0 END] AS genres
"""

with driver.session() as session:
    result = session.run(movies_genres_query)
    df = pd.DataFrame([dict(row) for row in result])

In [97]:
from gensim.models import KeyedVectors

In [98]:
filename = 'emb/movies.emb'


In [99]:
model = KeyedVectors.load_word2vec_format(filename, binary=False)


In [102]:
def neo4j_most_similar(model, key):
    with driver.session() as session:
        find_movie_query = "MATCH (m:MovieId {name: '%s'}) return id(m)" % key
        result = session.run(find_movie_query)
        for r in result:
            similar_movies = model.most_similar(str(r.value()))
            for s_movie in similar_movies:
                find_movie_query = "MATCH (m:MovieId) where id(m) = %s return m.name" % s_movie[0]
                similar_movie_names = session.run(find_movie_query)
                for sm in similar_movie_names:
                    print(sm.value(), s_movie[1])

In [103]:
neo4j_most_similar(model, 'Money Train (1995)')

Love Affair (1939) 0.9976007342338562
Florence Foster Jenkins (2016) 0.9974488615989685
Fever Pitch (2005) 0.9974387884140015
Replacement Killers, The (1998) 0.9974029064178467
Clerks (1994) 0.9973879456520081
Step Brothers (2008) 0.9973440766334534
Action Jackson (1988) 0.997317910194397
Before the Devil Knows You're Dead (2007) 0.9972738027572632
Twin Dragons (Shuang long hui) (1992) 0.9971912503242493
Massu Engira Maasilamani (2015) 0.9971174597740173


KeyError: "Key '10392' not present in vocabulary"