In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel
%pip install neo4j langchain openai pandas numpy

In [None]:
import os
from neo4j import GraphDatabase
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.embeddings.openai import OpenAIEmbeddings
import json
import time

# Set up API and connection info.
# Values already present in the environment win, so real secrets never have to
# be written into the notebook; the literals below are only fallbacks.
os.environ.setdefault('OPENAI_API_KEY', "YOUR_OPENAI_API_KEY")  # do not clobber an exported key

url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")  # Neo4j URI (default: local instance)
username = os.environ.get("NEO4J_USERNAME", "neo4j")  # Neo4j username
password = os.environ.get("NEO4J_PASSWORD", "password")  # Neo4j password
driver = GraphDatabase.driver(url, auth=(username, password))

In [None]:
# Function to Create Knowledge Graph in Neo4j
# SKIP this cell if importing the graph file; a separate code block for that will be added later.
# Takes under 30 hours on a free AuraDB instance.
def create_knowledge_graph(uri, user, password, json_file, batch_size=10):
    """Load the user/rating JSON file into Neo4j as a User-[RATED]->Movie graph.

    The JSON file must hold one object keyed by user id. Each value is read for
    ``age``, ``gender``, ``occupation``, ``zip_code`` (required), ``prompt``
    (optional, defaults to ``''``) and ``ratings``: a list of objects with
    ``movie_id``, ``title``, ``genres``, ``rating`` and ``timestamp``.

    Users are written in batches of ``batch_size``; each batch runs inside one
    managed write transaction and its duration is printed.

    Args:
        uri: Neo4j connection URI.
        user: Neo4j username.
        password: Neo4j password.
        json_file: Path of the JSON file to import.
        batch_size: Number of users written per transaction (must be >= 1).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a user or rating entry lacks one of the required fields.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    def create_graph(tx, user_data):
        # Create user nodes
        for user_id, user_info in user_data.items():
            prompt = user_info.get('prompt', '')
            tx.run("""
                MERGE (u:User {user_id: $user_id})
                SET u += {age: $age, gender: $gender, occupation: $occupation, zip_code: $zip_code, prompt: $prompt}
            """, user_id=user_id, age=user_info['age'], gender=user_info['gender'], occupation=user_info['occupation'], zip_code=user_info['zip_code'], prompt=prompt)

            # Create movie nodes and relationships
            for rating in user_info['ratings']:
                tx.run("""
                    MERGE (m:Movie {movie_id: $movie_id})
                    SET m += {title: $title, genres: $genres}
                """, movie_id=rating['movie_id'], title=rating['title'], genres=rating['genres'])

                tx.run("""
                    MATCH (u:User {user_id: $user_id}), (m:Movie {movie_id: $movie_id})
                    MERGE (u)-[r:RATED {rating: $rating, timestamp: $timestamp}]->(m)
                """, user_id=user_id, movie_id=rating['movie_id'], rating=rating['rating'], timestamp=rating['timestamp'])

    # Parse the file before connecting so a bad path or malformed JSON fails
    # without ever opening a database connection. JSON text is UTF-8, so do not
    # depend on the platform's default encoding.
    with open(json_file, 'r', encoding='utf-8') as file:
        user_data = json.load(file)

    user_ids = list(user_data.keys())
    total_users = len(user_ids)
    total_batches = (total_users + batch_size - 1) // batch_size

    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            # Newer drivers expose execute_write and deprecate write_transaction;
            # prefer the new name and fall back for older drivers.
            run_write = getattr(session, "execute_write", None) or session.write_transaction

            # Process in batches
            for batch_number, offset in enumerate(range(0, total_users, batch_size), start=1):
                batch_user_ids = user_ids[offset:offset + batch_size]
                batch_data = {user_id: user_data[user_id] for user_id in batch_user_ids}

                start_time = time.time()
                run_write(create_graph, batch_data)
                end_time = time.time()

                print(f"Processed batch {batch_number}/{total_batches} in {end_time - start_time:.2f} seconds")
    finally:
        # Release the connection pool even when a batch fails part-way through.
        driver.close()

In [None]:
# Build the knowledge graph from the restructured user data.
# Point json_file at your own copy of the JSON file if it lives elsewhere.
json_file = "restructured_user_data.json"

create_knowledge_graph(uri=url, user=username, password=password, json_file=json_file)

In [None]:
# Function to Create Embeddings for User and Movie Nodes
def create_embeddings_for_graph():
    """Create vector indexes over the existing User and Movie nodes.

    Calls ``Neo4jVector.from_existing_graph`` once per label, using the
    notebook-level ``url``, ``username`` and ``password``:

    * index ``users`` on ``User`` nodes, text built from ``age``, ``gender``,
      ``occupation`` and ``zip_code``;
    * index ``movies`` on ``Movie`` nodes, text built from ``title`` and
      ``genres``.

    Both store their vectors in the ``embedding`` node property.

    Returns:
        tuple: ``(user_vector_index, movie_vector_index)``. The same two
        objects are also bound to those names at notebook scope.
    """
    # Later cells read `user_vector_index` at notebook scope; publishing the
    # indexes here keeps them working after a kernel restart instead of
    # failing with NameError.
    global user_vector_index, movie_vector_index

    embeddings = OpenAIEmbeddings()
    connection = dict(url=url, username=username, password=password)

    # Create embeddings for user nodes
    user_vector_index = Neo4jVector.from_existing_graph(
        embeddings,
        index_name='users',
        node_label="User",
        text_node_properties=['age', 'gender', 'occupation', 'zip_code'],  # Including relevant user properties
        embedding_node_property='embedding',
        **connection,
    )

    # Create embeddings for movie nodes
    movie_vector_index = Neo4jVector.from_existing_graph(
        embeddings,
        index_name='movies',
        node_label="Movie",
        text_node_properties=['title', 'genres'],  # Combining title and genres for embeddings
        embedding_node_property='embedding',
        **connection,
    )

    print("Embeddings for user and movie nodes have been created and stored in Neo4j.")
    return user_vector_index, movie_vector_index

In [None]:
# Generate and store the embeddings for the User and Movie nodes.
# The trailing ';' keeps the cell output limited to the function's own message.
create_embeddings_for_graph();


In [None]:
#Set Up RetrievalQA and Run a Query

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Build a RetrievalQA chain ("stuff" chain type) whose retriever is backed by
# the user vector index
chat_model = ChatOpenAI()
user_retriever = user_vector_index.as_retriever()
vector_qa = RetrievalQA.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=user_retriever,
)

# Ask one question and show the answer
question = "How will recommendation service be updated?"
result = vector_qa.run(question)
print(result)

In [None]:
# Function to retrieve users who reviewed the movie
def get_users_who_reviewed_movie(movie_id, k):
    """Return at most ``k`` records of users who rated the movie ``movie_id``.

    Each record carries the user's ``user_id``, ``age``, ``gender``,
    ``occupation`` and ``zip_code`` plus the ``rating`` and ``timestamp`` of
    the RATED relationship. The query requests no ordering. Uses the
    notebook-level ``driver``.
    """
    cypher = """
            MATCH (u:User)-[r:RATED]->(m:Movie {movie_id: $movie_id})
            RETURN u.user_id AS user_id, u.age AS age, u.gender AS gender, u.occupation AS occupation, u.zip_code AS zip_code, r.rating AS rating, r.timestamp AS timestamp
            LIMIT $k
        """
    params = {"movie_id": movie_id, "k": k}
    with driver.session() as session:
        # Materialise inside the session; the result is consumed before it closes.
        return list(session.run(cypher, **params))

# Function to retrieve users who reviewed the movie and have similar user info
def get_similar_users_by_info(target_user_id, movie_id, k):
    """Return at most ``k`` raters of ``movie_id`` whose profile equals the target's.

    The target user must have rated the movie as well. A user matches when
    ``age``, ``gender``, ``occupation`` and ``zip_code`` are all equal to the
    target's values; the target is excluded. Each record carries the matching
    user's profile fields plus the ``rating`` and ``timestamp`` of their
    rating. Uses the notebook-level ``driver``.
    """
    cypher = """
            MATCH (target:User {user_id: $target_user_id})-[:RATED]->(m:Movie {movie_id: $movie_id})
            MATCH (u:User)-[r:RATED]->(m)
            WHERE u.age = target.age AND u.gender = target.gender AND u.occupation = target.occupation AND u.zip_code = target.zip_code AND u <> target
            RETURN u.user_id AS user_id, u.age AS age, u.gender AS gender, u.occupation AS occupation, u.zip_code AS zip_code, r.rating AS rating, r.timestamp AS timestamp
            LIMIT $k
        """
    params = {"target_user_id": target_user_id, "movie_id": movie_id, "k": k}
    with driver.session() as session:
        # Materialise inside the session; the result is consumed before it closes.
        return list(session.run(cypher, **params))

# Function to retrieve users who reviewed the movie and have similar ratings in common movies
def get_similar_users_by_ratings(target_user_id, movie_id, k):
    """Return the ``k`` raters of ``movie_id`` whose ratings are closest to the target's.

    Candidates are users other than the target who rated ``movie_id``; the
    target must have rated it too. For every candidate the query walks all
    movies rated by both the candidate and the target and computes:

    * ``avg_rating_diff``: mean absolute difference between the two ratings;
    * ``num_common_movies``: number of distinct movies both have rated.

    Rows are ordered by ``avg_rating_diff`` ascending, then
    ``num_common_movies`` descending, and limited to ``k``. Uses the
    notebook-level ``driver``.

    Args:
        target_user_id: ``user_id`` of the user to compare against.
        movie_id: ``movie_id`` that every returned user must have rated.
        k: Maximum number of records to return.

    Returns:
        list: Records with ``user_id``, ``age``, ``gender``, ``occupation``,
        ``zip_code``, ``avg_rating_diff`` and ``num_common_movies``.
    """
    # The candidate set is fixed first (WITH DISTINCT), then the comparison
    # runs over every movie the pair shares. Collecting the movie while also
    # grouping by it would always yield a one-element list, which restricts
    # the comparison to the requested movie alone.
    with driver.session() as session:
        result = session.run("""
            MATCH (target:User {user_id: $target_user_id})-[:RATED]->(m:Movie {movie_id: $movie_id})<-[:RATED]-(u:User)
            WHERE u <> target
            WITH DISTINCT u, target
            MATCH (u)-[r1:RATED]->(common_movie:Movie)<-[r2:RATED]-(target)
            WITH u, avg(abs(r1.rating - r2.rating)) AS avg_rating_diff, count(DISTINCT common_movie) AS num_common_movies
            RETURN u.user_id AS user_id, u.age AS age, u.gender AS gender, u.occupation AS occupation, u.zip_code AS zip_code, avg_rating_diff, num_common_movies
            ORDER BY avg_rating_diff ASC, num_common_movies DESC
            LIMIT $k
        """, target_user_id=target_user_id, movie_id=movie_id, k=k)
        return list(result)

# Example usage
movie_id = 1  # NOTE(review): must have the same type as the stored Movie.movie_id -- confirm against the JSON
# create_knowledge_graph() stores user_id from the JSON object keys, and JSON
# keys are always strings, so an integer id would never match a User node.
target_user_id = "1"
k = 10

# Retrieve users based on different criteria
users_by_review = get_users_who_reviewed_movie(movie_id, k)
users_by_info = get_similar_users_by_info(target_user_id, movie_id, k)
users_by_ratings = get_similar_users_by_ratings(target_user_id, movie_id, k)

# Initialize the RetrievalQA chain
vector_qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    chain_type="stuff",
    retriever=user_vector_index.as_retriever()
)

# Construct the prompt for the QA system
def construct_prompt(title, genres):
    """Build the rating-prediction question for one movie.

    Args:
        title: Movie title inserted after ``Title:``.
        genres: Either an iterable of genre names, which are joined with
            ``", "``, or a single ready-made string, which is used verbatim.

    Returns:
        str: The question text ending in ``Genre: <genres>``.
    """
    if isinstance(genres, str):
        # A str is itself iterable; joining it would put ", " between every character.
        genre_text = genres
    else:
        genre_text = ', '.join(str(genre) for genre in genres)
    return f"What rating will the user give the movie from 1 - 5 based on the movie title and categories? Title: {title} Genre: {genre_text}"

# Ask the QA chain to predict a rating for a sample movie
title, genres = "Some Movie Title", ["Action", "Drama"]

prompt = construct_prompt(title, genres)
result = vector_qa.run(prompt)
print(result)