In [8]:
import pandas as pd
from neo4j import GraphDatabase

# -------------------------
# Configuration
# -------------------------
import os  # imported here so this notebook cell stays runnable on its own

# Connection settings. Each one can be overridden with an environment variable
# of the same name, so real credentials do not have to be written into the
# notebook. The literals below are only the fallback for a local setup.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")  # standard local Bolt endpoint
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "argentic")  # fallback only; prefer the env var

# CSV file paths (adjust as necessary)
USERS_CSV = "users.csv"
FLIGHTS_CSV = "adjusted_flights.csv"
CITIES_CSV = "adjusted_cities.csv"
HOTELS_CSV = "adjusted_hotels.csv"
RESTAURANTS_CSV = "adjusted_restaurants.csv"
PREFERENCES_CSV = "preferences.csv"

# -------------------------
# Neo4j Driver
# -------------------------
# Single module-level driver shared by every function in this cell; it is
# closed by the script entry point at the bottom of the cell.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# -------------------------
# Functions to create nodes
# -------------------------
def create_user_node(tx, user_props):
    """Upsert a single ``User`` node.

    The node is looked up (or created) by its ``user_id`` and then every
    entry of ``user_props`` is copied onto it as a property.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        user_props: Mapping of property names to values; must contain
            ``'user_id'``.

    Raises:
        KeyError: If ``user_props`` has no ``'user_id'`` entry.
    """
    params = {"user_id": user_props["user_id"], "props": user_props}
    cypher = """
    MERGE (u:User {user_id: $user_id})
    SET u += $props
    """
    tx.run(cypher, **params)

def create_flight_node(tx, flight_props):
    """Upsert a single ``Flight`` node.

    The node is looked up (or created) by its ``flight_id`` and then every
    entry of ``flight_props`` is copied onto it as a property.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        flight_props: Mapping of property names to values; must contain
            ``'flight_id'``.

    Raises:
        KeyError: If ``flight_props`` has no ``'flight_id'`` entry.
    """
    params = {"flight_id": flight_props["flight_id"], "props": flight_props}
    cypher = """
    MERGE (f:Flight {flight_id: $flight_id})
    SET f += $props
    """
    tx.run(cypher, **params)

def create_city_node(tx, city_props):
    """Upsert a single ``City`` node.

    The node is looked up (or created) by its ``city_id`` and then every
    entry of ``city_props`` is copied onto it as a property.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        city_props: Mapping of property names to values; must contain
            ``'city_id'``.

    Raises:
        KeyError: If ``city_props`` has no ``'city_id'`` entry.
    """
    params = {"city_id": city_props["city_id"], "props": city_props}
    cypher = """
    MERGE (c:City {city_id: $city_id})
    SET c += $props
    """
    tx.run(cypher, **params)

def create_hotel_node(tx, hotel_props):
    """Upsert a single ``Hotel`` node.

    The node is looked up (or created) by its ``hotel_id`` and then every
    entry of ``hotel_props`` is copied onto it as a property.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        hotel_props: Mapping of property names to values; must contain
            ``'hotel_id'``.

    Raises:
        KeyError: If ``hotel_props`` has no ``'hotel_id'`` entry.
    """
    params = {"hotel_id": hotel_props["hotel_id"], "props": hotel_props}
    cypher = """
    MERGE (h:Hotel {hotel_id: $hotel_id})
    SET h += $props
    """
    tx.run(cypher, **params)

def create_restaurant_node(tx, restaurant_props):
    """Upsert a single ``Restaurant`` node.

    The node is looked up (or created) by its ``restaurant_id`` and then
    every entry of ``restaurant_props`` is copied onto it as a property.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        restaurant_props: Mapping of property names to values; must contain
            ``'restaurant_id'``.

    Raises:
        KeyError: If ``restaurant_props`` has no ``'restaurant_id'`` entry.
    """
    params = {
        "restaurant_id": restaurant_props["restaurant_id"],
        "props": restaurant_props,
    }
    cypher = """
    MERGE (r:Restaurant {restaurant_id: $restaurant_id})
    SET r += $props
    """
    tx.run(cypher, **params)

def create_preference_node(tx, pref_props):
    """Upsert a single ``Preference`` node.

    The node is looked up (or created) by its ``preference_id`` and then
    every entry of ``pref_props`` is copied onto it as a property.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        pref_props: Mapping of property names to values; must contain
            ``'preference_id'``.

    Raises:
        KeyError: If ``pref_props`` has no ``'preference_id'`` entry.
    """
    params = {"preference_id": pref_props["preference_id"], "props": pref_props}
    cypher = """
    MERGE (p:Preference {preference_id: $preference_id})
    SET p += $props
    """
    tx.run(cypher, **params)

# -------------------------
# Function to create relationships
# -------------------------
def create_relationship(tx, label_from, key_from, value_from,
                        rel_type, label_to, key_to, value_to):
    """Link two existing nodes with a relationship of type ``rel_type``.

    Both endpoints are located with ``MATCH`` (they are not created here),
    then the relationship ``(a)-[:rel_type]->(b)`` is merged between them.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        label_from: Label of the start node.
        key_from: Property name used to look up the start node.
        value_from: Value of ``key_from`` identifying the start node.
        rel_type: Relationship type.
        label_to: Label of the end node.
        key_to: Property name used to look up the end node.
        value_to: Value of ``key_to`` identifying the end node.

    Note:
        Labels, property names and the relationship type are interpolated
        into the query text (only the two lookup values are sent as query
        parameters), so they must come from trusted code, not user input.
    """
    start_pattern = f"(a:{label_from} {{{key_from}: $value_from}})"
    end_pattern = f"(b:{label_to} {{{key_to}: $value_to}})"
    cypher = (
        f"\n    MATCH {start_pattern}"
        f"\n    MATCH {end_pattern}"
        f"\n    MERGE (a)-[r:{rel_type}]->(b)"
        "\n    "
    )
    lookup_values = {"value_from": value_from, "value_to": value_to}
    tx.run(cypher, **lookup_values)

# -------------------------
# Main function to build the graph
# -------------------------
def build_graph():
    """Load the CSV exports and write their nodes and relationships to Neo4j.

    Steps, in order:
      1. Read the six CSV files named by the ``*_CSV`` constants.
      2. Create one node per CSV row (User, Flight, City, Hotel, Restaurant,
         Preference), passing the whole row as the node's properties.
      3. Create relationships from pairs of id columns found in the rows.
         A row is skipped when either id is missing/null.

    Every node and every relationship is written through its own
    ``session.execute_write`` call on the module-level ``driver``.
    """
    # Read everything up front, in the same order the constants are declared.
    users_df, flights_df, cities_df, hotels_df, restaurants_df, prefs_df = [
        pd.read_csv(csv_path)
        for csv_path in (
            USERS_CSV, FLIGHTS_CSV, CITIES_CSV,
            HOTELS_CSV, RESTAURANTS_CSV, PREFERENCES_CSV,
        )
    ]

    # (data frame, node-writer) pairs, processed top to bottom.
    node_jobs = (
        (users_df, create_user_node),
        (flights_df, create_flight_node),
        (cities_df, create_city_node),
        (hotels_df, create_hotel_node),
        (restaurants_df, create_restaurant_node),
        (prefs_df, create_preference_node),
    )

    # Each entry: (data frame,
    #              start label, start key, column holding the start id,
    #              relationship type,
    #              end label, end key, column holding the end id)
    edge_jobs = [
        # Users BOOKED_FLIGHT Flights
        (flights_df, "User", "user_id", "user_id",
         "BOOKED_FLIGHT", "Flight", "flight_id", "flight_id"),
        # Flights HAS_FLIGHT Cities (city_id is the destination city)
        (flights_df, "Flight", "flight_id", "flight_id",
         "HAS_FLIGHT", "City", "city_id", "city_id"),
        # Users VISITED Cities
        (flights_df, "User", "user_id", "user_id",
         "VISITED", "City", "city_id", "city_id"),
        # Users STAYED_AT Hotels
        (hotels_df, "User", "user_id", "user_id",
         "STAYED_AT", "Hotel", "hotel_id", "hotel_id"),
        # Users DINED_AT Restaurants
        (restaurants_df, "User", "user_id", "user_id",
         "DINED_AT", "Restaurant", "restaurant_id", "restaurant_id"),
    ]
    # Users WANTS_TO_VISIT Cities -- only when the users CSV carries the column.
    if "wants_to_visit_city_id" in users_df.columns:
        edge_jobs.append(
            (users_df, "User", "user_id", "user_id",
             "WANTS_TO_VISIT", "City", "city_id", "wants_to_visit_city_id")
        )
    # Users HAS_PREFERENCE Preferences
    edge_jobs.append(
        (prefs_df, "User", "user_id", "user_id",
         "HAS_PREFERENCE", "Preference", "preference_id", "preference_id")
    )

    with driver.session() as session:
        # -------------------------
        # Create Nodes
        # -------------------------
        for frame, write_node in node_jobs:
            for _, record in frame.iterrows():
                session.execute_write(write_node, record.to_dict())

        # -------------------------
        # Create Relationships
        # -------------------------
        for (frame, start_label, start_key, start_col,
             rel_type, end_label, end_key, end_col) in edge_jobs:
            for _, record in frame.iterrows():
                start_id = record.get(start_col)
                end_id = record.get(end_col)
                # A missing column yields None, a blank cell yields NaN;
                # either way there is nothing to link.
                if pd.isnull(start_id) or pd.isnull(end_id):
                    continue
                session.execute_write(create_relationship,
                                      start_label, start_key, start_id,
                                      rel_type,
                                      end_label, end_key, end_id)

if __name__ == "__main__":
    # Always release the driver's connections, even when the build fails
    # part-way (missing CSV, database unreachable, bad row, ...).
    try:
        build_graph()
    finally:
        driver.close()
    # Only reached when build_graph() finished without raising.
    print("Graph database build complete!")



Graph database build complete!


In [30]:
import pandas as pd
from neo4j import GraphDatabase
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# -------------------------
# Neo4j Connection Setup
# -------------------------
import os  # imported here so this notebook cell stays runnable on its own

# Each setting can be overridden with an environment variable of the same
# name, so real credentials do not have to be written into the notebook.
# The literals are only the fallback for a local setup.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "argentic")  # fallback only; prefer the env var

# Single module-level driver shared by every function in this cell; it is
# closed by the script entry point at the bottom of the cell.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# -------------------------
# CSV File Paths (adjust if needed)
# -------------------------
USERS_CSV = "users.csv"
FLIGHTS_CSV = "adjusted_flights.csv"
CITIES_CSV = "adjusted_cities.csv"
HOTELS_CSV = "adjusted_hotels.csv"
RESTAURANTS_CSV = "adjusted_restaurants.csv"
PREFERENCES_CSV = "preferences.csv"

# -------------------------
# Load GPT-2 Models for Query and Answer Generation
# -------------------------
# Checkpoint used to turn a natural-language request into a Cypher query.
query_model_name = "gpt2"
query_tokenizer = GPT2Tokenizer.from_pretrained(query_model_name)
query_model = GPT2LMHeadModel.from_pretrained(query_model_name)

# Checkpoint used to phrase the final answer from the retrieved context.
answer_model_name = "gpt2"
if answer_model_name == query_model_name:
    # Same checkpoint for both roles: reuse the objects already in memory
    # instead of loading a second identical copy of the weights.
    answer_tokenizer = query_tokenizer
    answer_model = query_model
else:
    # Different checkpoints (e.g. after fine-tuning one of them): load separately.
    answer_tokenizer = GPT2Tokenizer.from_pretrained(answer_model_name)
    answer_model = GPT2LMHeadModel.from_pretrained(answer_model_name)

# -------------------------
# Custom Query Generator using GPT-2 with Fallback
# -------------------------
def generate_cypher_query(user_input, max_length=100):
    """
    Generates a Cypher query from the user input using GPT-2.
    For production use, fine-tune this model on query examples.

    Args:
        user_input: Natural-language request to translate.
        max_length: Upper bound on the total sequence length (prompt tokens
            plus generated tokens). If the prompt alone already reaches this
            bound, ``max_length`` further tokens are allowed on top of the
            prompt so that generation can still produce output.

    Returns:
        The generated query text when it looks like a read-only Cypher query,
        otherwise the fallback ``"MATCH (u:User) RETURN u LIMIT 5"``.

    The text produced by the model is untrusted and is meant to be executed
    for retrieval only, so a query is accepted only if it starts with
    ``MATCH`` / ``OPTIONAL MATCH`` and contains no write clause. This is a
    best-effort textual filter, not a Cypher parser.
    """
    import re  # function-scope import keeps this notebook cell self-contained

    prompt = f"Generate a Cypher query for the following request:\nRequest: {user_input}\nCypher Query:"
    input_ids = query_tokenizer.encode(prompt, return_tensors="pt")
    # Single un-padded sequence, so every position is attended to.
    attention_mask = torch.ones_like(input_ids)

    # `max_length` includes the prompt. A long request would otherwise leave
    # no room for any generated token.
    prompt_len = input_ids.shape[1]
    total_len = max_length if prompt_len < max_length else prompt_len + max_length

    # `early_stopping` is omitted on purpose: it only affects beam search,
    # and this call uses the default (non-beam) decoding.
    output = query_model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=total_len,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=query_tokenizer.eos_token_id
    )
    generated_text = query_tokenizer.decode(output[0], skip_special_tokens=True)

    # The decoded text echoes the prompt; keep only what follows the marker.
    if "Cypher Query:" in generated_text:
        cypher_query = generated_text.split("Cypher Query:")[-1].strip()
    else:
        cypher_query = generated_text.strip()

    # Accept read queries only. CREATE / MERGE are no longer valid openers and
    # write clauses are rejected anywhere in the text (so "MATCH ... DELETE"
    # does not slip through). False positives just trigger the fallback.
    read_prefixes = ("MATCH", "OPTIONAL MATCH")
    write_clause = re.compile(
        r"\b(CREATE|MERGE|DELETE|SET|REMOVE|DROP|FOREACH|LOAD\s+CSV)\b",
        re.IGNORECASE,
    )
    if not cypher_query.upper().startswith(read_prefixes):
        rejection = "Warning: Generated query doesn't start with a valid Cypher keyword."
    elif write_clause.search(cypher_query):
        rejection = "Warning: Generated query contains a write clause; only read queries are allowed."
    else:
        rejection = None

    if rejection is not None:
        print(rejection)
        print("Generated Query was:", cypher_query)
        # Fallback: use a default query (e.g., query the first 5 User nodes)
        cypher_query = "MATCH (u:User) RETURN u LIMIT 5"
    return cypher_query

# -------------------------
# Custom Answer Generator using GPT-2 with max_new_tokens
# -------------------------
def generate_final_answer(context, user_input, max_new_tokens=100):
    """
    Generates a final answer based on retrieved context and the original user input.

    Args:
        context: Text retrieved from the database, inserted into the prompt.
        user_input: The user's original request.
        max_new_tokens: Number of tokens to generate in addition to the prompt.

    Returns:
        The text that follows the last "Final Answer:" marker in the decoded
        output (or the whole decoded output if the marker is absent).

    The model has a fixed number of positions (``config.n_positions``). When
    prompt + ``max_new_tokens`` would not fit, the retrieved context is
    truncated so that the request and the "Final Answer:" cue are preserved;
    prompts that already fit are passed through unchanged.
    """
    prompt = f"User Request: {user_input}\nRetrieved Context:\n{context}\nFinal Answer:"
    input_ids = answer_tokenizer.encode(prompt, return_tensors="pt")

    # Tokens available for the prompt once room for the answer is reserved.
    window = answer_model.config.n_positions
    prompt_budget = max(window - max_new_tokens, 1)
    if input_ids.shape[1] > prompt_budget:
        # Too long: rebuild the prompt from its parts, cutting only the context.
        head_ids = answer_tokenizer.encode(f"User Request: {user_input}\nRetrieved Context:\n")
        tail_ids = answer_tokenizer.encode("\nFinal Answer:")
        context_room = max(prompt_budget - len(head_ids) - len(tail_ids), 0)
        context_ids = answer_tokenizer.encode(str(context))[:context_room]
        # If the request alone is oversized, keep the end of the prompt so the
        # "Final Answer:" cue always survives.
        trimmed_ids = (head_ids + context_ids + tail_ids)[-prompt_budget:]
        input_ids = torch.tensor([trimmed_ids], dtype=input_ids.dtype)

    # Single un-padded sequence, so every position is attended to.
    attention_mask = torch.ones_like(input_ids)
    # `early_stopping` is omitted on purpose: it only affects beam search,
    # and this call uses the default (non-beam) decoding.
    output = answer_model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=answer_tokenizer.eos_token_id
    )
    generated_text = answer_tokenizer.decode(output[0], skip_special_tokens=True)
    # The decoded text echoes the prompt; keep only what follows the marker.
    if "Final Answer:" in generated_text:
        final_answer = generated_text.split("Final Answer:")[-1].strip()
    else:
        final_answer = generated_text.strip()
    return final_answer

# -------------------------
# Neo4j Data Retrieval Function
# -------------------------
def retrieve_context_from_neo4j(cypher_query):
    """
    Executes the provided Cypher query and returns results as a string.

    The query is run inside a managed *read* transaction
    (``session.execute_read``) rather than an auto-commit ``session.run``.
    The query text can come from a language model, and this function is only
    meant to retrieve data, so a statement that tries to write should fail
    instead of modifying the graph. This is a safety net, not a substitute
    for database-level access control.

    Args:
        cypher_query: Cypher text to execute.

    Returns:
        One line per result record, each line being ``str(dict(record))``;
        an empty string when the query returns no records.
    """
    def _collect_records(tx):
        # Results must be consumed before the transaction function returns.
        return [dict(record) for record in tx.run(cypher_query)]

    with driver.session() as session:
        records = session.execute_read(_collect_records)
    return "\n".join(str(r) for r in records)

# -------------------------
# RAG Pipeline Functions
# -------------------------
def rag_pipeline_custom(user_input):
    """
    Runs the retrieval-augmented generation flow end to end.

    Stages:
      1. ``generate_cypher_query`` turns ``user_input`` into a Cypher query.
      2. ``retrieve_context_from_neo4j`` runs that query and returns the
         results as text.
      3. ``generate_final_answer`` writes an answer from that text and the
         original request.

    The generated query and the retrieved context are printed along the way;
    the final answer is returned, not printed.
    """
    # Stage 1: natural language -> Cypher.
    query_text = generate_cypher_query(user_input)
    print("Generated Cypher Query:", query_text, sep="\n")

    # Stage 2: Cypher -> textual context.
    retrieved = retrieve_context_from_neo4j(query_text)
    print("\nRetrieved Context:", retrieved, sep="\n")

    # Stage 3: context + request -> answer.
    return generate_final_answer(retrieved, user_input)

# -------------------------
# (Optional) Build the Neo4j Graph from CSVs
# -------------------------
def build_graph():
    """
    Reads the CSV exports and writes nodes plus the User-BOOKED_FLIGHT->Flight
    relationships to Neo4j. Intended as an optional, one-off setup step for
    the pipeline in this cell.

    Every CSV row becomes one node whose properties are the row's columns;
    each node and each relationship is written through its own
    ``session.execute_write`` call on the module-level ``driver``.
    """
    # (label, id column, CSV path) for each node type, in creation order.
    node_sources = (
        ("User", "user_id", USERS_CSV),
        ("Flight", "flight_id", FLIGHTS_CSV),
        ("City", "city_id", CITIES_CSV),
        ("Hotel", "hotel_id", HOTELS_CSV),
        ("Restaurant", "restaurant_id", RESTAURANTS_CSV),
        ("Preference", "preference_id", PREFERENCES_CSV),
    )
    # Load all files before touching the database.
    frames = {label: pd.read_csv(csv_path) for label, _key, csv_path in node_sources}

    def _merge_node(tx, label, key, props):
        # Find-or-create the node by its id, then copy all properties onto it.
        cypher = f"MERGE (n:{label} {{{key}: $value}}) SET n += $props"
        tx.run(cypher, value=props[key], props=props)

    def _merge_edge(tx, label_from, key_from, value_from, rel_type, label_to, key_to, value_to):
        # Both endpoints must already exist; only the relationship is merged.
        cypher = (
            f"\n        MATCH (a:{label_from} {{{key_from}: $value_from}})"
            f"\n        MATCH (b:{label_to} {{{key_to}: $value_to}})"
            f"\n        MERGE (a)-[r:{rel_type}]->(b)"
            "\n        "
        )
        tx.run(cypher, value_from=value_from, value_to=value_to)

    with driver.session() as session:
        # Nodes: User, Flight, City, Hotel, Restaurant, Preference.
        for label, key, _csv_path in node_sources:
            for _, record in frames[label].iterrows():
                session.execute_write(_merge_node, label, key, record.to_dict())

        # Relationships.
        # Example: Users BOOKED_FLIGHT Flights.
        for _, record in frames["Flight"].iterrows():
            booker_id = record.get('user_id')
            booked_flight_id = record.get('flight_id')
            # Skip rows where either id is absent or null.
            if pd.isnull(booker_id) or pd.isnull(booked_flight_id):
                continue
            session.execute_write(_merge_edge,
                                  "User", "user_id", booker_id,
                                  "BOOKED_FLIGHT",
                                  "Flight", "flight_id", booked_flight_id)
        # Additional relationship creation logic can be added similarly.
    print("Graph database build complete!")

# Uncomment the following line if you need to build your graph first.
# build_graph()

# -------------------------
# Example Usage of the RAG Pipeline in a Notebook
# -------------------------
if __name__ == "__main__":
    # Example user input – change this as needed.
    user_input = "I need to know which users booked flights to Paris."

    # Always release the driver's connections, even when query generation,
    # retrieval or answer generation raises.
    try:
        final_answer = rag_pipeline_custom(user_input)
        print("\nFinal Answer:")
        print(final_answer)
    finally:
        # Close the Neo4j driver when done.
        driver.close()



Generated Query was: http://www.cyphers.com/
Response: https://ciphers-api.github.io/api/cips/request/Cipher.php
The following code will generate a new request for each user who booked a flight to the Paris airport. The request will be sent to a Ciphered API
Generated Cypher Query:
MATCH (u:User) RETURN u LIMIT 5

Retrieved Context:
{'u': <Node element_id='4:7bf22587-387b-44c4-b2c0-f3adc4ff830e:0' labels=frozenset({'User'}) properties={'visited_city_id': '6fc4dfac-52d7-483b-adf9-1ee3327dbcca', 'user_id': 'd0862afb-d99f-42c3-a893-ed8d985326cd', 'name': 'Denise Douglas', 'wants_to_visit_city_id': '9f5732ae-a3c9-49dc-8c3b-2148e343d3ce'}>}
{'u': <Node element_id='4:7bf22587-387b-44c4-b2c0-f3adc4ff830e:1' labels=frozenset({'User'}) properties={'visited_city_id': '2052b601-6234-4e7e-8518-e7e97989e85c', 'user_id': '50aab532-4f3a-4920-8b64-434dc7e3288d', 'name': 'Sara Hogan', 'wants_to_visit_city_id': '94a2d563-8a72-4277-bd18-a88e92fe3bc9'}>}
{'u': <Node element_id='4:7bf22587-387b-44c4-b2c0-f