In [2]:
from neo4j import GraphDatabase
import pandas as pd
import logging
from typing import List, Dict
from pathlib import Path

# Configure the root logger at INFO level so progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the migrator class and main() below.
logger = logging.getLogger(__name__)

In [5]:
class Neo4jMigrator:
    """Load CSV exports into Neo4j as nodes and relationships.

    The instance owns one driver. Call :meth:`close` when finished, or use
    the instance as a context manager (``with Neo4jMigrator(...) as m:``).
    """

    def __init__(self, uri: str, user: str, password: str):
        """Create the driver for ``uri`` using basic ``(user, password)`` auth."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        """Return the migrator itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    # def clear_database(self):
    #     with self.driver.session() as session:
    #         session.run("MATCH (n) DETACH DELETE n")
    #         logger.info("Database cleared")

    def create_constraints(self):
        """Create the uniqueness constraints on the id property of each node type.

        Failures are logged as warnings and do not stop the remaining
        constraints from being attempted.
        """
        constraints = [
            "CREATE CONSTRAINT activity_id IF NOT EXISTS FOR (a:Activity) REQUIRE a.activityId IS UNIQUE",
            "CREATE CONSTRAINT asset_id IF NOT EXISTS FOR (a:Asset) REQUIRE a.assetId IS UNIQUE",
            "CREATE CONSTRAINT location_id IF NOT EXISTS FOR (l:Location) REQUIRE l.locationId IS UNIQUE",
            "CREATE CONSTRAINT request_id IF NOT EXISTS FOR (s:ServiceRequest) REQUIRE s.requestId IS UNIQUE",
        ]

        with self.driver.session() as session:
            for constraint in constraints:
                try:
                    # consume() forces any server-side error to be raised here,
                    # inside the try, rather than on a later statement.
                    session.run(constraint).consume()
                    logger.info(f"Created constraint: {constraint.split('FOR')[1].split('REQUIRE')[0].strip()}")
                except Exception as e:
                    logger.warning(f"Constraint may already exist: {e}")

    def create_indexes(self):
        """Create indexes for better query performance.

        Failures are logged as warnings and do not stop the remaining
        indexes from being attempted.
        """
        # assetId, requestId, locationId and activityId get no explicit index
        # here: the uniqueness constraints from create_constraints() are
        # already backed by an index on those properties.
        indexes = [
            "CREATE INDEX IF NOT EXISTS FOR (s:ServiceRequest) ON (s.createdYear)",
            "CREATE INDEX IF NOT EXISTS FOR (s:ServiceRequest) ON (s.isCompleted)",
            "CREATE INDEX IF NOT EXISTS FOR (s:ServiceRequest) ON (s.createdMonth)",
            "CREATE INDEX IF NOT EXISTS FOR (s:ServiceRequest) ON (s.sla)",
            "CREATE INDEX IF NOT EXISTS FOR (s:ServiceRequest) ON (s.serviceClassificationId)",
            "CREATE INDEX IF NOT EXISTS FOR (c:Customer) ON (c.customer)",
            "CREATE INDEX IF NOT EXISTS FOR (cn:Country) ON (cn.country)",
        ]

        with self.driver.session() as session:
            for index in indexes:
                try:
                    session.run(index).consume()
                    logger.info(f"Created index: {index}")
                except Exception as e:
                    logger.warning(f"Index may already exist: {e}")

    @staticmethod
    def _read_csv(csv_path: str) -> pd.DataFrame:
        """Read ``csv_path`` and return a frame whose missing cells are ``None``."""
        df = pd.read_csv(csv_path)
        # Cast to object first: on a float column ``where(..., None)`` coerces
        # the None straight back to NaN, and NaN would then be sent to Neo4j
        # as a real property value instead of null.
        return df.astype(object).where(pd.notnull(df), None)

    @staticmethod
    def _node_query(node_label: str, id_property: str) -> str:
        """Build the Cypher that merges one batch of ``node_label`` nodes."""
        return (
            "UNWIND $records AS record\n"
            f"MERGE (n:{node_label} {{{id_property}: record.{id_property}}})\n"
            "SET n += record"
        )

    @staticmethod
    def _relationship_query(rel_config: Dict) -> str:
        """Build the Cypher that merges one batch of relationships for ``rel_config``."""
        from_label = rel_config['from_label']
        from_prop = rel_config['from_id_prop']
        from_col = rel_config['from_id_col']
        to_label = rel_config['to_label']
        to_prop = rel_config['to_id_prop']
        to_col = rel_config['to_id_col']

        query = (
            "UNWIND $records AS record\n"
            f"MATCH (from:{from_label} {{{from_prop}: record.{from_col}}})\n"
            f"MATCH (to:{to_label} {{{to_prop}: record.{to_col}}})\n"
            f"MERGE (from)-[r:{rel_config['rel_type']}]->(to)"
        )

        # Optional relationship properties are copied from same-named columns.
        properties = rel_config.get('properties')
        if properties:
            assignments = ", ".join(f"{p}: record.{p}" for p in properties)
            query += f"\nSET r += {{{assignments}}}"
        return query

    def _run_batches(self, df: pd.DataFrame, query: str, batch_size: int, label: str) -> None:
        """Send ``df`` to ``query`` as ``$records`` in slices of ``batch_size`` rows.

        Raises:
            ValueError: if ``batch_size`` is not a positive integer.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        total_rows = len(df)
        total_batches = (total_rows - 1) // batch_size + 1

        with self.driver.session() as session:
            for batch_no, start in enumerate(range(0, total_rows, batch_size), start=1):
                records = df.iloc[start:start + batch_size].to_dict('records')
                # consume() makes a failed batch raise immediately instead of
                # on the next statement (or not at all for the last batch).
                session.run(query, records=records).consume()
                logger.info(f"Loaded batch {batch_no}/{total_batches} for {label}")

    def load_nodes_from_csv(self, csv_path: str, node_label: str, id_property: str, batch_size: int = 1000):
        """Load nodes from CSV file in batches.

        Every CSV column becomes a node property; nodes are merged on
        ``id_property``, so re-running the load updates rather than duplicates.
        Rows whose ``id_property`` is missing are skipped with a warning.

        Args:
            csv_path: path of the CSV file to read.
            node_label: label given to the merged nodes.
            id_property: column (and node property) used as the merge key.
            batch_size: number of rows sent per query; must be positive.

        Raises:
            ValueError: if ``batch_size`` is not positive.
        """
        df = self._read_csv(csv_path)

        # A null merge key cannot be merged; drop those rows up front so one
        # bad row does not abort the load after earlier batches are committed.
        if id_property in df.columns:
            missing_id = df[id_property].isna()
            if missing_id.any():
                logger.warning(
                    f"Skipping {int(missing_id.sum())} {node_label} rows with no {id_property} in {csv_path}"
                )
                df = df[~missing_id]

        logger.info(f"Loading {len(df)} {node_label} nodes from {csv_path}")
        self._run_batches(df, self._node_query(node_label, id_property), batch_size, node_label)
        logger.info(f"Completed loading {node_label} nodes")

    def load_relationships_from_csv(self, csv_path: str, rel_config: Dict, batch_size: int = 1000):
        """
        Load relationships from CSV file.

        Both end nodes are looked up with MATCH, so a row whose end nodes are
        not found creates nothing.

        rel_config example:
        {
            'rel_type': 'LOCATED_AT',
            'from_label': 'Asset',
            'from_id_col': 'assetId',
            'from_id_prop': 'assetId',
            'to_label': 'Location',
            'to_id_col': 'locationId',
            'to_id_prop': 'locationId',
            'properties': []  # Optional: list of relationship properties
        }

        Args:
            csv_path: path of the CSV file to read.
            rel_config: mapping described above.
            batch_size: number of rows sent per query; must be positive.

        Raises:
            ValueError: if ``batch_size`` is not positive.
        """
        df = self._read_csv(csv_path)
        rel_type = rel_config['rel_type']

        logger.info(f"Loading {len(df)} {rel_type} relationships from {csv_path}")
        self._run_batches(df, self._relationship_query(rel_config), batch_size, rel_type)
        logger.info(f"Completed loading {rel_type} relationships")

    def verify_load(self):
        """Verify the data load by counting nodes and relationships."""
        with self.driver.session() as session:
            # Count nodes
            node_labels = ['Activity', 'Asset', 'Country', 'Customer', 'Location', 'ServiceRequest']
            for label in node_labels:
                result = session.run(f"MATCH (n:{label}) RETURN count(n) as count")
                count = result.single()['count']
                logger.info(f"{label} nodes: {count}")

            # Count relationships, grouped by type
            result = session.run("MATCH ()-[r]->() RETURN type(r) as type, count(r) as count")
            for record in result:
                logger.info(f"{record['type']} relationships: {record['count']}")



In [None]:

def main():
    """Run the full migration: schema, nodes, relationships, verification.

    Connection settings are read from the ``NEO4J_URI``, ``NEO4J_USER`` and
    ``NEO4J_PASSWORD`` environment variables (each defaulting to an empty
    string), so credentials do not have to be written into the source.
    The driver is always closed, even when a step fails.
    """
    # Local import keeps this function self-contained.
    import os

    # Configuration
    neo4j_uri = os.environ.get("NEO4J_URI", "")
    neo4j_user = os.environ.get("NEO4J_USER", "")
    neo4j_password = os.environ.get("NEO4J_PASSWORD", "")

    # CSV file locations
    data_dir = Path("./neo4j_data")
    rel_dir = Path("./neo4j_relationships")

    # (file name, node label, id property), loaded in this order.
    node_files = [
        ("activities.csv", "Activity", "activityId"),
        ("assets.csv", "Asset", "assetId"),
        ("countries.csv", "Country", "country"),
        ("customers.csv", "Customer", "customer"),
        ("location.csv", "Location", "locationId"),
        ("service_requests.csv", "ServiceRequest", "requestId"),
    ]

    # (relationship type, from label, from id, to label, to id).
    # Each relationship is read from "<relationship type>.csv", and the CSV
    # column names are the same as the node id property names.
    relationships = [
        ("LOCATED_AT", "Asset", "assetId", "Location", "locationId"),
        ("AT_LOCATION", "ServiceRequest", "requestId", "Location", "locationId"),
        ("HAS_ACTIVITY", "ServiceRequest", "requestId", "Activity", "activityId"),
        ("FOR_ASSET", "ServiceRequest", "requestId", "Asset", "assetId"),
        ("OPERATES_IN", "Customer", "customer", "Country", "country"),
        ("RESIDES_AT", "Customer", "customer", "Location", "locationId"),
        ("OWNS", "Customer", "customer", "Asset", "assetId"),
        ("CREATES", "Customer", "customer", "ServiceRequest", "requestId"),
        ("IN", "Location", "locationId", "Country", "country"),
    ]

    banner = "=" * 50

    # Initialize loader
    loader = Neo4jMigrator(neo4j_uri, neo4j_user, neo4j_password)

    try:
        # Optional: Clear existing data
        # loader.clear_database()

        # Create constraints and indexes before loading so MERGE can use them
        loader.create_constraints()
        loader.create_indexes()

        # Load nodes
        logger.info(banner)
        logger.info("LOADING NODES")
        logger.info(banner)

        for file_name, label, id_property in node_files:
            loader.load_nodes_from_csv(str(data_dir / file_name), label, id_property)

        # Load relationships (after nodes: both end nodes must already exist)
        logger.info(banner)
        logger.info("LOADING RELATIONSHIPS")
        logger.info(banner)

        for rel_type, from_label, from_id, to_label, to_id in relationships:
            loader.load_relationships_from_csv(
                str(rel_dir / f"{rel_type}.csv"),
                {
                    'rel_type': rel_type,
                    'from_label': from_label,
                    'from_id_col': from_id,
                    'from_id_prop': from_id,
                    'to_label': to_label,
                    'to_id_col': to_id,
                    'to_id_prop': to_id,
                },
            )

        # Verify the load
        logger.info(banner)
        logger.info("VERIFICATION")
        logger.info(banner)
        loader.verify_load()

        logger.info("Data load completed successfully!")

    finally:
        loader.close()


# Run the migration only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

INFO:__main__:VERIFICATION
INFO:__main__:Activity nodes: 352489
INFO:__main__:Asset nodes: 508867
INFO:__main__:Country nodes: 60
INFO:__main__:Customer nodes: 50
INFO:__main__:Location nodes: 35934
INFO:__main__:ServiceRequest nodes: 371166
INFO:__main__:LOCATES_AT relationships: 155704
INFO:__main__:LOCATED_AT relationships: 41487
INFO:__main__:HAPPENS_AT relationships: 442508
INFO:__main__:HAS_ACTIVITY relationships: 348553
INFO:__main__:REALTED_TO relationships: 170869
INFO:__main__:OPERATES_IN relationships: 197
INFO:__main__:RESIDES_AT relationships: 15132
INFO:__main__:OWNS relationships: 210541
INFO:__main__:CREATES relationships: 442507
INFO:__main__:IN relationships: 15132
INFO:__main__:Data load completed successfully!
