In [1]:
import pandas as pd
# import sqlalchemy
from sqlalchemy import create_engine
import urllib.parse

import configparser
# import os
from pathlib import Path
import numpy as np

from neo4j import GraphDatabase
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
from typing import Dict

In [2]:
import warnings
# Suppresses every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas SettingWithCopy/Future warnings raised
# by the classes defined below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
class DataLoader:

    def __init__(self, config_file_path):

        if not Path(config_file_path).exists():
            logger.warning(f"Config file {config_file_path} not found!")
        
        config = configparser.ConfigParser()
        config.read(config_file_path)

        if 'DATABASE' not in config:
            raise ValueError("DATABASE section not found in config")
        
        db_config = {
            'host': config['DATABASE']['host'],
            'port': int(config['DATABASE']['port']),
            'username': config['DATABASE']['username'],
            'password': config['DATABASE']['password'],
            'database': config['DATABASE']['database'],
            'query_request': config['DATABASE']['query1'],
            'query_assets': config['DATABASE']['query2'],
            'query_request_with_activities': config['DATABASE']['query3'],
            'schema': config['DATABASE']['schema']
        }

        self.db_host = db_config.get('host')
        self.db_port = db_config.get('port')
        self.db_username = db_config.get('username')
        self.db_password = db_config.get('password')
        self.db_database = db_config.get('database')
        self.db_query1 = db_config.get('query_request')
        self.db_query2 = db_config.get('query_assets')
        self.db_query3 = db_config.get('query_request_with_activities')
        self.db_schema = db_config.get('schema')   
            
    def executor(self):
        self.conn_string = self.database_connector(
            db_type='postgresql',
            host=self.db_host,
            port=self.db_port,
            database=self.db_database,
            username=self.db_username,
            password=self.db_password,
            schema=self.db_schema
            )
        
        logger.info("=" * 50)
        logger.info("LOADING DATA")
        logger.info("=" * 50)

        self.load_and_save_data()
        self.load_CSVs()
        self.data_preprocessor()
        self.save_neo4j_CSVs()
        self.create_and_save_relationships()

        logger.info("=" * 50)
        logger.info("DATA LOADING SUCCESSFUL!")
        logger.info("=" * 50)


    def database_connector(self, db_type, host, port, database, username, password, **kwargs):
    
        encoded_password = urllib.parse.quote_plus(password)
        connection_strings = {
            'postgresql': f"postgresql://{username}:{encoded_password}@{host}:{port}/{database}",
        }
        return connection_strings[db_type]


    def load_and_save_data(self):
        
        target_dir_path = './fetched_data'
        Path(target_dir_path).mkdir(parents=True, exist_ok=True)

        try:
            engine = create_engine(self.conn_string)
            
            if not self.db_query1:
                logger.warning("Query for v_request is missing!")
                return None 
        
            if not self.db_query2:
                logger.warning("Query for v_assets is missing!")
                return None
            
            if not self.db_query3:
                logger.warning("Query for v_request_with_activities is missing!")
                return None

            self.df_request = pd.read_sql(self.db_query1, engine)
            self.df_assets = pd.read_sql(self.db_query2, engine)
            self.df_request_with_activities = pd.read_sql(self.db_query3, engine)
            
            logger.info(f" Downloaded {len(self.df_request)} rows from 'v_requests', {len(self.df_assets)} rows from 'v_assets', and {len(self.df_request_with_activities)} rows from 'v_request_with_activities'.")
            
            self.df_request.to_csv(f"{target_dir_path}/v_requests.csv",index=False)
            self.df_assets.to_csv(f"{target_dir_path}/v_assets.csv",index=False)
            self.df_request_with_activities.to_csv(f"{target_dir_path}/v_requests_with_activities.csv",index=False)
            
            logger.info(f"Data exported to {target_dir_path}")
            
        except Exception as e:
            logger.warning(f"Error connecting to database: {e}")
            return None
        
        finally:
            if 'engine' in locals():
                engine.dispose()

    def create_and_save_relationships(self):

        neo4j_relationship_dir_path = './neo4j_relationships'
        Path(neo4j_relationship_dir_path).mkdir(parents=True, exist_ok=True)
        
        try:

            self.LOCATED_AT = self.df_request[['assetAlternateId','locationAlternateId']].dropna().drop_duplicates()
            self.LOCATED_AT.rename(columns={'assetAlternateId': 'assetId', 'locationAlternateId': 'locationId'}, inplace=True)
            self.LOCATED_AT.to_csv(f"{neo4j_relationship_dir_path}/LOCATED_AT.csv",index=False)

            self.AT_LOCATION = self.df_request[['requestAlternateId','locationAlternateId']].dropna().drop_duplicates()
            self.AT_LOCATION.rename(columns={'requestAlternateId': 'requestId', 'locationAlternateId': 'locationId'}, inplace=True)
            self.AT_LOCATION.to_csv(f"{neo4j_relationship_dir_path}/AT_LOCATION.csv",index=False)

            self.HAS_ACTIVITY = self.df_request_with_activities[['activityAlternateId', 'requestAlternateId']].dropna().drop_duplicates()
            self.HAS_ACTIVITY.rename(columns={'requestAlternateId': 'requestId', 'activityAlternateId': 'activityId'}, inplace=True)
            self.HAS_ACTIVITY.to_csv(f"{neo4j_relationship_dir_path}/HAS_ACTIVITY.csv",index=False)

            self.FOR_ASSET = self.df_request[['requestAlternateId','assetAlternateId']].dropna().drop_duplicates()
            self.FOR_ASSET.rename(columns={'requestAlternateId': 'requestId', 'assetAlternateId': 'assetId'}, inplace=True)
            self.FOR_ASSET.to_csv(f"{neo4j_relationship_dir_path}/FOR_ASSET.csv",index=False)

            self.OPERATES_IN = self.df_request[['customer','country']].dropna().drop_duplicates()
            self.OPERATES_IN.to_csv(f"{neo4j_relationship_dir_path}/OPERATES_IN.csv",index=False)

            self.RESIDES_AT = self.df_request[['customer','locationAlternateId']].dropna().drop_duplicates()
            self.RESIDES_AT.rename(columns={'locationAlternateId': 'locationId'}, inplace=True)
            self.RESIDES_AT.to_csv(f"{neo4j_relationship_dir_path}/RESIDES_AT.csv",index=False)

            self.OWNS = self.df_request[['customer','assetAlternateId']].dropna().drop_duplicates()
            self.OWNS.rename(columns={'assetAlternateId': 'assetId'}, inplace=True)
            self.OWNS.to_csv(f"{neo4j_relationship_dir_path}/OWNS.csv",index=False)

            self.CREATES = self.df_request[['customer','requestAlternateId']].dropna().drop_duplicates()
            self.CREATES.rename(columns={'requestAlternateId': 'requestId'}, inplace=True)
            self.CREATES.to_csv(f"{neo4j_relationship_dir_path}/CREATES.csv",index=False)

            self.IN = self.df_request[['country','locationAlternateId']].dropna().drop_duplicates()
            self.IN.rename(columns={'locationAlternateId': 'locationId'}, inplace=True)
            self.IN.to_csv(f"{neo4j_relationship_dir_path}/IN.csv",index=False)

            logger.info(f"Relationships created and saved to path: {neo4j_relationship_dir_path}")

        except Exception as e:

            logger.warning(f"Error while creating and saving relationships: {e}")



    def load_CSVs(self):
        """Read the three helper CSV files into attributes of the same name.

        Sets ``self.is_hvac_df``, ``self.suggested_asset_df`` and
        ``self.vendor_data`` from fixed paths under ``./data``.
        """
        helper_files = (
            ('is_hvac_df', './data/hvac_assets/IFM_Assets_RuleBasedEngineResults(IFM_Assets_RuleBasedEngineResul).csv'),
            ('suggested_asset_df', './data/asset_suggest_data/asset_suggest_model.csv'),
            ('vendor_data', './data/asset_vendor/request_act_vendor.csv'),
        )
        for attribute_name, csv_path in helper_files:
            setattr(self, attribute_name, pd.read_csv(csv_path))

        logger.info("Helper CSV files loaded successfully.")
        
    
    def data_preprocessor(self):

        # Activity:
        activity_df = self.df_request_with_activities[self.df_request_with_activities['activityAlternateId'].notna()][['providertype','activityAlternateId','activityDescription']]
        activity_df.drop_duplicates(inplace = True)

        # Asset:
        requests_subset = self.df_request[['requestId','assetAlternateId', 'requestAlternateId']]
        requests_subset = requests_subset[requests_subset.assetAlternateId.notna()]

        v_assets = self.df_assets[['assetId','Asset Alt Id', 'Asset Description', 'manufacturer', 'model', 'serialNumber']]
        v_assets = v_assets.merge(requests_subset, left_on = 'Asset Alt Id', right_on = 'assetAlternateId', how= 'left')
        v_assets = v_assets[v_assets['requestAlternateId'].notna()] # keeping only those asset records which are associated to the presently fetched serviceRequests

        is_hvac_df = self.is_hvac_df.copy()
        is_hvac_df['is_HVAC'] = True
        is_hvac_df.drop(columns=['Asset Description'], inplace = True)
        v_assets_with_hvac = v_assets.merge(is_hvac_df, on='Asset Alt Id', how='left')
        # v_assets_with_hvac.loc[v_assets_with_hvac['is_HVAC'] == True, 'asset_type'] = 'HVAC'
        
        if not v_assets_with_hvac.empty:
            v_assets_with_hvac.loc[v_assets_with_hvac['is_HVAC'] == True, 'asset_type'] = 'HVAC'
        else:
            v_assets_with_hvac['asset_type'] = None

        final_assets_df = v_assets_with_hvac[['assetId', 'Asset Description', 'Asset Alt Id', 'manufacturer', 'model',
                                              'serialNumber', 'is_HVAC', 'asset_type', 'requestId','assetAlternateId', 'requestAlternateId']]
        final_assets_df.loc[:, 'is_HVAC'] = final_assets_df['is_HVAC'].fillna(False)

        suggested_asset_df = self.suggested_asset_df.copy()
        suggested_asset_df.rename(columns={'asset_id': 'suggested_asset'}, inplace=True)
        suggested_asset_df_subset = suggested_asset_df[['request_id', 'suggested_asset']]

        final_assets_df = final_assets_df.merge(suggested_asset_df_subset, left_on = 'requestId', right_on = 'request_id', how = 'left')
        final_assets_df = final_assets_df[['assetId', 'Asset Description', 'Asset Alt Id', 'manufacturer', 'model',
                                           'serialNumber', 'is_HVAC', 'asset_type', 'suggested_asset','requestAlternateId']]
        
        vendor_data = self.vendor_data.copy()
        vendor_data = vendor_data[['requestAlternateId','vendorName', 'vendorAddress1', 'vendorCity',
                           'vendorRegion', 'vendorCountry', 'vendorPostalCode']]
        assets_with_vendors = final_assets_df.merge(vendor_data, on = 'requestAlternateId', how = 'left')
        assets_df = assets_with_vendors[['Asset Description', 'Asset Alt Id', 'manufacturer', 'model','serialNumber', 
                                                   'is_HVAC', 'asset_type', 'suggested_asset','vendorName', 'vendorAddress1', 'vendorCity',
                                                   'vendorRegion', 'vendorCountry', 'vendorPostalCode']]

        # Country:
        country_df = self.df_request[['country']].drop_duplicates()

        # Customer:
        customer_df = self.df_request[['customer']].drop_duplicates()

        # Location:
        location_df = self.df_request[['locationAlternateId', 'locationPath']].drop_duplicates()

        # Service Requests:
        temp_ser_req = self.df_request[['isSelfAssign', 'priorityCode', 
                  'requestCreatedDate', 'requestDescription', 'requestAlternateId', 'completionNotes', 
                  'requestTargetCompletionDate', 'serviceClassificationAlternateId', 'serviceClassificationPath',  
                  'requestCompletionDate', 'workType']]
        
        def to_local_datetime(date_col):
    
            if date_col is None:
                return None
            
            if date_col.isna().all():
                return date_col
            
            dt_series = pd.to_datetime(date_col, format='mixed')
            formatted = dt_series.dt.strftime('%Y-%m-%d %H:%M:%S.%f').str[:-3]
            formatted_transformed = formatted.str.replace(' ', 'T')

            return formatted_transformed


        def process_service_requests(df_service_request):
                
                date_cols = ['requestCreatedDate', 'requestTargetCompletionDate', 'requestCompletionDate']
                
                for col in date_cols:
                    if col in df_service_request.columns:
                        # df_service_request.loc[:, col] = to_local_datetime(df_service_request[col]).astype(str) # not working- pandas replaces 'T' with a ' ' again
                        df_service_request[col] = to_local_datetime(df_service_request[col])
                
                df_service_request['createdYear'] = pd.to_datetime(df_service_request['requestCreatedDate']).dt.year
                df_service_request['createdMonth'] = pd.to_datetime(df_service_request['requestCreatedDate']).dt.month
                
                
                df_service_request['isCompleted'] = df_service_request['requestCompletionDate'].notna()
                
                
                conditions = [
                    df_service_request['requestCompletionDate'].isna(),
                    df_service_request['requestTargetCompletionDate'].isna(),
                    df_service_request['requestCompletionDate'] <= df_service_request['requestTargetCompletionDate'],
                    df_service_request['requestCompletionDate'] > df_service_request['requestTargetCompletionDate']
                ]
                
                choices = ['Open', 'Open', 'Met', 'Miss']
                
                df_service_request['sla'] = np.select(conditions, choices, default='Unknown')
                
                return df_service_request

        
        service_req_df = process_service_requests(temp_ser_req)

        self.activity_df = activity_df.copy()
        self.assets_df = assets_df.copy()
        self.country_df = country_df.copy()
        self.customer_df = customer_df.copy()
        self.location_df = location_df.copy()
        self.service_req_df = service_req_df.copy()
        logger.info("Node and Property data prepared!")

    def save_neo4j_CSVs(self):
        neo4j_dir_path = './neo4j_data'
        Path(neo4j_dir_path).mkdir(parents=True, exist_ok=True)
        
        try:
            # renaming the features:
            self.activity_df.rename(columns={'activityAlternateId': 'activityId', 'providertype':'providerType'}, inplace=True)

            self.assets_df.rename(columns={'Asset Alt Id': 'assetId', 'Asset Description':'assetDescription', 'vendorAddress1':'vendorAddress'}, inplace=True)

            self.location_df.rename(columns={'locationAlternateId': 'locationId'}, inplace=True)

            self.service_req_df.rename(columns={'requestAlternateId': 'requestId', 'serviceClassificationAlternateId': 'serviceClassificationId'}, inplace=True)

            # saving the data
            self.activity_df.to_csv(f"{neo4j_dir_path}/activities.csv",index=False)
            self.assets_df.to_csv(f"{neo4j_dir_path}/assets.csv",index=False)
            self.country_df.to_csv(f"{neo4j_dir_path}/countries.csv",index=False)
            self.customer_df.to_csv(f"{neo4j_dir_path}/customers.csv",index=False)
            self.location_df.to_csv(f"{neo4j_dir_path}/location.csv",index=False)
            self.service_req_df.to_csv(f"{neo4j_dir_path}/service_requests.csv",index=False)

            logger.info(f"Data for migration to Neo4J is saved on path: {neo4j_dir_path} and ready to be imported!")
        except Exception as e:
            logger.warning(f"Error saving CSVs: {e}")
    


In [4]:
class DataMigrator:

    def __init__(self, config_file_path):

        if not Path(config_file_path).exists():
            logger.warning(f"Config file {config_file_path} not found!")
        
        config = configparser.ConfigParser()
        config.read(config_file_path)

        if 'Neo4j' not in config:
            raise ValueError("Neo4j section not found in config")
        
        nj_config = {
            'url': config['Neo4j']['url'],
            'username': config['Neo4j']['username'],
            'password': config['Neo4j']['password']
        }

        self.nj_url = nj_config.get('url')
        self.nj_username = nj_config.get('username')
        self.nj_password = nj_config.get('password')

        self.driver = GraphDatabase.driver(self.nj_url, auth=(self.nj_username, self.nj_password))
        
        # self.executor()

    def close(self):
        self.driver.close()
    
    def clear_database(self):

        with self.driver.session() as session:

            # Delete all nodes and relationships:
            session.run("MATCH (n) DETACH DELETE n")
            logger.info("All nodes and relationships deleted")
            
            # Delete all indexes:
            result = session.run("SHOW INDEXES")
            for record in result:
                index_name = record.get("name") or record.get("indexName")
                if index_name:
                    try:
                        session.run(f"DROP INDEX {index_name}")
                        logger.info(f"Dropped index: {index_name}")
                    except Exception as e:
                        logger.warning(f"Could not drop index {index_name}: {e}")
            
            # Delete all constraints:
            result = session.run("SHOW CONSTRAINTS")
            for record in result:
                constraint_name = record.get("name")
                if constraint_name:
                    try:
                        session.run(f"DROP CONSTRAINT {constraint_name}")
                        logger.info(f"Dropped constraint: {constraint_name}")
                    except Exception as e:
                        logger.warning(f"Could not drop constraint {constraint_name}: {e}")
            
            logger.info("Database completely cleared")

    def create_constraints(self):
        constraints = [
            "CREATE CONSTRAINT  IF NOT EXISTS FOR (a:Activity) REQUIRE a.activityId IS UNIQUE",
            "CREATE CONSTRAINT  IF NOT EXISTS FOR (as:Asset) REQUIRE as.assetId IS UNIQUE",
            "CREATE CONSTRAINT  IF NOT EXISTS FOR (l:Location) REQUIRE l.locationId IS UNIQUE",
            "CREATE CONSTRAINT  IF NOT EXISTS FOR (s:ServiceRequest) REQUIRE s.requestId IS UNIQUE",
            "CREATE CONSTRAINT  IF NOT EXISTS FOR (c:Country) REQUIRE c.country IS UNIQUE",
            "CREATE CONSTRAINT  IF NOT EXISTS FOR (cu:Customer) REQUIRE cu.customer IS UNIQUE",
            "CREATE CONSTRAINT  IF NOT EXISTS FOR (s:ServiceRequest) REQUIRE s.serviceClassificationId IS UNIQUE"
        ]
        
        with self.driver.session() as session:
            for constraint in constraints:
                try:
                    session.run(constraint)
                    logger.info(f"Created constraint: {constraint.split('FOR')[1].split('REQUIRE')[0].strip()}")
                except Exception as e:
                    logger.warning(f"Constraint may already exist: {e}")


    def create_indexes(self):
        
        indexes = [
            # "CREATE INDEX IF NOT EXISTS FOR (a:Asset) ON (a.assetId)",

            "CREATE INDEX IF NOT EXISTS FOR (s:ServiceRequest) ON (s.createdYear)",
            "CREATE INDEX IF NOT EXISTS FOR (s:ServiceRequest) ON (s.isCompleted)",
            "CREATE INDEX IF NOT EXISTS FOR (s:ServiceRequest) ON (s.createdMonth)",
            "CREATE INDEX IF NOT EXISTS FOR (s:ServiceRequest) ON (s.sla)",
            # "CREATE INDEX IF NOT EXISTS FOR (s:ServiceRequest) ON (s.requestId)",
            # "CREATE INDEX IF NOT EXISTS FOR (s:ServiceRequest) ON (s.serviceClassificationId)" ,

            # "CREATE INDEX IF NOT EXISTS FOR (c:Customer) ON (c.customer)",
            
            # "CREATE INDEX IF NOT EXISTS FOR (l:Location) ON (l.locationId),"

            # "CREATE INDEX IF NOT EXISTS FOR (ac:Activity) ON (ac.activityId),"

            # "CREATE INDEX IF NOT EXISTS FOR (cn:Country) ON (cn.country)"

        ]
        
        with self.driver.session() as session:
            for index in indexes:
                try:
                    session.run(index)
                    logger.info(f"Created index: {index}")
                except Exception as e:
                    logger.warning(f"Index may already exist: {e}")


    def load_nodes_from_csv(self, csv_path: str, node_label: str, id_property: str, batch_size: int = 1000):
        """Load nodes from CSV file in batches."""
        df = pd.read_csv(csv_path)
        df = df.where(pd.notnull(df), None)  # Replace NaN with None
        
        total_rows = len(df)
        logger.info(f"Loading {total_rows} {node_label} nodes from {csv_path}")
        
        with self.driver.session() as session:
            for i in range(0, total_rows, batch_size):
                batch = df.iloc[i:i+batch_size]
                records = batch.to_dict('records')
                
                # Build Cypher query dynamically
                query = f"""
                UNWIND $records AS record
                MERGE (n:{node_label} {{{id_property}: record.{id_property}}})
                SET n += record
                """
                
                session.run(query, records=records)
                logger.info(f"Loaded batch {i//batch_size + 1}/{(total_rows-1)//batch_size + 1} for {node_label}")
        
        logger.info(f"Completed loading {node_label} nodes")


    def load_relationships_from_csv(self, csv_path: str, rel_config: Dict, batch_size: int = 1000):
        """
        Load relationships from CSV file.
        
        rel_config example:
        {
            'rel_type': 'LOCATED_AT',
            'from_label': 'Asset',
            'from_id_col': 'assetId',
            'from_id_prop': 'assetId',
            'to_label': 'Location',
            'to_id_col': 'locationId',
            'to_id_prop': 'locationId',
            'properties': []  # Optional: list of relationship properties
        }
        """
        df = pd.read_csv(csv_path)
        df = df.where(pd.notnull(df), None)
        
        total_rows = len(df)
        logger.info(f"Loading {total_rows} {rel_config['rel_type']} relationships from {csv_path}")
        
        with self.driver.session() as session:
            for i in range(0, total_rows, batch_size):
                batch = df.iloc[i:i+batch_size]
                records = batch.to_dict('records')
                
                # Build relationship properties string if any
                rel_props = ""
                if rel_config.get('properties'):
                    props_str = ", ".join([f"{p}: record.{p}" for p in rel_config['properties']])
                    rel_props = f" {{{props_str}}}"
                
                query = f"""
                UNWIND $records AS record
                MATCH (from:{rel_config['from_label']} {{{rel_config['from_id_prop']}: record.{rel_config['from_id_col']}}})
                MATCH (to:{rel_config['to_label']} {{{rel_config['to_id_prop']}: record.{rel_config['to_id_col']}}})
                MERGE (from)-[r:{rel_config['rel_type']}]->(to)
                """
                
                if rel_props:
                    query += f"\nSET r += {{{', '.join([f'{p}: record.{p}' for p in rel_config['properties']])}}}"
                
                session.run(query, records=records)
                logger.info(f"Loaded batch {i//batch_size + 1}/{(total_rows-1)//batch_size + 1} for {rel_config['rel_type']}")
        
        logger.info(f"Completed loading {rel_config['rel_type']} relationships")
    

    def verify_load(self):
        """Verify the data load by counting nodes and relationships."""
        with self.driver.session() as session:
            # Count nodes
            node_labels = ['Activity', 'Asset', 'Country', 'Customer', 'Location', 'ServiceRequest']
            for label in node_labels:
                result = session.run(f"MATCH (n:{label}) RETURN count(n) as count")
                count = result.single()['count']
                logger.info(f"{label} nodes: {count}")
            
            # Count relationships
            result = session.run("MATCH ()-[r]->() RETURN type(r) as type, count(r) as count")
            for record in result:
                logger.info(f"{record['type']} relationships: {record['count']}")

    
    def executor(self):

        # # CAUTION: Deleting the existing graph!!!
        # logger.info("=" * 50)
        # logger.info("DELETING THE EXISTING GRAPH!!")
        # logger.info("=" * 50) 
        # self.clear_database()

        logger.info("=" * 50)
        logger.info("LOADING NODES")
        logger.info("=" * 50)        
        self.load_nodes()

        logger.info("=" * 50)
        logger.info("LOADING RELATIONSHIPS")
        logger.info("=" * 50)
        self.load_relationships()

        logger.info("=" * 50)
        logger.info("VERIFYING NODE AND RELATIONSHP CREATION.")
        logger.info("=" * 50)
        self.verify_load()

        logger.info("=" * 50)
        logger.info("DATA MIGRATION SUCCESSFUL!")
        logger.info("=" * 50)

        self.close()

    
    def load_nodes(self):
        
        DATA_DIR = Path("./neo4j_data")

        try:

            # Create constraints and indexes
            self.create_constraints()
            self.create_indexes()
            
            # Load nodes
            
            self.load_nodes_from_csv(
                f"{DATA_DIR}/activities.csv", 
                "Activity", 
                "activityId"
            )
            
            self.load_nodes_from_csv(
                f"{DATA_DIR}/assets.csv", 
                "Asset", 
                "assetId"
            )
            
            self.load_nodes_from_csv(
                f"{DATA_DIR}/countries.csv", 
                "Country", 
                "country"
            )
            
            self.load_nodes_from_csv(
                f"{DATA_DIR}/customers.csv", 
                "Customer", 
                "customer"
            )
            
            self.load_nodes_from_csv(
                f"{DATA_DIR}/location.csv", 
                "Location", 
                "locationId"
            )
            
            self.load_nodes_from_csv(
                f"{DATA_DIR}/service_requests.csv", 
                "ServiceRequest", 
                "requestId"
            )

        except Exception as e:
            logger.warning(f"Error creating Nodes in Neo4j: {e}")


    def load_relationships(self):
        
        REL_DIR = Path("./neo4j_relationships")

        try:
            # Load relationships
            
            # Asset -> Location
            self.load_relationships_from_csv(
                f"{REL_DIR}/LOCATED_AT.csv",
                {
                    'rel_type': 'LOCATED_AT',
                    'from_label': 'Asset',
                    'from_id_col': 'assetId',
                    'from_id_prop': 'assetId',
                    'to_label': 'Location',
                    'to_id_col': 'locationId',
                    'to_id_prop': 'locationId'
                }
            )
            
            # ServiceRequest -> Location
            self.load_relationships_from_csv(
                f"{REL_DIR}/AT_LOCATION.csv",
                {
                    'rel_type': 'AT_LOCATION',
                    'from_label': 'ServiceRequest',
                    'from_id_col': 'requestId',
                    'from_id_prop': 'requestId',
                    'to_label': 'Location',
                    'to_id_col': 'locationId',
                    'to_id_prop': 'locationId'
                }
            )
            
            # ServiceRequest -> Activity
            self.load_relationships_from_csv(
                f"{REL_DIR}/HAS_ACTIVITY.csv",
                {
                    'rel_type': 'HAS_ACTIVITY',
                    'from_label': 'ServiceRequest',
                    'from_id_col': 'requestId',
                    'from_id_prop': 'requestId',
                    'to_label': 'Activity',
                    'to_id_col': 'activityId',
                    'to_id_prop': 'activityId'
                }
            )
            
            # ServiceRequest -> Asset
            self.load_relationships_from_csv(
                f"{REL_DIR}/FOR_ASSET.csv",
                {
                    'rel_type': 'FOR_ASSET',
                    'from_label': 'ServiceRequest',
                    'from_id_col': 'requestId',
                    'from_id_prop': 'requestId',
                    'to_label': 'Asset',
                    'to_id_col': 'assetId',
                    'to_id_prop': 'assetId'
                }
            )
            
            # Customer -> Country
            self.load_relationships_from_csv(
                f"{REL_DIR}/OPERATES_IN.csv",
                {
                    'rel_type': 'OPERATES_IN',
                    'from_label': 'Customer',
                    'from_id_col': 'customer',
                    'from_id_prop': 'customer',
                    'to_label': 'Country',
                    'to_id_col': 'country',
                    'to_id_prop': 'country'
                }
            )
            
            # Customer -> Location
            self.load_relationships_from_csv(
                f"{REL_DIR}/RESIDES_AT.csv",
                {
                    'rel_type': 'RESIDES_AT',
                    'from_label': 'Customer',
                    'from_id_col': 'customer',
                    'from_id_prop': 'customer',
                    'to_label': 'Location',
                    'to_id_col': 'locationId',
                    'to_id_prop': 'locationId'
                }
            )
            
            # Customer -> Asset
            self.load_relationships_from_csv(
                f"{REL_DIR}/OWNS.csv",
                {
                    'rel_type': 'OWNS',
                    'from_label': 'Customer',
                    'from_id_col': 'customer',
                    'from_id_prop': 'customer',
                    'to_label': 'Asset',
                    'to_id_col': 'assetId',
                    'to_id_prop': 'assetId'
                }
            )
            
            # Customer -> ServiceRequest
            self.load_relationships_from_csv(
                f"{REL_DIR}/CREATES.csv",
                {
                    'rel_type': 'CREATES',
                    'from_label': 'Customer',
                    'from_id_col': 'customer',
                    'from_id_prop': 'customer',
                    'to_label': 'ServiceRequest',
                    'to_id_col': 'requestId',
                    'to_id_prop': 'requestId'
                }
            )
            
            # Location -> Country
            self.load_relationships_from_csv(
                f"{REL_DIR}/IN.csv",
                {
                    'rel_type': 'IN',
                    'from_label': 'Location',
                    'from_id_col': 'locationId',
                    'from_id_prop': 'locationId',
                    'to_label': 'Country',
                    'to_id_col': 'country',
                    'to_id_prop': 'country'
                }
            )

        except Exception as e:
            logger.warning(f"Error creating Relationships for nodes on Neo4j: {e}")
            

In [None]:

# Check for the class definitions explicitly instead of catching NameError
# around the whole run: a NameError raised inside the pipeline would otherwise
# be reported as "class is not defined" and the next stage would still start.
if "DataLoader" in globals():
    dataLoader = DataLoader('config.ini')
    dataLoader.executor()
else:
    logger.warning("The DataLoader class is not defined. Please ensure it is defined correctly.")

if "DataMigrator" in globals():
    dataMigrator = DataMigrator('config.ini')
    dataMigrator.executor()
else:
    logger.warning("The DataMigrator class is not defined. Please ensure it is defined correctly.")


#to-do:
# --> define paths for csvs in config.ini -- cosmetic, later
# --> filter features from 'final_data' -- done
# --> create csvs/dfs for neo4j migration -- done
# --> migration!! -- on going

# Data_migrator('config.ini', target_file_path)

INFO:__main__:LOADING DATA
INFO:__main__: Downloaded 465659 rows from 'v_requests', 508883 rows from 'v_assets', and 481248 rows from 'v_request_with_activities'.
INFO:__main__:Data exported to ./fetched_data
INFO:__main__:Helper CSV files loaded successfully.
INFO:__main__:Node and Property data prepared!
INFO:__main__:Data for migration to Neo4J is saved on path: ./neo4j_data and ready to be imported!
INFO:__main__:Relationships created and saved to path: ./neo4j_relationships
INFO:__main__:DATA LOADING SUCCESSFUL!
INFO:__main__:LOADING NODES
INFO:__main__:Created constraint: (a:Activity)
INFO:neo4j.notifications:Received notification from DBMS server: <GqlStatusObject gql_status='00NA0', status_description="note: successful completion - index or constraint already exists. The command 'CREATE CONSTRAINT IF NOT EXISTS FOR (e:Activity) REQUIRE (e.activityId) IS UNIQUE' has no effect. The index or constraint specified by 'CONSTRAINT activityId_Activity_uniq FOR (e:Activity) REQUIRE (e.a

Bad pipe message: %s [b'J\xb9\xfcJ\x16\x85\x86j\xf5\xc0\xd6\x8b\xcc\xf0\x9a\xe0\xcb\x03\x00\x01|\x00\x00\x00\x01\x00\x02\x00\x03\x00\x04\x00\x05\x00\x06\x00\x07\x00\x08\x00\t\x00\n\x00\x0b\x00\x0c\x00\r\x00\x0e\x00\x0f\x00\x10\x00\x11\x00\x12\x00\x13\x00']
Bad pipe message: %s [b'\x15\x00\x16\x00\x17\x00\x18\x00\x19\x00\x1a\x00\x1b\x00/\x000\x001']
Bad pipe message: %s [b'\x89F\x98\x12\xf5\xd0\x89j\xce\xb6\xdf\x16\x05\x0c\xc3\xd53\xbb\x00\x01|\x00\x00\x00\x01\x00\x02\x00\x03\x00\x04\x00\x05\x00\x06\x00\x07\x00\x08\x00\t\x00\n\x00\x0b\x00\x0c\x00\r\x00\x0e\x00\x0f\x00\x10\x00\x11\x00\x12\x00\x13\x00\x14\x00\x15\x00\x16\x00\x17\x00\x18\x00\x19\x00\x1a\x00\x1b\x00/\x000\x001\x002\x003\x004\x005\x006\x007\x008\x009\x00:\x00;\x00<\x00=\x00>\x00?\x00@\x00A\x00B\x00C\x00D\x00E\x00F\x00g\x00h\x00i\x00j\x00k\x00l\x00m\x00\x84\x00\x85\x00\x86\x00\x87\x00\x88\x00\x89\x00\x96\x00\x97\x00\x98\x00\x99']
Bad pipe message: %s [b"'/\x19\xebM\xa3\x843&\xc3\xeb\xa9\xb3\xcaa\x17K\xf4\x00\x01|\x00\x00\x00\

In [None]:
# activity_df, assets_df, country_df, customer_df, location_df, service_req_df = dfs
# print(final_data.columns)

# # save CSVs for neo4j import:
# neo4j_dir_path = './neo4j_data'
# Path(neo4j_dir_path).mkdir(parents=True, exist_ok=True)

# try:
#     activity_df.to_csv(f"{neo4j_dir_path}/activities.csv",index=False)
#     assets_df.to_csv(f"{neo4j_dir_path}/assets.csv",index=False)
#     country_df.to_csv(f"{neo4j_dir_path}/countries.csv",index=False)
#     customer_df.to_csv(f"{neo4j_dir_path}/customers.csv",index=False)
#     location_df.to_csv(f"{neo4j_dir_path}/location.csv",index=False)
#     service_req_df.to_csv(f"{neo4j_dir_path}/service_requests.csv",index=False)

# except NameError:
#     print("CSVs not found!")
