# MC1 to Neo4j Uploader (Notebook)

This notebook uploads the MC1 knowledge-graph JSON (nodes + links) to your Neo4j cloud database using the modern driver (`GraphDatabase.driver` + `execute_query`).

- Reads: `mc1.json` from the project root
- Writes: Nodes with labels derived from `node.type` (last segment), relationships with type derived from `link.type` (sanitized)
- Preserves link properties like `_articleid`, `_date_added`, `_raw_source`, `_algorithm`, `_last_edited_by`, `_last_edited_date`, and `key`
- Uses batching + retries for robustness

Run the cells from top to bottom. 


In [4]:
# Install dependencies (only if needed)
# If running in a fresh environment, uncomment the next lines
# !pip install neo4j tqdm

import os
import json
from typing import Dict, Any, List
from dataclasses import dataclass
from time import sleep

from neo4j import GraphDatabase
from tqdm import tqdm

# Hardcoded Neo4j connection (no .env)
NEO4J_URI = 'neo4j+s://397603d1.databases.neo4j.io'
NEO4J_USER = 'neo4j'
NEO4J_PASSWORD = 'dQMr8EoOUknWQK7JNRD5Kd4UtXPlBoXuURrezQ38Tz8'

# Data file
MC1_PATH = 'mc1.json'
MAX_RETRIES = 3

print('Config:')
print('  URI:', NEO4J_URI)
print('  USER:', NEO4J_USER)
print('  MC1_PATH:', os.path.abspath(MC1_PATH))


Config:
  URI: neo4j+s://397603d1.databases.neo4j.io
  USER: neo4j
  MC1_PATH: c:\Users\Rajkumar Dake\OneDrive\Desktop\projects\veda-project\mc1.json


In [5]:
def sanitize_label(label: str) -> str:
    return (label or 'Unknown').replace('.', '_').replace('-', '_').replace(' ', '_')


def label_from_type(node_type: str) -> str:
    if not node_type:
        return 'Unknown'
    return sanitize_label(node_type.split('.')[-1])


def read_mc1(path: str) -> Dict[str, Any]:
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


import socket

def resolve_host(uri: str):
    try:
        host = uri.split('://', 1)[1].split(':', 1)[0]
        socket.getaddrinfo(host, 7687)
        return True
    except Exception as e:
        print(f"‚ö†Ô∏è DNS resolution failed for host in URI ({uri}): {e}")
        return False


def connect(uri: str, user: str, password: str):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    driver.verify_connectivity()
    return driver


def connect_with_retries(uri: str, user: str, password: str, retries: int = 5, delay_seconds: float = 2.0):
    for attempt in range(1, retries + 1):
        try:
            if not resolve_host(uri):
                raise RuntimeError("DNS resolution failed")
            driver = GraphDatabase.driver(uri, auth=(user, password))
            driver.verify_connectivity()
            print(f"‚úÖ Connected on attempt {attempt}")
            return driver
        except Exception as e:
            print(f"Retry {attempt}/{retries} - connect failed: {e}")
            if attempt == retries:
                raise
            sleep(delay_seconds)


def clear_db(driver) -> None:
    driver.execute_query('MATCH (n) DETACH DELETE n')


def create_nodes(driver, nodes: List[Dict[str, Any]]) -> None:
    for node in tqdm(nodes, desc='Nodes'):
        node_id = node.get('id')
        node_type = node.get('type', 'Unknown')
        label = label_from_type(node_type)
        properties = {k: v for k, v in node.items() if k != 'id'}
        properties['id'] = node_id
        properties['type'] = node_type
        q = f"MERGE (n:{label} {{id: $id}}) SET n += $properties"
        driver.execute_query(q, id=node_id, properties=properties)


def create_relationships(driver, links: List[Dict[str, Any]], max_retries: int = 3) -> None:
    for link in tqdm(links, desc='Relationships'):
        source_id = link.get('source')
        target_id = link.get('target')
        rel_type = sanitize_label(link.get('type', 'RELATED'))
        properties = {k: v for k, v in link.items() if k not in ['source', 'target', 'type']}
        if not source_id or not target_id:
            continue
        q = f"""
        MATCH (a {{id: $source_id}}), (b {{id: $target_id}})
        MERGE (a)-[r:{rel_type}]->(b)
        SET r += $properties
        """
        for attempt in range(max_retries):
            try:
                driver.execute_query(q, source_id=source_id, target_id=target_id, properties=properties)
                break
            except Exception as e:
                if attempt == max_retries - 1:
                    raise
                sleep(1)


In [None]:
# Connect, load, and upload

driver = connect_with_retries(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, retries=8, delay_seconds=2.0)
print('‚úÖ Connected to Neo4j')

data = read_mc1(MC1_PATH)
nodes = data.get('nodes', [])
links = data.get('links', [])
print(f"Loaded {len(nodes)} nodes and {len(links)} links from {MC1_PATH}")

# Optional: wipe database first
clear_db(driver)
print('üóëÔ∏è  Cleared existing data')

# Create nodes
create_nodes(driver, nodes)
print('‚úÖ Nodes uploaded')

# Create relationships
create_relationships(driver, links, max_retries=MAX_RETRIES)
print('‚úÖ Relationships uploaded')

# Quick stats
records, _, _ = driver.execute_query("MATCH (n) RETURN count(n) AS c")
node_count = records[0]['c'] if records else 0
records, _, _ = driver.execute_query("MATCH ()-[r]->() RETURN count(r) AS c")
rel_count = records[0]['c'] if records else 0
print(f"üìä Done: {node_count} nodes, {rel_count} relationships")


‚úÖ Connected on attempt 1
‚úÖ Connected to Neo4j
Loaded 215 nodes and 16231 links from mc1.json
üóëÔ∏è  Cleared existing data


Nodes: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 215/215 [00:38<00:00,  5.60it/s]


‚úÖ Nodes uploaded


Relationships:  17%|‚ñà‚ñã        | 2827/16231 [08:39<43:23,  5.15it/s]  